panfrost: Set unk2 to accommodate blending
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
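/* Emits the shared-memory/TLS descriptor used for Bifrost vertex/tiler work:
 * the per-thread stack size is encoded as a shift, the scratchpad BO is sized
 * from the batch's worst-case stack usage across all cores, and
 * shared_workgroup_count is left at ~0 (presumably meaning "no workgroup
 * shared memory", since compute dispatch is handled separately below). */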
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
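/* On Midgard, the postfix "shared_memory" slot instead points at the
 * framebuffer descriptor. Reserve transient space for it on first use (SFBD
 * or MFBD depending on the GPU) and, for MFBD, tag the pointer with MALI_MFBD
 * so later consumers can tell the two descriptor formats apart. */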
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
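/* Common initialization for a vertex/tiler prefix/postfix pair. Bifrost
 * points shared_memory at a real TLS descriptor, while Midgard reuses that
 * slot for the framebuffer descriptor; the tiler half (initialized with the
 * fragment stage) additionally picks up occlusion-query and rasterizer state.
 * The gl_enables magic values differ per architecture and are not fully
 * understood. */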
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
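/* The padded count is apparently consumed decomposed as (2k + 1) << shift,
 * i.e. an odd factor times a power of two; e.g. a padded count of
 * 24 = (2*1 + 1) << 3 yields shift = 3 and odd factor k = 1. */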
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
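/* Decides whether the fragment shader actually needs to execute: it can be
 * skipped when it has no side effects, writes no enabled colour target, and
 * does not write depth/stencil. Used below to enable the Midgard
 * "shaderless" fast path. */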
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 unsigned flags = 0;
734
735 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
736 flags = 0x200;
737
738 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
739 (ctx->pipe_framebuffer.cbufs[i]) &&
740 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
741
742 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
743 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
744 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
745 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
746 }
747
748 if (dev->quirks & IS_BIFROST) {
749 struct bifrost_blend_rt *brts = rts;
750
751 brts[i].flags = flags;
752
753 if (blend[i].is_shader) {
754 /* The blend shader's address needs to be at
755 * the same top 32 bit as the fragment shader.
756 * TODO: Ensure that's always the case.
757 */
758 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
759 (fs->bo->gpu & (0xffffffffull << 32)));
760 brts[i].shader = blend[i].shader.gpu;
761 brts[i].unk2 = 0x0;
762 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
763 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
764 const struct util_format_description *format_desc;
765 format_desc = util_format_description(format);
766
767 brts[i].equation = *blend[i].equation.equation;
768
769 /* TODO: this is a bit more complicated */
770 brts[i].constant = blend[i].equation.constant;
771
772 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
773
774 /* 0x19 disables blending and forces REPLACE
775 * mode (equivalent to rgb_mode = alpha_mode =
776 * 0x122, colour mask = 0xF). 0x1a allows
777 * blending. */
778 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
779
780 brts[i].shader_type = fs->blend_types[i];
781 } else {
782 /* Dummy attachment for depth-only */
783 brts[i].unk2 = 0x3;
784 brts[i].shader_type = fs->blend_types[i];
785 }
786 } else {
787 struct midgard_blend_rt *mrts = rts;
788 mrts[i].flags = flags;
789
790 if (blend[i].is_shader) {
791 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
792 } else {
793 mrts[i].blend.equation = *blend[i].equation.equation;
794 mrts[i].blend.constant = blend[i].equation.constant;
795 }
796 }
797 }
798 }
799
800 static void
801 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
802 struct mali_shader_meta *fragmeta,
803 void *rts)
804 {
805 const struct panfrost_device *dev = pan_device(ctx->base.screen);
806 struct panfrost_shader_state *fs;
807
808 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
809
810 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
811 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
812 fragmeta->unknown2_4 = 0x4e0;
813
814 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
815 * is required (independent of 32-bit/64-bit descriptors), or why it's
816 * not used on later GPU revisions. Otherwise, all shader jobs fault on
817 * these earlier chips (perhaps this is a chicken bit of some kind).
818 * More investigation is needed. */
819
820 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
821
822 if (dev->quirks & IS_BIFROST) {
823 /* TODO */
824 } else {
825 /* Depending on whether it's legal to do so in the given shader, we try to
826 * enable early-z testing (or forward-pixel kill?) */
827
828 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
829 !fs->can_discard && !fs->writes_depth);
830
831 /* Add the writes Z/S flags if needed. */
832 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
833 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
834
835 /* Any time texturing is used, derivatives are implicitly calculated,
836 * so we need to enable helper invocations */
837
838 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
839 fs->helper_invocations);
840
841 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
842
843 bool depth_enabled = fs->writes_depth ||
844 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
845
846 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
847 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
848 }
849
850 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
851 panfrost_frag_meta_zsa_update(ctx, fragmeta);
852 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
853 }
854
855 void
856 panfrost_emit_shader_meta(struct panfrost_batch *batch,
857 enum pipe_shader_type st,
858 struct mali_vertex_tiler_postfix *postfix)
859 {
860 struct panfrost_context *ctx = batch->ctx;
861 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
862
863 if (!ss) {
864 postfix->shader = 0;
865 return;
866 }
867
868 struct mali_shader_meta meta;
869
870 panfrost_shader_meta_init(ctx, st, &meta);
871
872 /* Add the shader BO to the batch. */
873 panfrost_batch_add_bo(batch, ss->bo,
874 PAN_BO_ACCESS_PRIVATE |
875 PAN_BO_ACCESS_READ |
876 panfrost_bo_access_for_stage(st));
877
878 mali_ptr shader_ptr;
879
880 if (st == PIPE_SHADER_FRAGMENT) {
881 struct panfrost_device *dev = pan_device(ctx->base.screen);
882 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
883 size_t desc_size = sizeof(meta);
884 void *rts = NULL;
885 struct panfrost_transfer xfer;
886 unsigned rt_size;
887
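/* On MFBD GPUs the shader descriptor is followed in memory by one blend
 * descriptor per render target (Midgard and Bifrost use different layouts
 * for these); SFBD keeps blend state inside the shader meta itself, so no
 * extra space is needed there. */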
888 if (dev->quirks & MIDGARD_SFBD)
889 rt_size = 0;
890 else if (dev->quirks & IS_BIFROST)
891 rt_size = sizeof(struct bifrost_blend_rt);
892 else
893 rt_size = sizeof(struct midgard_blend_rt);
894
895 desc_size += rt_size * rt_count;
896
897 if (rt_size)
898 rts = rzalloc_size(ctx, rt_size * rt_count);
899
900 panfrost_frag_shader_meta_init(ctx, &meta, rts);
901
902 xfer = panfrost_allocate_transient(batch, desc_size);
903
904 memcpy(xfer.cpu, &meta, sizeof(meta));
905 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
906
907 if (rt_size)
908 ralloc_free(rts);
909
910 shader_ptr = xfer.gpu;
911 } else {
912 shader_ptr = panfrost_upload_transient(batch, &meta,
913 sizeof(meta));
914 }
915
916 postfix->shader = shader_ptr;
917 }
918
919 static void
920 panfrost_mali_viewport_init(struct panfrost_context *ctx,
921 struct mali_viewport *mvp)
922 {
923 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
924
925 /* Clip bounds are encoded as floats. The viewport itself is encoded as
926 * (somewhat) asymmetric ints. */
927
928 const struct pipe_scissor_state *ss = &ctx->scissor;
929
930 memset(mvp, 0, sizeof(*mvp));
931
932 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
933 * each direction. Clipping to the viewport in theory should work, but
934 * in practice causes issues when we're not explicitly trying to
935 * scissor */
936
937 *mvp = (struct mali_viewport) {
938 .clip_minx = -INFINITY,
939 .clip_miny = -INFINITY,
940 .clip_maxx = INFINITY,
941 .clip_maxy = INFINITY,
942 };
943
944 /* Always scissor to the viewport by default. */
945 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
946 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
947
948 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
949 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
950
951 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
952 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
953
954 /* Apply the scissor test */
955
956 unsigned minx, miny, maxx, maxy;
957
958 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
959 minx = MAX2(ss->minx, vp_minx);
960 miny = MAX2(ss->miny, vp_miny);
961 maxx = MIN2(ss->maxx, vp_maxx);
962 maxy = MIN2(ss->maxy, vp_maxy);
963 } else {
964 minx = vp_minx;
965 miny = vp_miny;
966 maxx = vp_maxx;
967 maxy = vp_maxy;
968 }
969
970 /* Hardware needs the min/max to be strictly ordered, so flip if we
971 * need to. The viewport transformation in the vertex shader will
972 * handle the negatives if we don't */
973
974 if (miny > maxy) {
975 unsigned temp = miny;
976 miny = maxy;
977 maxy = temp;
978 }
979
980 if (minx > maxx) {
981 unsigned temp = minx;
982 minx = maxx;
983 maxx = temp;
984 }
985
986 if (minz > maxz) {
987 float temp = minz;
988 minz = maxz;
989 maxz = temp;
990 }
991
992 /* Clamp to the framebuffer size as a last check */
993
994 minx = MIN2(ctx->pipe_framebuffer.width, minx);
995 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
996
997 miny = MIN2(ctx->pipe_framebuffer.height, miny);
998 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
999
1000 /* Upload */
1001
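/* viewport0 holds the inclusive minimum and viewport1 the inclusive maximum;
 * MALI_POSITIVE stores the maximum biased down by one, which is why
 * panfrost_emit_viewport adds the 1 back when unioning the batch scissor. */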
1002 mvp->viewport0[0] = minx;
1003 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1004
1005 mvp->viewport0[1] = miny;
1006 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1007
1008 mvp->clip_minz = minz;
1009 mvp->clip_maxz = maxz;
1010 }
1011
1012 void
1013 panfrost_emit_viewport(struct panfrost_batch *batch,
1014 struct mali_vertex_tiler_postfix *tiler_postfix)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017 struct mali_viewport mvp;
1018
1019 panfrost_mali_viewport_init(batch->ctx, &mvp);
1020
1021 /* Update the job, unless we're doing wallpapering (whose lack of
1022 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1023 * just... be faster :) */
1024
1025 if (!ctx->wallpaper_batch)
1026 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1027 mvp.viewport0[1],
1028 mvp.viewport1[0] + 1,
1029 mvp.viewport1[1] + 1);
1030
1031 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1032 sizeof(mvp));
1033 }
1034
1035 static mali_ptr
1036 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1037 enum pipe_shader_type st,
1038 struct panfrost_constant_buffer *buf,
1039 unsigned index)
1040 {
1041 struct pipe_constant_buffer *cb = &buf->cb[index];
1042 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1043
1044 if (rsrc) {
1045 panfrost_batch_add_bo(batch, rsrc->bo,
1046 PAN_BO_ACCESS_SHARED |
1047 PAN_BO_ACCESS_READ |
1048 panfrost_bo_access_for_stage(st));
1049
1050 /* Alignment guaranteed by
1051 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1052 return rsrc->bo->gpu + cb->buffer_offset;
1053 } else if (cb->user_buffer) {
1054 return panfrost_upload_transient(batch,
1055 cb->user_buffer +
1056 cb->buffer_offset,
1057 cb->buffer_size);
1058 } else {
1059 unreachable("No constant buffer");
1060 }
1061 }
1062
1063 struct sysval_uniform {
1064 union {
1065 float f[4];
1066 int32_t i[4];
1067 uint32_t u[4];
1068 uint64_t du[2];
1069 };
1070 };
1071
1072 static void
1073 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1074 struct sysval_uniform *uniform)
1075 {
1076 struct panfrost_context *ctx = batch->ctx;
1077 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1078
1079 uniform->f[0] = vp->scale[0];
1080 uniform->f[1] = vp->scale[1];
1081 uniform->f[2] = vp->scale[2];
1082 }
1083
1084 static void
1085 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1086 struct sysval_uniform *uniform)
1087 {
1088 struct panfrost_context *ctx = batch->ctx;
1089 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1090
1091 uniform->f[0] = vp->translate[0];
1092 uniform->f[1] = vp->translate[1];
1093 uniform->f[2] = vp->translate[2];
1094 }
1095
1096 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1097 enum pipe_shader_type st,
1098 unsigned int sysvalid,
1099 struct sysval_uniform *uniform)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1103 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1104 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1105 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1106
1107 assert(dim);
1108 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1109
1110 if (dim > 1)
1111 uniform->i[1] = u_minify(tex->texture->height0,
1112 tex->u.tex.first_level);
1113
1114 if (dim > 2)
1115 uniform->i[2] = u_minify(tex->texture->depth0,
1116 tex->u.tex.first_level);
1117
1118 if (is_array)
1119 uniform->i[dim] = tex->texture->array_size;
1120 }
1121
1122 static void
1123 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1124 enum pipe_shader_type st,
1125 unsigned ssbo_id,
1126 struct sysval_uniform *uniform)
1127 {
1128 struct panfrost_context *ctx = batch->ctx;
1129
1130 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1131 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1132
1133 /* Compute address */
1134 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1135
1136 panfrost_batch_add_bo(batch, bo,
1137 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1138 panfrost_bo_access_for_stage(st));
1139
1140 /* Upload address and size as sysval */
1141 uniform->du[0] = bo->gpu + sb.buffer_offset;
1142 uniform->u[2] = sb.buffer_size;
1143 }
1144
1145 static void
1146 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1147 enum pipe_shader_type st,
1148 unsigned samp_idx,
1149 struct sysval_uniform *uniform)
1150 {
1151 struct panfrost_context *ctx = batch->ctx;
1152 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1153
1154 uniform->f[0] = sampl->min_lod;
1155 uniform->f[1] = sampl->max_lod;
1156 uniform->f[2] = sampl->lod_bias;
1157
1158 /* Even without any errata, Midgard represents "no mipmapping" as
1159 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1160 * panfrost_create_sampler_state which also explains our choice of
1161 * epsilon value (again to keep behaviour consistent) */
1162
1163 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1164 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1165 }
1166
1167 static void
1168 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1169 struct sysval_uniform *uniform)
1170 {
1171 struct panfrost_context *ctx = batch->ctx;
1172
1173 uniform->u[0] = ctx->compute_grid->grid[0];
1174 uniform->u[1] = ctx->compute_grid->grid[1];
1175 uniform->u[2] = ctx->compute_grid->grid[2];
1176 }
1177
1178 static void
1179 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1180 struct panfrost_shader_state *ss,
1181 enum pipe_shader_type st)
1182 {
1183 struct sysval_uniform *uniforms = (void *)buf;
1184
1185 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1186 int sysval = ss->sysval[i];
1187
1188 switch (PAN_SYSVAL_TYPE(sysval)) {
1189 case PAN_SYSVAL_VIEWPORT_SCALE:
1190 panfrost_upload_viewport_scale_sysval(batch,
1191 &uniforms[i]);
1192 break;
1193 case PAN_SYSVAL_VIEWPORT_OFFSET:
1194 panfrost_upload_viewport_offset_sysval(batch,
1195 &uniforms[i]);
1196 break;
1197 case PAN_SYSVAL_TEXTURE_SIZE:
1198 panfrost_upload_txs_sysval(batch, st,
1199 PAN_SYSVAL_ID(sysval),
1200 &uniforms[i]);
1201 break;
1202 case PAN_SYSVAL_SSBO:
1203 panfrost_upload_ssbo_sysval(batch, st,
1204 PAN_SYSVAL_ID(sysval),
1205 &uniforms[i]);
1206 break;
1207 case PAN_SYSVAL_NUM_WORK_GROUPS:
1208 panfrost_upload_num_work_groups_sysval(batch,
1209 &uniforms[i]);
1210 break;
1211 case PAN_SYSVAL_SAMPLER:
1212 panfrost_upload_sampler_sysval(batch, st,
1213 PAN_SYSVAL_ID(sysval),
1214 &uniforms[i]);
1215 break;
1216 default:
1217 assert(0);
1218 }
1219 }
1220 }
1221
1222 static const void *
1223 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1224 unsigned index)
1225 {
1226 struct pipe_constant_buffer *cb = &buf->cb[index];
1227 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1228
1229 if (rsrc)
1230 return rsrc->bo->cpu;
1231 else if (cb->user_buffer)
1232 return cb->user_buffer;
1233 else
1234 unreachable("No constant buffer");
1235 }
1236
1237 void
1238 panfrost_emit_const_buf(struct panfrost_batch *batch,
1239 enum pipe_shader_type stage,
1240 struct mali_vertex_tiler_postfix *postfix)
1241 {
1242 struct panfrost_context *ctx = batch->ctx;
1243 struct panfrost_shader_variants *all = ctx->shader[stage];
1244
1245 if (!all)
1246 return;
1247
1248 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1249
1250 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1251
1252 /* Uniforms are implicitly UBO #0 */
1253 bool has_uniforms = buf->enabled_mask & (1 << 0);
1254
1255 /* Allocate room for the sysval and the uniforms */
1256 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1257 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1258 size_t size = sys_size + uniform_size;
1259 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1260 size);
1261
1262 /* Upload sysvals requested by the shader */
1263 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1264
1265 /* Upload uniforms */
1266 if (has_uniforms && uniform_size) {
1267 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1268 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1269 }
1270
1271 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1272 * uploaded */
1273
1274 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1275 assert(ubo_count >= 1);
1276
1277 size_t sz = sizeof(uint64_t) * ubo_count;
1278 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1279 int uniform_count = ss->uniform_count;
1280
1281 /* Upload uniforms as a UBO */
1282 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1283
1284 /* The rest are honest-to-goodness UBOs */
1285
1286 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1287 size_t usz = buf->cb[ubo].buffer_size;
1288 bool enabled = buf->enabled_mask & (1 << ubo);
1289 bool empty = usz == 0;
1290
1291 if (!enabled || empty) {
1292 /* Stub out disabled UBOs to catch accesses */
1293 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1294 continue;
1295 }
1296
1297 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1298 buf, ubo);
1299
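/* The UBO size is evidently encoded in 16-byte (vec4) fields, so round the
 * byte size up to a whole number of fields before packing the descriptor. */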
1300 unsigned bytes_per_field = 16;
1301 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1302 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1303 }
1304
1305 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1306 postfix->uniforms = transfer.gpu;
1307 postfix->uniform_buffers = ubufs;
1308
1309 buf->dirty_mask = 0;
1310 }
1311
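/* Emits the shared-memory descriptor for a compute job: per-workgroup shared
 * memory is rounded up to a power of two of at least 128 bytes, the backing
 * BO is sized by the grid dimensions (with what appears to be a 4x safety
 * factor), and the workgroup count and per-group size are expressed as log2
 * values for the hardware. */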
1312 void
1313 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1314 const struct pipe_grid_info *info,
1315 struct midgard_payload_vertex_tiler *vtp)
1316 {
1317 struct panfrost_context *ctx = batch->ctx;
1318 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1319 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1320 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1321 128));
1322 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1323 info->grid[2] * 4;
1324 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1325 shared_size,
1326 1);
1327
1328 struct mali_shared_memory shared = {
1329 .shared_memory = bo->gpu,
1330 .shared_workgroup_count =
1331 util_logbase2_ceil(info->grid[0]) +
1332 util_logbase2_ceil(info->grid[1]) +
1333 util_logbase2_ceil(info->grid[2]),
1334 .shared_unk1 = 0x2,
1335 .shared_shift = util_logbase2(single_size) - 1
1336 };
1337
1338 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1339 sizeof(shared));
1340 }
1341
1342 static mali_ptr
1343 panfrost_get_tex_desc(struct panfrost_batch *batch,
1344 enum pipe_shader_type st,
1345 struct panfrost_sampler_view *view)
1346 {
1347 if (!view)
1348 return (mali_ptr) 0;
1349
1350 struct pipe_sampler_view *pview = &view->base;
1351 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1352
1353 /* Add the BO to the job so it's retained until the job is done. */
1354
1355 panfrost_batch_add_bo(batch, rsrc->bo,
1356 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1357 panfrost_bo_access_for_stage(st));
1358
1359 panfrost_batch_add_bo(batch, view->midgard_bo,
1360 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1361 panfrost_bo_access_for_stage(st));
1362
1363 return view->midgard_bo->gpu;
1364 }
1365
1366 void
1367 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1368 enum pipe_shader_type stage,
1369 struct mali_vertex_tiler_postfix *postfix)
1370 {
1371 struct panfrost_context *ctx = batch->ctx;
1372 struct panfrost_device *device = pan_device(ctx->base.screen);
1373
1374 if (!ctx->sampler_view_count[stage])
1375 return;
1376
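/* Bifrost consumes an array of full texture descriptors copied inline into
 * transient memory, whereas Midgard consumes an array of 64-bit pointers
 * ("trampolines") to descriptors living in their own BOs. */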
1377 if (device->quirks & IS_BIFROST) {
1378 struct bifrost_texture_descriptor *descriptors;
1379
1380 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1381 ctx->sampler_view_count[stage]);
1382
1383 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1384 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1385 struct pipe_sampler_view *pview = &view->base;
1386 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1387
1388 /* Add the BOs to the job so they are retained until the job is done. */
1389
1390 panfrost_batch_add_bo(batch, rsrc->bo,
1391 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1392 panfrost_bo_access_for_stage(stage));
1393
1394 panfrost_batch_add_bo(batch, view->bifrost_bo,
1395 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1396 panfrost_bo_access_for_stage(stage));
1397
1398 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1399 }
1400
1401 postfix->textures = panfrost_upload_transient(batch,
1402 descriptors,
1403 sizeof(struct bifrost_texture_descriptor) *
1404 ctx->sampler_view_count[stage]);
1405
1406 free(descriptors);
1407 } else {
1408 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1409
1410 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1411 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1412 ctx->sampler_views[stage][i]);
1413
1414 postfix->textures = panfrost_upload_transient(batch,
1415 trampolines,
1416 sizeof(uint64_t) *
1417 ctx->sampler_view_count[stage]);
1418 }
1419 }
1420
1421 void
1422 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1423 enum pipe_shader_type stage,
1424 struct mali_vertex_tiler_postfix *postfix)
1425 {
1426 struct panfrost_context *ctx = batch->ctx;
1427 struct panfrost_device *device = pan_device(ctx->base.screen);
1428
1429 if (!ctx->sampler_count[stage])
1430 return;
1431
1432 if (device->quirks & IS_BIFROST) {
1433 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1434 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1435 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1436 transfer_size);
1437 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1438
1439 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1440 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1441
1442 postfix->sampler_descriptor = transfer.gpu;
1443 } else {
1444 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1445 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1446 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1447 transfer_size);
1448 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1449
1450 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1451 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1452
1453 postfix->sampler_descriptor = transfer.gpu;
1454 }
1455 }
1456
1457 void
1458 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1459 struct mali_vertex_tiler_postfix *vertex_postfix)
1460 {
1461 struct panfrost_context *ctx = batch->ctx;
1462
1463 if (!ctx->vertex)
1464 return;
1465
1466 struct panfrost_vertex_state *so = ctx->vertex;
1467
1468 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1469 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1470 sizeof(*so->hw) *
1471 PAN_MAX_ATTRIBUTE);
1472 }
1473
1474 void
1475 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1476 struct mali_vertex_tiler_postfix *vertex_postfix)
1477 {
1478 struct panfrost_context *ctx = batch->ctx;
1479 struct panfrost_vertex_state *so = ctx->vertex;
1480
1481 /* Staged mali_attr, and index into them. i =/= k, depending on the
1482 * vertex buffer mask and instancing. Twice as much room is allocated,
1483 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1484 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1485 unsigned k = 0;
1486
1487 for (unsigned i = 0; i < so->num_elements; ++i) {
1488 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1489 * means duplicating some vertex buffers (who cares? aside from
1490 * maybe some caching implications but I somehow doubt that
1491 * matters) */
1492
1493 struct pipe_vertex_element *elem = &so->pipe[i];
1494 unsigned vbi = elem->vertex_buffer_index;
1495
1496 /* The exception to 1:1 mapping is that we can have multiple
1497 * entries (NPOT divisors), so we fixup anyways */
1498
1499 so->hw[i].index = k;
1500
1501 if (!(ctx->vb_mask & (1 << vbi)))
1502 continue;
1503
1504 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1505 struct panfrost_resource *rsrc;
1506
1507 rsrc = pan_resource(buf->buffer.resource);
1508 if (!rsrc)
1509 continue;
1510
1511 /* Align to 64 bytes by masking off the lower bits. This
1512 * will be adjusted back when we fixup the src_offset in
1513 * mali_attr_meta */
1514
1515 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1516 mali_ptr addr = raw_addr & ~63;
1517 unsigned chopped_addr = raw_addr - addr;
1518
1519 /* Add a dependency of the batch on the vertex buffer */
1520 panfrost_batch_add_bo(batch, rsrc->bo,
1521 PAN_BO_ACCESS_SHARED |
1522 PAN_BO_ACCESS_READ |
1523 PAN_BO_ACCESS_VERTEX_TILER);
1524
1525 /* Set common fields */
1526 attrs[k].elements = addr;
1527 attrs[k].stride = buf->stride;
1528
1529 /* Since we advanced the base pointer, we shrink the buffer
1530 * size */
1531 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1532
1533 /* We need to add the extra size we masked off (for
1534 * correctness) so the data doesn't get clamped away */
1535 attrs[k].size += chopped_addr;
1536
1537 /* For non-instancing make sure we initialize */
1538 attrs[k].shift = attrs[k].extra_flags = 0;
1539
1540 /* Instancing uses a dramatically different code path than
1541 * linear, so dispatch for the actual emission now that the
1542 * common code is finished */
1543
1544 unsigned divisor = elem->instance_divisor;
1545
1546 if (divisor && ctx->instance_count == 1) {
1547 /* Silly corner case where there's a divisor(=1) but
1548 * there's no legitimate instancing. So we want *every*
1549 * attribute to be the same. So set stride to zero so
1550 * we don't go anywhere. */
1551
1552 attrs[k].size = attrs[k].stride + chopped_addr;
1553 attrs[k].stride = 0;
1554 attrs[k++].elements |= MALI_ATTR_LINEAR;
1555 } else if (ctx->instance_count <= 1) {
1556 /* Normal, non-instanced attributes */
1557 attrs[k++].elements |= MALI_ATTR_LINEAR;
1558 } else {
1559 unsigned instance_shift = vertex_postfix->instance_shift;
1560 unsigned instance_odd = vertex_postfix->instance_odd;
1561
1562 k += panfrost_vertex_instanced(ctx->padded_count,
1563 instance_shift,
1564 instance_odd,
1565 divisor, &attrs[k]);
1566 }
1567 }
1568
1569 /* Add special gl_VertexID/gl_InstanceID buffers */
1570
1571 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1572 so->hw[PAN_VERTEX_ID].index = k++;
1573 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1574 so->hw[PAN_INSTANCE_ID].index = k++;
1575
1576 /* Upload whatever we emitted and go */
1577
1578 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1579 k * sizeof(*attrs));
1580 }
1581
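/* Allocates transient storage for a linearly-addressed varying buffer of
 * stride * count bytes, points the mali_attr slot at it, and returns the GPU
 * address so callers can also reference the buffer directly (e.g. for the
 * position varying). */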
1582 static mali_ptr
1583 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1584 unsigned stride, unsigned count)
1585 {
1586 /* Fill out the descriptor */
1587 slot->stride = stride;
1588 slot->size = stride * count;
1589 slot->shift = slot->extra_flags = 0;
1590
1591 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1592 slot->size);
1593
1594 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1595
1596 return transfer.gpu;
1597 }
1598
1599 static void
1600 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1601 unsigned stride, unsigned offset, unsigned count,
1602 struct pipe_stream_output_target *target)
1603 {
1604 /* Fill out the descriptor */
1605 slot->stride = stride * 4;
1606 slot->shift = slot->extra_flags = 0;
1607
1608 unsigned max_size = target->buffer_size;
1609 unsigned expected_size = slot->stride * count;
1610
1611 slot->size = MIN2(max_size, expected_size);
1612
1613 /* Grab the BO and bind it to the batch */
1614 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1615
1616 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1617 * the perspective of the TILER and FRAGMENT.
1618 */
1619 panfrost_batch_add_bo(batch, bo,
1620 PAN_BO_ACCESS_SHARED |
1621 PAN_BO_ACCESS_RW |
1622 PAN_BO_ACCESS_VERTEX_TILER |
1623 PAN_BO_ACCESS_FRAGMENT);
1624
1625 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1626 slot->elements = addr;
1627 }
1628
1629 /* Given a shader and buffer indices, link varying metadata together */
1630
1631 static bool
1632 is_special_varying(gl_varying_slot loc)
1633 {
1634 switch (loc) {
1635 case VARYING_SLOT_POS:
1636 case VARYING_SLOT_PSIZ:
1637 case VARYING_SLOT_PNTC:
1638 case VARYING_SLOT_FACE:
1639 return true;
1640 default:
1641 return false;
1642 }
1643 }
1644
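/* Points each varying record at the buffer index it should source from: the
 * special slots (gl_Position, gl_PointSize, gl_PointCoord, gl_FrontFacing)
 * get dedicated buffers, and everything else falls back to the shared
 * general-purpose varying buffer. */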
1645 static void
1646 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1647 signed general, signed gl_Position,
1648 signed gl_PointSize, signed gl_PointCoord,
1649 signed gl_FrontFacing)
1650 {
1651 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1652
1653 for (unsigned i = 0; i < ss->varying_count; ++i) {
1654 gl_varying_slot location = ss->varyings_loc[i];
1655 int index = -1;
1656
1657 switch (location) {
1658 case VARYING_SLOT_POS:
1659 index = gl_Position;
1660 break;
1661 case VARYING_SLOT_PSIZ:
1662 index = gl_PointSize;
1663 break;
1664 case VARYING_SLOT_PNTC:
1665 index = gl_PointCoord;
1666 break;
1667 case VARYING_SLOT_FACE:
1668 index = gl_FrontFacing;
1669 break;
1670 default:
1671 index = general;
1672 break;
1673 }
1674
1675 assert(index >= 0);
1676 out[i].index = index;
1677 }
1678 }
1679
1680 static bool
1681 has_point_coord(unsigned mask, gl_varying_slot loc)
1682 {
1683 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1684 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1685 else if (loc == VARYING_SLOT_PNTC)
1686 return (mask & (1 << 8));
1687 else
1688 return false;
1689 }
1690
1691 /* Helpers for manipulating stream out information so we can pack varyings
1692 * accordingly. Compute the src_offset for a given captured varying */
1693
1694 static struct pipe_stream_output *
1695 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1696 {
1697 for (unsigned i = 0; i < info->num_outputs; ++i) {
1698 if (info->output[i].register_index == loc)
1699 return &info->output[i];
1700 }
1701
1702 unreachable("Varying not captured");
1703 }
1704
1705 void
1706 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1707 unsigned vertex_count,
1708 struct mali_vertex_tiler_postfix *vertex_postfix,
1709 struct mali_vertex_tiler_postfix *tiler_postfix,
1710 union midgard_primitive_size *primitive_size)
1711 {
1712 /* Load the shaders */
1713 struct panfrost_context *ctx = batch->ctx;
1714 struct panfrost_shader_state *vs, *fs;
1715 unsigned int num_gen_varyings = 0;
1716 size_t vs_size, fs_size;
1717
1718 /* Allocate the varying descriptor */
1719
1720 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1721 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1722 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1723 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1724
1725 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1726 vs_size +
1727 fs_size);
1728
1729 struct pipe_stream_output_info *so = &vs->stream_output;
1730
1731 /* Check if this varying is linked by us. This is the case for
1732 * general-purpose, non-captured varyings. If it is, link it. If it's
1733 * not, use the provided stream out information to determine the
1734 * offset, since it was already linked for us. */
1735
1736 for (unsigned i = 0; i < vs->varying_count; i++) {
1737 gl_varying_slot loc = vs->varyings_loc[i];
1738
1739 bool special = is_special_varying(loc);
1740 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1741
1742 if (captured) {
1743 struct pipe_stream_output *o = pan_get_so(so, loc);
1744
1745 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1746 vs->varyings[i].src_offset = dst_offset;
1747 } else if (!special) {
1748 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1749 }
1750 }
1751
1752 /* Conversely, we need to set src_offset for the captured varyings.
1753 * Here, the layout is defined by the stream out info, not us */
1754
1755 /* Link up with fragment varyings */
1756 bool reads_point_coord = fs->reads_point_coord;
1757
1758 for (unsigned i = 0; i < fs->varying_count; i++) {
1759 gl_varying_slot loc = fs->varyings_loc[i];
1760 unsigned src_offset;
1761 signed vs_idx = -1;
1762
1763 /* Link up */
1764 for (unsigned j = 0; j < vs->varying_count; ++j) {
1765 if (vs->varyings_loc[j] == loc) {
1766 vs_idx = j;
1767 break;
1768 }
1769 }
1770
1771 /* Either assign or reuse */
1772 if (vs_idx >= 0)
1773 src_offset = vs->varyings[vs_idx].src_offset;
1774 else
1775 src_offset = 16 * (num_gen_varyings++);
1776
1777 fs->varyings[i].src_offset = src_offset;
1778
1779 if (has_point_coord(fs->point_sprite_mask, loc))
1780 reads_point_coord = true;
1781 }
1782
1783 memcpy(trans.cpu, vs->varyings, vs_size);
1784 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1785
1786 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1787
1788 /* Figure out how many streamout buffers could be bound */
1789 unsigned so_count = ctx->streamout.num_targets;
1790 for (unsigned i = 0; i < vs->varying_count; i++) {
1791 gl_varying_slot loc = vs->varyings_loc[i];
1792
1793 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1794 if (!captured) continue;
1795
1796 struct pipe_stream_output *o = pan_get_so(so, loc);
1797 so_count = MAX2(so_count, o->output_buffer + 1);
1798 }
1799
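/* Allocate buffer indices: stream out buffers first, then the general
 * varying buffer, then one slot per special (gl_*) varying actually used */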
1800 signed idx = so_count;
1801 signed general = idx++;
1802 signed gl_Position = idx++;
1803 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1804 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1805 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1806 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1807
1808 /* Emit the stream out buffers */
1809
1810 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1811 ctx->vertex_count);
1812
1813 for (unsigned i = 0; i < so_count; ++i) {
1814 if (i < ctx->streamout.num_targets) {
1815 panfrost_emit_streamout(batch, &varyings[i],
1816 so->stride[i],
1817 ctx->streamout.offsets[i],
1818 out_count,
1819 ctx->streamout.targets[i]);
1820 } else {
1821 /* Emit a dummy buffer */
1822 panfrost_emit_varyings(batch, &varyings[i],
1823 so->stride[i] * 4,
1824 out_count);
1825
1826 /* Clear the attribute type */
1827 varyings[i].elements &= ~0xF;
1828 }
1829 }
1830
1831 panfrost_emit_varyings(batch, &varyings[general],
1832 num_gen_varyings * 16,
1833 vertex_count);
1834
1835 mali_ptr varyings_p;
1836
1837 /* fp32 vec4 gl_Position */
1838 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1839 sizeof(float) * 4, vertex_count);
1840 tiler_postfix->position_varying = varyings_p;
1841
1842
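/* gl_PointSize: 2 bytes per vertex (presumably fp16), only if written */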
1843 if (panfrost_writes_point_size(ctx)) {
1844 varyings_p = panfrost_emit_varyings(batch,
1845 &varyings[gl_PointSize],
1846 2, vertex_count);
1847 primitive_size->pointer = varyings_p;
1848 }
1849
1850 if (reads_point_coord)
1851 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1852
1853 if (fs->reads_face)
1854 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1855
1856 if (fs->reads_frag_coord)
1857 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1858
1859 struct panfrost_device *device = pan_device(ctx->base.screen);
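/* Reading gl_PointCoord is not handled on Bifrost in this path */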
1860 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1861
1862 /* Let's go ahead and link varying meta to the buffer in question, now
1863 * that this information is available. VARYING_SLOT_POS is mapped to
1864 * gl_FragCoord for fragment shaders but gl_Position for vertex
1865 * shaders. */
1866
1867 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1868 gl_PointSize, gl_PointCoord,
1869 gl_FrontFacing);
1870
1871 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1872 gl_FragCoord, gl_PointSize,
1873 gl_PointCoord, gl_FrontFacing);
1874
1875 /* Replace streamout: point captured varyings' meta at the stream out buffers */
1876
1877 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1878 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1879
1880 for (unsigned i = 0; i < vs->varying_count; i++) {
1881 gl_varying_slot loc = vs->varyings_loc[i];
1882
1883 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1884 if (!captured)
1885 continue;
1886
1887 struct pipe_stream_output *o = pan_get_so(so, loc);
1888 ovs[i].index = o->output_buffer;
1889
1890 assert(o->stream == 0);
1891 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1892 | MALI_NR_CHANNELS(o->num_components);
1893
1894 if (device->quirks & HAS_SWIZZLES)
1895 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1896 else
1897 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1898
1899 /* Link to the fragment */
1900 signed fs_idx = -1;
1901
1902 /* Link up */
1903 for (unsigned j = 0; j < fs->varying_count; ++j) {
1904 if (fs->varyings_loc[j] == loc) {
1905 fs_idx = j;
1906 break;
1907 }
1908 }
1909
1910 if (fs_idx >= 0) {
1911 ofs[fs_idx].index = ovs[i].index;
1912 ofs[fs_idx].format = ovs[i].format;
1913 ofs[fs_idx].swizzle = ovs[i].swizzle;
1914 }
1915 }
1916
1917 /* Replace point sprite: route replaced texture coordinates to gl_PointCoord */
1918 for (unsigned i = 0; i < fs->varying_count; i++) {
1919 /* If we have a point sprite replacement, handle that here. We
1920 * have to translate the location first. TODO: Flip Y in the shader
1921 * instead; we already key the shader, this is just a time crunch */
1922
1923 if (has_point_coord(fs->point_sprite_mask,
1924 fs->varyings_loc[i])) {
1925 ofs[i].index = gl_PointCoord;
1926
1927 /* Swizzle out the z/w to 0/1 */
1928 ofs[i].format = MALI_RG16F;
1929 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1930 }
1931 }
1932
1933 /* Fix up unaligned addresses */
1934 for (unsigned i = 0; i < so_count; ++i) {
1935 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1936 continue;
1937
1938 unsigned align = (varyings[i].elements & 63);
1939
1940 /* SO buffers are linear, so set that bit even when already aligned */
1941
1942 if (!align) {
1943 varyings[i].elements |= MALI_ATTR_LINEAR;
1944 continue;
1945 }
1946
1947 /* We need to adjust alignment */
1948 varyings[i].elements &= ~63;
1949 varyings[i].elements |= MALI_ATTR_LINEAR;
1950 varyings[i].size += align;
1951
1952 for (unsigned v = 0; v < vs->varying_count; ++v) {
1953 if (ovs[v].index != i)
1954 continue;
1955
1956 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1957 }
1958
1959 for (unsigned f = 0; f < fs->varying_count; ++f) {
1960 if (ofs[f].index != i)
1961 continue;
1962
1963 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1964 }
1965 }
1966
1967 varyings_p = panfrost_upload_transient(batch, varyings,
1968 idx * sizeof(*varyings));
1969 vertex_postfix->varyings = varyings_p;
1970 tiler_postfix->varyings = varyings_p;
1971
1972 vertex_postfix->varying_meta = trans.gpu;
1973 tiler_postfix->varying_meta = trans.gpu + vs_size;
1974 }
1975
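/* Build the vertex and tiler payloads (Bifrost or Midgard layout as
 * appropriate) and queue them as jobs on the batch */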
1976 void
1977 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1978 struct mali_vertex_tiler_prefix *vertex_prefix,
1979 struct mali_vertex_tiler_postfix *vertex_postfix,
1980 struct mali_vertex_tiler_prefix *tiler_prefix,
1981 struct mali_vertex_tiler_postfix *tiler_postfix,
1982 union midgard_primitive_size *primitive_size)
1983 {
1984 struct panfrost_context *ctx = batch->ctx;
1985 struct panfrost_device *device = pan_device(ctx->base.screen);
1986 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1987 struct bifrost_payload_vertex bifrost_vertex = {0,};
1988 struct bifrost_payload_tiler bifrost_tiler = {0,};
1989 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1990 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1991 void *vp, *tp;
1992 size_t vp_size, tp_size;
1993
1994 if (device->quirks & IS_BIFROST) {
1995 bifrost_vertex.prefix = *vertex_prefix;
1996 bifrost_vertex.postfix = *vertex_postfix;
1997 vp = &bifrost_vertex;
1998 vp_size = sizeof(bifrost_vertex);
1999
2000 bifrost_tiler.prefix = *tiler_prefix;
2001 bifrost_tiler.tiler.primitive_size = *primitive_size;
2002 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2003 bifrost_tiler.postfix = *tiler_postfix;
2004 tp = &bifrost_tiler;
2005 tp_size = sizeof(bifrost_tiler);
2006 } else {
2007 midgard_vertex.prefix = *vertex_prefix;
2008 midgard_vertex.postfix = *vertex_postfix;
2009 vp = &midgard_vertex;
2010 vp_size = sizeof(midgard_vertex);
2011
2012 midgard_tiler.prefix = *tiler_prefix;
2013 midgard_tiler.postfix = *tiler_postfix;
2014 midgard_tiler.primitive_size = *primitive_size;
2015 tp = &midgard_tiler;
2016 tp_size = sizeof(midgard_tiler);
2017 }
2018
2019 if (wallpapering) {
2020 /* Inject in reverse order, with "predicted" job indices.
2021 * THIS IS A HACK XXX */
2022 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2023 batch->job_index + 2, tp, tp_size, true);
2024 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2025 vp, vp_size, true);
2026 return;
2027 }
2028
2029 /* If rasterizer discard is enabled, only submit the vertex job */
2030
2031 bool rasterizer_discard = ctx->rasterizer &&
2032 ctx->rasterizer->base.rasterizer_discard;
2033
2034 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2035 vp, vp_size, false);
2036
2037 if (rasterizer_discard)
2038 return;
2039
2040 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2041 false);
2042 }
2043
2044 /* TODO: stop hardcoding this */
2045 mali_ptr
2046 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2047 {
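/* 48 (x, y) sample position pairs; presumably fixed point with 128 at the
 * pixel centre, but the exact layout is not understood (hence the TODO) */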
2048 uint16_t locations[] = {
2049 128, 128,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 128, 128,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 0, 0,
2093 0, 0,
2094 0, 0,
2095 0, 0,
2096 0, 0,
2097 };
2098
2099 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2100 }