/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/macros.h"
#include "util/u_prim.h"
#include "util/u_vbuf.h"

#include "panfrost-quirks.h"

#include "pan_allocate.h"
#include "pan_bo.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_job.h"

/* If a BO is accessed for a particular shader stage, will it be in the
 * primary batch (vertex/tiler) or the secondary batch (fragment)? Anything but
 * fragment will be primary, e.g. compute jobs will be considered
 * "vertex/tiler" by analogy */

static inline uint32_t
panfrost_bo_access_for_stage(enum pipe_shader_type stage)
{
        assert(stage == PIPE_SHADER_FRAGMENT ||
               stage == PIPE_SHADER_VERTEX ||
               stage == PIPE_SHADER_COMPUTE);

        return stage == PIPE_SHADER_FRAGMENT ?
               PAN_BO_ACCESS_FRAGMENT :
               PAN_BO_ACCESS_VERTEX_TILER;
}

static void
panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
                               struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);

        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
        struct mali_shared_memory shared = {
                .stack_shift = shift,
                .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
                .shared_workgroup_count = ~0,
        };
        postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
}

static void
panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
                               struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);

        /* If we haven't, reserve space for the framebuffer */

        if (!batch->framebuffer.gpu) {
                unsigned size = (dev->quirks & MIDGARD_SFBD) ?
                        sizeof(struct mali_single_framebuffer) :
                        sizeof(struct mali_framebuffer);

                batch->framebuffer = panfrost_allocate_transient(batch, size);

                /* Tag the pointer */
                if (!(dev->quirks & MIDGARD_SFBD))
                        batch->framebuffer.gpu |= MALI_MFBD;
        }

        postfix->shared_memory = batch->framebuffer.gpu;
}

static void
panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
                              struct mali_vertex_tiler_prefix *prefix,
                              struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_rasterizer *rasterizer = ctx->rasterizer;

        postfix->gl_enables |= 0x7;
        SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
                rasterizer && rasterizer->base.front_ccw);
        SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
                rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
        SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
                rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
        SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
                rasterizer && rasterizer->base.flatshade_first);
}

void
panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
                                  struct mali_vertex_tiler_prefix *prefix,
                                  union midgard_primitive_size *primitive_size)
{
        struct panfrost_rasterizer *rasterizer = ctx->rasterizer;

        if (!panfrost_writes_point_size(ctx)) {
                bool points = prefix->draw_mode == MALI_POINTS;
                float val = 0.0f;

                if (rasterizer)
                        val = points ?
                              rasterizer->base.point_size :
                              rasterizer->base.line_width;

                primitive_size->constant = val;
        }
}

static void
panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
                                   struct mali_vertex_tiler_postfix *postfix)
{
        SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
        if (ctx->occlusion_query)
                postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
        else
                postfix->occlusion_counter = 0;
}

void
panfrost_vt_init(struct panfrost_context *ctx,
                 enum pipe_shader_type stage,
                 struct mali_vertex_tiler_prefix *prefix,
                 struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_device *device = pan_device(ctx->base.screen);

        if (!ctx->shader[stage])
                return;

        memset(prefix, 0, sizeof(*prefix));
        memset(postfix, 0, sizeof(*postfix));

        if (device->quirks & IS_BIFROST) {
                postfix->gl_enables = 0x2;
                panfrost_vt_emit_shared_memory(ctx, postfix);
        } else {
                postfix->gl_enables = 0x6;
                panfrost_vt_attach_framebuffer(ctx, postfix);
        }

        if (stage == PIPE_SHADER_FRAGMENT) {
                panfrost_vt_update_occlusion_query(ctx, postfix);
                panfrost_vt_update_rasterizer(ctx, prefix, postfix);
        }
}

static unsigned
panfrost_translate_index_size(unsigned size)
{
        switch (size) {
        case 1:
                return MALI_DRAW_INDEXED_UINT8;

        case 2:
                return MALI_DRAW_INDEXED_UINT16;

        case 4:
                return MALI_DRAW_INDEXED_UINT32;

        default:
                unreachable("Invalid index size");
        }
}

/* Gets a GPU address for the associated index buffer. Only guaranteed to be
 * good for the duration of the draw (transient), could last longer. Also get
 * the bounds on the index buffer for the range accessed by the draw. We do
 * these operations together because there are natural optimizations which
 * require them to be together. */

static mali_ptr
panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
                                  const struct pipe_draw_info *info,
                                  unsigned *min_index, unsigned *max_index)
{
        struct panfrost_resource *rsrc = pan_resource(info->index.resource);
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
        off_t offset = info->start * info->index_size;
        bool needs_indices = true;
        mali_ptr out = 0;

        if (info->max_index != ~0u) {
                /* Driver state provides the bounds */
                *min_index = info->min_index;
                *max_index = info->max_index;
                needs_indices = false;
        }

        if (!info->has_user_indices) {
                /* Only resources can be directly mapped */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);
                out = rsrc->bo->gpu + offset;

                /* Check the cache */
                needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
                                                           info->start,
                                                           info->count,
                                                           min_index,
                                                           max_index);
        } else {
                /* Otherwise, we need to upload to transient memory */
                const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
                out = panfrost_upload_transient(batch, ibuf8 + offset,
                                                info->count *
                                                info->index_size);
        }

        if (needs_indices) {
                /* Fallback */
                u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);

                if (!info->has_user_indices)
                        panfrost_minmax_cache_add(rsrc->index_cache,
                                                  info->start, info->count,
                                                  *min_index, *max_index);
        }

        return out;
}

void
panfrost_vt_set_draw_info(struct panfrost_context *ctx,
                          const struct pipe_draw_info *info,
                          enum mali_draw_mode draw_mode,
                          struct mali_vertex_tiler_postfix *vertex_postfix,
                          struct mali_vertex_tiler_prefix *tiler_prefix,
                          struct mali_vertex_tiler_postfix *tiler_postfix,
                          unsigned *vertex_count,
                          unsigned *padded_count)
{
        tiler_prefix->draw_mode = draw_mode;

        unsigned draw_flags = 0;

        if (panfrost_writes_point_size(ctx))
                draw_flags |= MALI_DRAW_VARYING_SIZE;

        if (info->primitive_restart)
                draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;

        /* These don't make much sense */
        draw_flags |= 0x3000;

        if (info->index_size) {
                unsigned min_index = 0, max_index = 0;

                tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
                                                                          info,
                                                                          &min_index,
                                                                          &max_index);

                /* Use the corresponding values */
                *vertex_count = max_index - min_index + 1;
                tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
                tiler_prefix->offset_bias_correction = -min_index;
                tiler_prefix->index_count = MALI_POSITIVE(info->count);
                draw_flags |= panfrost_translate_index_size(info->index_size);
        } else {
                tiler_prefix->indices = 0;
                *vertex_count = ctx->vertex_count;
                tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
                tiler_prefix->offset_bias_correction = 0;
                tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
        }

        tiler_prefix->unknown_draw = draw_flags;

        /* Encode the padded vertex count */

        if (info->instance_count > 1) {
                *padded_count = panfrost_padded_vertex_count(*vertex_count);

                unsigned shift = __builtin_ctz(ctx->padded_count);
                unsigned k = ctx->padded_count >> (shift + 1);

                tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
                tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
        } else {
                *padded_count = *vertex_count;

                /* Reset instancing state */
                tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
                tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
        }
}

static void
panfrost_shader_meta_init(struct panfrost_context *ctx,
                          enum pipe_shader_type st,
                          struct mali_shader_meta *meta)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);

        memset(meta, 0, sizeof(*meta));
        meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
        meta->attribute_count = ss->attribute_count;
        meta->varying_count = ss->varying_count;
        meta->texture_count = ctx->sampler_view_count[st];
        meta->sampler_count = ctx->sampler_count[st];

        if (dev->quirks & IS_BIFROST) {
                if (st == PIPE_SHADER_VERTEX)
                        meta->bifrost1.unk1 = 0x800000;
                else {
                        /* First clause ATEST |= 0x4000000.
                         * Less than 32 regs |= 0x200 */
                        meta->bifrost1.unk1 = 0x950020;
                }

                meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
                if (st == PIPE_SHADER_VERTEX)
                        meta->bifrost2.preload_regs = 0xC0;
                else {
                        meta->bifrost2.preload_regs = 0x1;
                        SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
                }

                meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
                                                    ss->uniform_cutoff);
        } else {
                meta->midgard1.uniform_count = MIN2(ss->uniform_count,
                                                    ss->uniform_cutoff);
                meta->midgard1.work_count = ss->work_reg_count;

                /* TODO: This is not conformant on ES3 */
                meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;

                meta->midgard1.flags_lo = MALI_WRITES_GLOBAL | 0x20;
                meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
        }
}

static unsigned
panfrost_translate_compare_func(enum pipe_compare_func in)
{
        switch (in) {
        case PIPE_FUNC_NEVER:
                return MALI_FUNC_NEVER;

        case PIPE_FUNC_LESS:
                return MALI_FUNC_LESS;

        case PIPE_FUNC_EQUAL:
                return MALI_FUNC_EQUAL;

        case PIPE_FUNC_LEQUAL:
                return MALI_FUNC_LEQUAL;

        case PIPE_FUNC_GREATER:
                return MALI_FUNC_GREATER;

        case PIPE_FUNC_NOTEQUAL:
                return MALI_FUNC_NOTEQUAL;

        case PIPE_FUNC_GEQUAL:
                return MALI_FUNC_GEQUAL;

        case PIPE_FUNC_ALWAYS:
                return MALI_FUNC_ALWAYS;

        default:
                unreachable("Invalid func");
        }
}

static unsigned
panfrost_translate_stencil_op(enum pipe_stencil_op in)
{
        switch (in) {
        case PIPE_STENCIL_OP_KEEP:
                return MALI_STENCIL_KEEP;

        case PIPE_STENCIL_OP_ZERO:
                return MALI_STENCIL_ZERO;

        case PIPE_STENCIL_OP_REPLACE:
                return MALI_STENCIL_REPLACE;

        case PIPE_STENCIL_OP_INCR:
                return MALI_STENCIL_INCR;

        case PIPE_STENCIL_OP_DECR:
                return MALI_STENCIL_DECR;

        case PIPE_STENCIL_OP_INCR_WRAP:
                return MALI_STENCIL_INCR_WRAP;

        case PIPE_STENCIL_OP_DECR_WRAP:
                return MALI_STENCIL_DECR_WRAP;

        case PIPE_STENCIL_OP_INVERT:
                return MALI_STENCIL_INVERT;

        default:
                unreachable("Invalid stencil op");
        }
}

static enum mali_wrap_mode
translate_tex_wrap(enum pipe_tex_wrap w)
{
        switch (w) {
        case PIPE_TEX_WRAP_REPEAT:
                return MALI_WRAP_REPEAT;

        case PIPE_TEX_WRAP_CLAMP:
                return MALI_WRAP_CLAMP;

        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
                return MALI_WRAP_CLAMP_TO_EDGE;

        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
                return MALI_WRAP_CLAMP_TO_BORDER;

        case PIPE_TEX_WRAP_MIRROR_REPEAT:
                return MALI_WRAP_MIRRORED_REPEAT;

        case PIPE_TEX_WRAP_MIRROR_CLAMP:
                return MALI_WRAP_MIRRORED_CLAMP;

        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
                return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;

        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
                return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;

        default:
                unreachable("Invalid wrap");
        }
}

void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
                                struct mali_sampler_descriptor *hw)
{
        unsigned func = panfrost_translate_compare_func(cso->compare_func);
        bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
        bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
        bool mip_linear  = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
        unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
        unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
        unsigned mip_filter = mip_linear  ?
                              (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
        unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;

        *hw = (struct mali_sampler_descriptor) {
                .filter_mode = min_filter | mag_filter | mip_filter |
                               normalized,
                .wrap_s = translate_tex_wrap(cso->wrap_s),
                .wrap_t = translate_tex_wrap(cso->wrap_t),
                .wrap_r = translate_tex_wrap(cso->wrap_r),
                .compare_func = panfrost_flip_compare_func(func),
                .border_color = {
                        cso->border_color.f[0],
                        cso->border_color.f[1],
                        cso->border_color.f[2],
                        cso->border_color.f[3]
                },
                .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
                .max_lod = FIXED_16(cso->max_lod, false),
                .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
                .seamless_cube_map = cso->seamless_cube_map,
        };

        /* If necessary, we disable mipmapping in the sampler descriptor by
         * clamping the LOD as tight as possible (from 0 to epsilon,
         * essentially -- remember these are fixed point numbers, so
         * epsilon=1/256) */

        if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                hw->max_lod = hw->min_lod + 1;
}
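
/* Worked example for the LOD clamp above (illustrative, assuming the 8.8
 * fixed-point layout the epsilon comment implies): min_lod = 2.5f encodes as
 * 2.5 * 256 = 640, and with mipmapping off max_lod becomes min_lod + 1 = 641,
 * i.e. a clamp range of exactly 1/256 -- the epsilon referred to above. */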

void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
                                        struct bifrost_sampler_descriptor *hw)
{
        *hw = (struct bifrost_sampler_descriptor) {
                .unk1 = 0x1,
                .wrap_s = translate_tex_wrap(cso->wrap_s),
                .wrap_t = translate_tex_wrap(cso->wrap_t),
                .wrap_r = translate_tex_wrap(cso->wrap_r),
                .unk8 = 0x8,
                .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
                .norm_coords = cso->normalized_coords,
                .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
                .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
                .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
                .max_lod = FIXED_16(cso->max_lod, false),
        };

        /* If necessary, we disable mipmapping in the sampler descriptor by
         * clamping the LOD as tight as possible (from 0 to epsilon,
         * essentially -- remember these are fixed point numbers, so
         * epsilon=1/256) */

        if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                hw->max_lod = hw->min_lod + 1;
}

static void
panfrost_make_stencil_state(const struct pipe_stencil_state *in,
                            struct mali_stencil_test *out)
{
        out->ref = 0; /* Gallium gets it from elsewhere */

        out->mask = in->valuemask;
        out->func = panfrost_translate_compare_func(in->func);
        out->sfail = panfrost_translate_stencil_op(in->fail_op);
        out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
        out->dppass = panfrost_translate_stencil_op(in->zpass_op);
}

static void
panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
                                     struct mali_shader_meta *fragmeta)
{
        if (!ctx->rasterizer) {
                SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
                SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
                fragmeta->depth_units = 0.0f;
                fragmeta->depth_factor = 0.0f;
                SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
                SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
                return;
        }

        bool msaa = ctx->rasterizer->base.multisample;

        /* TODO: Sample size */
        SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
        SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
        fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
        fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;

        /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */

        SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
                ctx->rasterizer->base.offset_tri);
        SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
                ctx->rasterizer->base.offset_tri);
}

static void
panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
                              struct mali_shader_meta *fragmeta)
{
        const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
        int zfunc = PIPE_FUNC_ALWAYS;

        if (!zsa) {
                struct pipe_stencil_state default_stencil = {
                        .enabled = 0,
                        .func = PIPE_FUNC_ALWAYS,
                        .fail_op = MALI_STENCIL_KEEP,
                        .zfail_op = MALI_STENCIL_KEEP,
                        .zpass_op = MALI_STENCIL_KEEP,
                        .writemask = 0xFF
                };

                panfrost_make_stencil_state(&default_stencil,
                                            &fragmeta->stencil_front);
                fragmeta->stencil_mask_front = default_stencil.writemask;
                fragmeta->stencil_back = fragmeta->stencil_front;
                fragmeta->stencil_mask_back = default_stencil.writemask;
                SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
                SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
        } else {
                SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
                        zsa->stencil[0].enabled);
                panfrost_make_stencil_state(&zsa->stencil[0],
                                            &fragmeta->stencil_front);
                fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
                fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];

                /* If back-stencil is not enabled, use the front values */

                if (zsa->stencil[1].enabled) {
                        panfrost_make_stencil_state(&zsa->stencil[1],
                                                    &fragmeta->stencil_back);
                        fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
                        fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
                } else {
                        fragmeta->stencil_back = fragmeta->stencil_front;
                        fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
                        fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
                }

                if (zsa->depth.enabled)
                        zfunc = zsa->depth.func;

                /* Depth state (TODO: Refactor) */

                SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
                        zsa->depth.writemask);
        }

        fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
        fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
}

static bool
panfrost_fs_required(
                struct panfrost_shader_state *fs,
                struct panfrost_blend_final *blend,
                unsigned rt_count)
{
        /* If we generally have side effects */
        if (fs->fs_sidefx)
                return true;

        /* If colour is written we need to execute */
        for (unsigned i = 0; i < rt_count; ++i) {
                if (!blend[i].no_colour)
                        return true;
        }

        /* If depth is written and not implied we need to execute.
         * TODO: Predicate on Z/S writes being enabled */
        return (fs->writes_depth || fs->writes_stencil);
}

static void
panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
                                struct mali_shader_meta *fragmeta,
                                void *rts)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *fs;
        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
                (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
                !ctx->blend->base.dither);

        /* Get blending setup */
        unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);

        struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
        unsigned shader_offset = 0;
        struct panfrost_bo *shader_bo = NULL;

        for (unsigned c = 0; c < rt_count; ++c)
                blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
                                                          &shader_offset);

        /* Disable shader execution if we can */
        if (dev->quirks & MIDGARD_SHADERLESS
                        && !panfrost_fs_required(fs, blend, rt_count)) {
                fragmeta->shader = 0;
                fragmeta->attribute_count = 0;
                fragmeta->varying_count = 0;
                fragmeta->texture_count = 0;
                fragmeta->sampler_count = 0;

                /* This feature is not known to work on Bifrost */
                fragmeta->midgard1.work_count = 1;
                fragmeta->midgard1.uniform_count = 0;
                fragmeta->midgard1.uniform_buffer_count = 0;
        }

        /* If there is a blend shader, work registers are shared. We impose 8
         * work registers as a limit for blend shaders. Should be lower XXX */

        if (!(dev->quirks & IS_BIFROST)) {
                for (unsigned c = 0; c < rt_count; ++c) {
                        if (blend[c].is_shader) {
                                fragmeta->midgard1.work_count =
                                        MAX2(fragmeta->midgard1.work_count, 8);
                        }
                }
        }

        /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
         * copied to the blend_meta appended (by convention), but this is the
         * field actually read by the hardware. (Or maybe both are read...?).
         * Specify the last RTi with a blend shader. */

        fragmeta->blend.shader = 0;

        for (signed rt = (rt_count - 1); rt >= 0; --rt) {
                if (!blend[rt].is_shader)
                        continue;

                fragmeta->blend.shader = blend[rt].shader.gpu |
                                         blend[rt].shader.first_tag;
                break;
        }

        if (dev->quirks & MIDGARD_SFBD) {
                /* When only a single render target platform is used, the blend
                 * information is inside the shader meta itself. We additionally
                 * need to signal CAN_DISCARD for nontrivial blend modes (so
                 * we're able to read back the destination buffer) */

                SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
                        blend[0].is_shader);

                if (!blend[0].is_shader) {
                        fragmeta->blend.equation = *blend[0].equation.equation;
                        fragmeta->blend.constant = blend[0].equation.constant;
                }

                SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
                        !blend[0].no_blending || fs->can_discard);
                return;
        }

        if (dev->quirks & IS_BIFROST) {
                bool no_blend = true;

                for (unsigned i = 0; i < rt_count; ++i)
                        no_blend &= (blend[i].no_blending | blend[i].no_colour);

                SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
                        !fs->can_discard && !fs->writes_depth && no_blend);
        }

        /* Additional blend descriptor tacked on for jobs using MFBD */

        for (unsigned i = 0; i < rt_count; ++i) {
                unsigned flags = 0;

                if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
                        flags = 0x200;

                        bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
                                       (ctx->pipe_framebuffer.cbufs[i]) &&
                                       util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);

                        SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
                        SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
                        SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
                        SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
                }

                if (dev->quirks & IS_BIFROST) {
                        struct bifrost_blend_rt *brts = rts;

                        brts[i].flags = flags;

                        if (blend[i].is_shader) {
                                /* The blend shader's address needs to be at
                                 * the same top 32 bit as the fragment shader.
                                 * TODO: Ensure that's always the case.
                                 */
                                assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
                                       (fs->bo->gpu & (0xffffffffull << 32)));
                                brts[i].shader = blend[i].shader.gpu;
                                brts[i].unk2 = 0x0;
                        } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
                                enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
                                const struct util_format_description *format_desc;
                                format_desc = util_format_description(format);

                                brts[i].equation = *blend[i].equation.equation;

                                /* TODO: this is a bit more complicated */
                                brts[i].constant = blend[i].equation.constant;

                                brts[i].format = panfrost_format_to_bifrost_blend(format_desc);

                                /* 0x19 disables blending and forces REPLACE
                                 * mode (equivalent to rgb_mode = alpha_mode =
                                 * x122, colour mask = 0xF). 0x1a allows
                                 * blending. */
                                brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;

                                brts[i].shader_type = fs->blend_types[i];
                        } else {
                                /* Dummy attachment for depth-only */
                                brts[i].unk2 = 0x3;
                                brts[i].shader_type = fs->blend_types[i];
                        }
                } else {
                        struct midgard_blend_rt *mrts = rts;

                        mrts[i].flags = flags;

                        if (blend[i].is_shader) {
                                mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
                        } else {
                                mrts[i].blend.equation = *blend[i].equation.equation;
                                mrts[i].blend.constant = blend[i].equation.constant;
                        }
                }
        }
}

static void
panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
                               struct mali_shader_meta *fragmeta,
                               void *rts)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *fs;

        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
        fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
        fragmeta->unknown2_4 = 0x4e0;

        /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
         * is required (independent of 32-bit/64-bit descriptors), or why it's
         * not used on later GPU revisions. Otherwise, all shader jobs fault on
         * these earlier chips (perhaps this is a chicken bit of some kind).
         * More investigation is needed. */

        SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);

        if (dev->quirks & IS_BIFROST) {
                /* TODO */
        } else {
                /* Depending on whether it's legal to do so in the given
                 * shader, we try to enable early-z testing (or forward-pixel
                 * kill?) */

                SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
                        !fs->can_discard && !fs->writes_depth);

                /* Add the writes Z/S flags if needed. */
                SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
                SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);

                /* Any time texturing is used, derivatives are implicitly calculated,
                 * so we need to enable helper invocations */

                SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
                        fs->helper_invocations);

                const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;

                bool depth_enabled = fs->writes_depth ||
                        (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);

                SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
                SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
        }

        panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
        panfrost_frag_meta_zsa_update(ctx, fragmeta);
        panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
}

void
panfrost_emit_shader_meta(struct panfrost_batch *batch,
                          enum pipe_shader_type st,
                          struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);

        if (!ss) {
                postfix->shader = 0;
                return;
        }

        struct mali_shader_meta meta;

        panfrost_shader_meta_init(ctx, st, &meta);

        /* Add the shader BO to the batch. */
        panfrost_batch_add_bo(batch, ss->bo,
                              PAN_BO_ACCESS_PRIVATE |
                              PAN_BO_ACCESS_READ |
                              panfrost_bo_access_for_stage(st));

        mali_ptr shader_ptr;

        if (st == PIPE_SHADER_FRAGMENT) {
                struct panfrost_device *dev = pan_device(ctx->base.screen);
                unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
                size_t desc_size = sizeof(meta);
                void *rts = NULL;
                struct panfrost_transfer xfer;
                unsigned rt_size;

                if (dev->quirks & MIDGARD_SFBD)
                        rt_size = 0;
                else if (dev->quirks & IS_BIFROST)
                        rt_size = sizeof(struct bifrost_blend_rt);
                else
                        rt_size = sizeof(struct midgard_blend_rt);

                desc_size += rt_size * rt_count;

                rts = rzalloc_size(ctx, rt_size * rt_count);

                panfrost_frag_shader_meta_init(ctx, &meta, rts);

                xfer = panfrost_allocate_transient(batch, desc_size);

                memcpy(xfer.cpu, &meta, sizeof(meta));
                memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);

                ralloc_free(rts);

                shader_ptr = xfer.gpu;
        } else {
                shader_ptr = panfrost_upload_transient(batch, &meta,
                                                       sizeof(meta));
        }

        postfix->shader = shader_ptr;
}

static void
panfrost_mali_viewport_init(struct panfrost_context *ctx,
                            struct mali_viewport *mvp)
{
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        /* Clip bounds are encoded as floats. The viewport itself is encoded as
         * (somewhat) asymmetric ints. */

        const struct pipe_scissor_state *ss = &ctx->scissor;

        memset(mvp, 0, sizeof(*mvp));

        /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
         * each direction. Clipping to the viewport in theory should work, but
         * in practice causes issues when we're not explicitly trying to
         * scissor */

        *mvp = (struct mali_viewport) {
                .clip_minx = -INFINITY,
                .clip_miny = -INFINITY,
                .clip_maxx = INFINITY,
                .clip_maxy = INFINITY,
        };

        /* Always scissor to the viewport by default. */
        float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
        float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));

        float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
        float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));

        float minz = (vp->translate[2] - fabsf(vp->scale[2]));
        float maxz = (vp->translate[2] + fabsf(vp->scale[2]));

        /* Apply the scissor test */

        unsigned minx, miny, maxx, maxy;

        if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
                minx = MAX2(ss->minx, vp_minx);
                miny = MAX2(ss->miny, vp_miny);
                maxx = MIN2(ss->maxx, vp_maxx);
                maxy = MIN2(ss->maxy, vp_maxy);
        } else {
                minx = vp_minx;
                miny = vp_miny;
                maxx = vp_maxx;
                maxy = vp_maxy;
        }

        /* Hardware needs the min/max to be strictly ordered, so flip if we
         * need to. The viewport transformation in the vertex shader will
         * handle the negatives if we don't */

        if (miny > maxy) {
                unsigned temp = miny;
                miny = maxy;
                maxy = temp;
        }

        if (minx > maxx) {
                unsigned temp = minx;
                minx = maxx;
                maxx = temp;
        }

        if (minz > maxz) {
                float temp = minz;
                minz = maxz;
                maxz = temp;
        }

        /* Clamp to the framebuffer size as a last check */

        minx = MIN2(ctx->pipe_framebuffer.width, minx);
        maxx = MIN2(ctx->pipe_framebuffer.width, maxx);

        miny = MIN2(ctx->pipe_framebuffer.height, miny);
        maxy = MIN2(ctx->pipe_framebuffer.height, maxy);

        /* Upload */

        mvp->viewport0[0] = minx;
        mvp->viewport1[0] = MALI_POSITIVE(maxx);

        mvp->viewport0[1] = miny;
        mvp->viewport1[1] = MALI_POSITIVE(maxy);

        mvp->clip_minz = minz;
        mvp->clip_maxz = maxz;
}
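
/* Worked example for the viewport math above (illustrative): a viewport
 * covering an 800x600 framebuffer has translate = (400, 300) and
 * scale = (400, 300), giving vp_minx = 400 - |400| = 0 and
 * vp_maxx = 400 + |400| = 800. A y-flipped viewport merely negates scale[1],
 * which is why fabsf() is applied and why the min/max flip above may still
 * be needed. */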

void
panfrost_emit_viewport(struct panfrost_batch *batch,
                       struct mali_vertex_tiler_postfix *tiler_postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct mali_viewport mvp;

        panfrost_mali_viewport_init(batch->ctx, &mvp);

        /* Update the job, unless we're doing wallpapering (whose lack of
         * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
         * just... be faster :) */

        if (!ctx->wallpaper_batch)
                panfrost_batch_union_scissor(batch, mvp.viewport0[0],
                                             mvp.viewport0[1],
                                             mvp.viewport1[0] + 1,
                                             mvp.viewport1[1] + 1);

        tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
                                                            sizeof(mvp));
}

static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
                                 enum pipe_shader_type st,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      panfrost_bo_access_for_stage(st));

                /* Alignment guaranteed by
                 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
                return rsrc->bo->gpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return panfrost_upload_transient(batch,
                                                 cb->user_buffer +
                                                 cb->buffer_offset,
                                                 cb->buffer_size);
        } else {
                unreachable("No constant buffer");
        }
}

struct sysval_uniform {
        union {
                float f[4];
                int32_t i[4];
                uint32_t u[4];
                uint64_t du[2];
        };
};

static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
                                      struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->scale[0];
        uniform->f[1] = vp->scale[1];
        uniform->f[2] = vp->scale[2];
}

static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->translate[0];
        uniform->f[1] = vp->translate[1];
        uniform->f[2] = vp->translate[2];
}

static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
                                       enum pipe_shader_type st,
                                       unsigned int sysvalid,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
        bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
        struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;

        assert(dim);
        uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);

        if (dim > 1)
                uniform->i[1] = u_minify(tex->texture->height0,
                                         tex->u.tex.first_level);

        if (dim > 2)
                uniform->i[2] = u_minify(tex->texture->depth0,
                                         tex->u.tex.first_level);

        if (is_array)
                uniform->i[dim] = tex->texture->array_size;
}

static void
panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
                            enum pipe_shader_type st,
                            unsigned ssbo_id,
                            struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
        struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];

        /* Compute address */
        struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;

        panfrost_batch_add_bo(batch, bo,
                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
                              panfrost_bo_access_for_stage(st));

        /* Upload address and size as sysval */
        uniform->du[0] = bo->gpu + sb.buffer_offset;
        uniform->u[2] = sb.buffer_size;
}

static void
panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
                               enum pipe_shader_type st,
                               unsigned samp_idx,
                               struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;

        uniform->f[0] = sampl->min_lod;
        uniform->f[1] = sampl->max_lod;
        uniform->f[2] = sampl->lod_bias;

        /* Even without any errata, Midgard represents "no mipmapping" as
         * fixing the LOD with the clamps; keep behaviour consistent. c.f.
         * panfrost_create_sampler_state which also explains our choice of
         * epsilon value (again to keep behaviour consistent) */

        if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                uniform->f[1] = uniform->f[0] + (1.0/256.0);
}

static void
panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        uniform->u[0] = ctx->compute_grid->grid[0];
        uniform->u[1] = ctx->compute_grid->grid[1];
        uniform->u[2] = ctx->compute_grid->grid[2];
}

static void
panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
                        struct panfrost_shader_state *ss,
                        enum pipe_shader_type st)
{
        struct sysval_uniform *uniforms = (void *)buf;

        for (unsigned i = 0; i < ss->sysval_count; ++i) {
                int sysval = ss->sysval[i];

                switch (PAN_SYSVAL_TYPE(sysval)) {
                case PAN_SYSVAL_VIEWPORT_SCALE:
                        panfrost_upload_viewport_scale_sysval(batch,
                                                              &uniforms[i]);
                        break;
                case PAN_SYSVAL_VIEWPORT_OFFSET:
                        panfrost_upload_viewport_offset_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_TEXTURE_SIZE:
                        panfrost_upload_txs_sysval(batch, st,
                                                   PAN_SYSVAL_ID(sysval),
                                                   &uniforms[i]);
                        break;
                case PAN_SYSVAL_SSBO:
                        panfrost_upload_ssbo_sysval(batch, st,
                                                    PAN_SYSVAL_ID(sysval),
                                                    &uniforms[i]);
                        break;
                case PAN_SYSVAL_NUM_WORK_GROUPS:
                        panfrost_upload_num_work_groups_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_SAMPLER:
                        panfrost_upload_sampler_sysval(batch, st,
                                                       PAN_SYSVAL_ID(sysval),
                                                       &uniforms[i]);
                        break;
                default:
                        assert(0);
                }
        }
}
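
/* Layout note (illustrative): each sysval occupies one 16-byte vec4 slot of
 * struct sysval_uniform, so a shader whose sysval list is, say,
 * { VIEWPORT_SCALE, NUM_WORK_GROUPS } gets uniforms[0].f[0..2] = scale and
 * uniforms[1].u[0..2] = grid dimensions, packed ahead of the ordinary
 * uniforms that the caller appends after this prefix. */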

static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc)
                return rsrc->bo->cpu;
        else if (cb->user_buffer)
                return cb->user_buffer;
        else
                unreachable("No constant buffer");
}

void
panfrost_emit_const_buf(struct panfrost_batch *batch,
                        enum pipe_shader_type stage,
                        struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_variants *all = ctx->shader[stage];

        if (!all)
                return;

        struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];

        struct panfrost_shader_state *ss = &all->variants[all->active_variant];

        /* Uniforms are implicitly UBO #0 */
        bool has_uniforms = buf->enabled_mask & (1 << 0);

        /* Allocate room for the sysval and the uniforms */
        size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
        size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
        size_t size = sys_size + uniform_size;
        struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
                                                                        size);

        /* Upload sysvals requested by the shader */
        panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);

        /* Upload uniforms */
        if (has_uniforms && uniform_size) {
                const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
                memcpy(transfer.cpu + sys_size, cpu, uniform_size);
        }

        /* Next up, attach UBOs. UBO #0 is the uniforms we just
         * uploaded */

        unsigned ubo_count = panfrost_ubo_count(ctx, stage);
        assert(ubo_count >= 1);

        size_t sz = sizeof(uint64_t) * ubo_count;
        uint64_t ubos[PAN_MAX_CONST_BUFFERS];
        int uniform_count = ss->uniform_count;

        /* Upload uniforms as a UBO */
        ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);

        /* The rest are honest-to-goodness UBOs */

        for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
                size_t usz = buf->cb[ubo].buffer_size;
                bool enabled = buf->enabled_mask & (1 << ubo);
                bool empty = usz == 0;

                if (!enabled || empty) {
                        /* Stub out disabled UBOs to catch accesses */
                        ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
                        continue;
                }

                mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
                                                                buf, ubo);

                unsigned bytes_per_field = 16;
                unsigned aligned = ALIGN_POT(usz, bytes_per_field);
                ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
        }

        mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
        postfix->uniforms = transfer.gpu;
        postfix->uniform_buffers = ubufs;

        buf->dirty_mask = 0;
}
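
/* A worked example of the UBO size encoding above (an illustrative sketch,
 * not part of the original driver): the descriptor counts 16-byte fields, so
 * a 100-byte UBO is padded to ALIGN_POT(100, 16) = 112 bytes and encoded as
 * 112 / 16 = 7 fields. The helper below (our name, mirroring the arithmetic
 * in panfrost_emit_const_buf) makes that explicit. */

static inline unsigned
pan_example_ubo_field_count(size_t buffer_size)
{
        const size_t bytes_per_field = 16;

        /* Round up to a whole number of 16-byte fields */
        return ALIGN_POT(buffer_size, bytes_per_field) / bytes_per_field;
}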

void
panfrost_emit_shared_memory(struct panfrost_batch *batch,
                            const struct pipe_grid_info *info,
                            struct midgard_payload_vertex_tiler *vtp)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
        unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
                                                           128));
        unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
                               info->grid[2] * 4;
        struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
                                                                  shared_size,
                                                                  1);

        struct mali_shared_memory shared = {
                .shared_memory = bo->gpu,
                .shared_workgroup_count =
                        util_logbase2_ceil(info->grid[0]) +
                        util_logbase2_ceil(info->grid[1]) +
                        util_logbase2_ceil(info->grid[2]),
                .shared_unk1 = 0x2,
                .shared_shift = util_logbase2(single_size) - 1
        };

        vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
                                                               sizeof(shared));
}
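
/* Worked example for the sizing above (illustrative): a 4x4x1 grid with at
 * most 128 bytes of shared storage per workgroup gives single_size = 128,
 * shared_workgroup_count = log2ceil(4) + log2ceil(4) + log2ceil(1) = 4, and
 * shared_shift = log2(128) - 1 = 6, with the BO sized to hold every
 * workgroup in the grid at once. */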

static mali_ptr
panfrost_get_tex_desc(struct panfrost_batch *batch,
                      enum pipe_shader_type st,
                      struct panfrost_sampler_view *view)
{
        if (!view)
                return (mali_ptr) 0;

        struct pipe_sampler_view *pview = &view->base;
        struct panfrost_resource *rsrc = pan_resource(pview->texture);

        /* Add the BO to the job so it's retained until the job is done. */

        panfrost_batch_add_bo(batch, rsrc->bo,
                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
                              panfrost_bo_access_for_stage(st));

        panfrost_batch_add_bo(batch, view->midgard_bo,
                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
                              panfrost_bo_access_for_stage(st));

        return view->midgard_bo->gpu;
}

void
panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
                                  enum pipe_shader_type stage,
                                  struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *device = pan_device(ctx->base.screen);

        if (!ctx->sampler_view_count[stage])
                return;

        if (device->quirks & IS_BIFROST) {
                struct bifrost_texture_descriptor *descriptors;

                descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
                                     ctx->sampler_view_count[stage]);

                for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
                        struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
                        struct pipe_sampler_view *pview = &view->base;
                        struct panfrost_resource *rsrc = pan_resource(pview->texture);

                        /* Add the BOs to the job so they are retained until the job is done. */

                        panfrost_batch_add_bo(batch, rsrc->bo,
                                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
                                              panfrost_bo_access_for_stage(stage));

                        panfrost_batch_add_bo(batch, view->bifrost_bo,
                                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
                                              panfrost_bo_access_for_stage(stage));

                        memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
                }

                postfix->textures = panfrost_upload_transient(batch,
                                                              descriptors,
                                                              sizeof(struct bifrost_texture_descriptor) *
                                                              ctx->sampler_view_count[stage]);

                free(descriptors);
        } else {
                uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];

                for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
                        trampolines[i] = panfrost_get_tex_desc(batch, stage,
                                                               ctx->sampler_views[stage][i]);

                postfix->textures = panfrost_upload_transient(batch,
                                                              trampolines,
                                                              sizeof(uint64_t) *
                                                              ctx->sampler_view_count[stage]);
        }
}

void
panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
                                  enum pipe_shader_type stage,
                                  struct mali_vertex_tiler_postfix *postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *device = pan_device(ctx->base.screen);

        if (!ctx->sampler_count[stage])
                return;

        if (device->quirks & IS_BIFROST) {
                size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
                size_t transfer_size = desc_size * ctx->sampler_count[stage];
                struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
                                                                                transfer_size);
                struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;

                for (int i = 0; i < ctx->sampler_count[stage]; ++i)
                        desc[i] = ctx->samplers[stage][i]->bifrost_hw;

                postfix->sampler_descriptor = transfer.gpu;
        } else {
                size_t desc_size = sizeof(struct mali_sampler_descriptor);
                size_t transfer_size = desc_size * ctx->sampler_count[stage];
                struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
                                                                                transfer_size);
                struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;

                for (int i = 0; i < ctx->sampler_count[stage]; ++i)
                        desc[i] = ctx->samplers[stage][i]->midgard_hw;

                postfix->sampler_descriptor = transfer.gpu;
        }
}

void
panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
                               struct mali_vertex_tiler_postfix *vertex_postfix)
{
        struct panfrost_context *ctx = batch->ctx;

        if (!ctx->vertex)
                return;

        struct panfrost_vertex_state *so = ctx->vertex;

        panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
        vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
                                                                   sizeof(*so->hw) *
                                                                   PAN_MAX_ATTRIBUTE);
}

void
panfrost_emit_vertex_data(struct panfrost_batch *batch,
                          struct mali_vertex_tiler_postfix *vertex_postfix)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;

        /* Staged mali_attr, and index into them. i =/= k, depending on the
         * vertex buffer mask and instancing. Twice as much room is allocated,
         * for a worst case of NPOT_DIVIDEs which take up extra slot */
        union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
        unsigned k = 0;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;

                /* The exception to 1:1 mapping is that we can have multiple
                 * entries (NPOT divisors), so we fixup anyways */

                so->hw[i].index = k;

                if (!(ctx->vb_mask & (1 << vbi)))
                        continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc;

                rsrc = pan_resource(buf->buffer.resource);
                if (!rsrc)
                        continue;

                /* Align to 64 bytes by masking off the lower bits. This
                 * will be adjusted back when we fixup the src_offset in
                 * mali_attr_meta */

                mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;
                unsigned chopped_addr = raw_addr - addr;

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Set common fields */
                attrs[k].elements = addr;
                attrs[k].stride = buf->stride;

                /* Since we advanced the base pointer, we shrink the buffer
                 * size */
                attrs[k].size = rsrc->base.width0 - buf->buffer_offset;

                /* We need to add the extra size we masked off (for
                 * correctness) so the data doesn't get clamped away */
                attrs[k].size += chopped_addr;

                /* For non-instancing make sure we initialize */
                attrs[k].shift = attrs[k].extra_flags = 0;

                /* Instancing uses a dramatically different code path than
                 * linear, so dispatch for the actual emission now that the
                 * common code is finished */

                unsigned divisor = elem->instance_divisor;

                if (divisor && ctx->instance_count == 1) {
                        /* Silly corner case where there's a divisor(=1) but
                         * there's no legitimate instancing. So we want *every*
                         * attribute to be the same. So set stride to zero so
                         * we don't go anywhere. */

                        attrs[k].size = attrs[k].stride + chopped_addr;
                        attrs[k].stride = 0;
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else if (ctx->instance_count <= 1) {
                        /* Normal, non-instanced attributes */
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else {
                        unsigned instance_shift = vertex_postfix->instance_shift;
                        unsigned instance_odd = vertex_postfix->instance_odd;

                        k += panfrost_vertex_instanced(ctx->padded_count,
                                                       instance_shift,
                                                       instance_odd,
                                                       divisor, &attrs[k]);
                }
        }

        /* Add special gl_VertexID/gl_InstanceID buffers */

        panfrost_vertex_id(ctx->padded_count, &attrs[k]);
        so->hw[PAN_VERTEX_ID].index = k++;
        panfrost_instance_id(ctx->padded_count, &attrs[k]);
        so->hw[PAN_INSTANCE_ID].index = k++;

        /* Upload whatever we emitted and go */

        vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
                                                               k * sizeof(*attrs));
}
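
/* A worked example of the 64-byte alignment trick above (illustrative): for
 * raw_addr = 0x10000044, addr = raw_addr & ~63 = 0x10000040 and
 * chopped_addr = 4. The 4 bytes are added back into attrs[k].size so the
 * hardware's bounds check still covers the real data, and the same 4-byte
 * offset is restored later via the attribute meta's src_offset fixup. */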

static mali_ptr
panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
                       unsigned stride, unsigned count)
{
        /* Fill out the descriptor */
        slot->stride = stride;
        slot->size = stride * count;
        slot->shift = slot->extra_flags = 0;

        struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
                                                                        slot->size);

        slot->elements = transfer.gpu | MALI_ATTR_LINEAR;

        return transfer.gpu;
}

static void
panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
                        unsigned stride, unsigned offset, unsigned count,
                        struct pipe_stream_output_target *target)
{
        /* Fill out the descriptor */
        slot->stride = stride * 4;
        slot->shift = slot->extra_flags = 0;

        unsigned max_size = target->buffer_size;
        unsigned expected_size = slot->stride * count;

        slot->size = MIN2(max_size, expected_size);

        /* Grab the BO and bind it to the batch */
        struct panfrost_bo *bo = pan_resource(target->buffer)->bo;

        /* Varyings are WRITE from the perspective of the VERTEX but READ from
         * the perspective of the TILER and FRAGMENT.
         */
        panfrost_batch_add_bo(batch, bo,
                              PAN_BO_ACCESS_SHARED |
                              PAN_BO_ACCESS_RW |
                              PAN_BO_ACCESS_VERTEX_TILER |
                              PAN_BO_ACCESS_FRAGMENT);

        mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
        slot->elements = addr;
}

/* Given a shader and buffer indices, link varying metadata together */

static bool
is_special_varying(gl_varying_slot loc)
{
        switch (loc) {
        case VARYING_SLOT_POS:
        case VARYING_SLOT_PSIZ:
        case VARYING_SLOT_PNTC:
        case VARYING_SLOT_FACE:
                return true;
        default:
                return false;
        }
}

static void
panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
                           signed general, signed gl_Position,
                           signed gl_PointSize, signed gl_PointCoord,
                           signed gl_FrontFacing)
{
        struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;

        for (unsigned i = 0; i < ss->varying_count; ++i) {
                gl_varying_slot location = ss->varyings_loc[i];
                int index = -1;

                switch (location) {
                case VARYING_SLOT_POS:
                        index = gl_Position;
                        break;
                case VARYING_SLOT_PSIZ:
                        index = gl_PointSize;
                        break;
                case VARYING_SLOT_PNTC:
                        index = gl_PointCoord;
                        break;
                case VARYING_SLOT_FACE:
                        index = gl_FrontFacing;
                        break;
                default:
                        index = general;
                        break;
                }

                out[i].index = index;
        }
}

static bool
has_point_coord(unsigned mask, gl_varying_slot loc)
{
        if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
                return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
        else if (loc == VARYING_SLOT_PNTC)
                return (mask & (1 << 8));
        else
                return false;
}

/* Helpers for manipulating stream out information so we can pack varyings
 * accordingly. Compute the src_offset for a given captured varying */

static struct pipe_stream_output *
pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
{
        for (unsigned i = 0; i < info->num_outputs; ++i) {
                if (info->output[i].register_index == loc)
                        return &info->output[i];
        }

        unreachable("Varying not captured");
}

void
panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                                 unsigned vertex_count,
                                 struct mali_vertex_tiler_postfix *vertex_postfix,
                                 struct mali_vertex_tiler_postfix *tiler_postfix,
                                 union midgard_primitive_size *primitive_size)
{
        /* Load the shaders */
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *device = pan_device(ctx->base.screen);
        struct panfrost_shader_state *vs, *fs;
        unsigned int num_gen_varyings = 0;
        size_t vs_size, fs_size;

        /* Allocate the varying descriptor */

        vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
        fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;

        struct panfrost_transfer trans = panfrost_allocate_transient(batch,
                                                                     vs_size +
                                                                     fs_size);

        struct pipe_stream_output_info *so = &vs->stream_output;

        /* Check if this varying is linked by us. This is the case for
         * general-purpose, non-captured varyings. If it is, link it. If it's
         * not, use the provided stream out information to determine the
         * offset, since it was already linked for us. */

        for (unsigned i = 0; i < vs->varying_count; i++) {
                gl_varying_slot loc = vs->varyings_loc[i];

                bool special = is_special_varying(loc);
                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);

                if (captured) {
                        struct pipe_stream_output *o = pan_get_so(so, loc);

                        unsigned dst_offset = o->dst_offset * 4; /* dwords */
                        vs->varyings[i].src_offset = dst_offset;
                } else if (!special) {
                        vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
                }
        }

        /* Conversely, we need to set src_offset for the captured varyings.
         * Here, the layout is defined by the stream out info, not us */

        /* Link up with fragment varyings */
        bool reads_point_coord = fs->reads_point_coord;

        for (unsigned i = 0; i < fs->varying_count; i++) {
                gl_varying_slot loc = fs->varyings_loc[i];
                unsigned src_offset;
                signed vs_idx = -1;

                /* Link up */
                for (unsigned j = 0; j < vs->varying_count; ++j) {
                        if (vs->varyings_loc[j] == loc) {
                                vs_idx = j;
                                break;
                        }
                }

                /* Either assign or reuse */
                if (vs_idx >= 0)
                        src_offset = vs->varyings[vs_idx].src_offset;
                else
                        src_offset = 16 * (num_gen_varyings++);

                fs->varyings[i].src_offset = src_offset;

                if (has_point_coord(fs->point_sprite_mask, loc))
                        reads_point_coord = true;
        }

        memcpy(trans.cpu, vs->varyings, vs_size);
        memcpy(trans.cpu + vs_size, fs->varyings, fs_size);

        union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};

        /* Figure out how many streamout buffers could be bound */
        unsigned so_count = ctx->streamout.num_targets;
        for (unsigned i = 0; i < vs->varying_count; i++) {
                gl_varying_slot loc = vs->varyings_loc[i];

                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
                if (!captured) continue;

                struct pipe_stream_output *o = pan_get_so(so, loc);
                so_count = MAX2(so_count, o->output_buffer + 1);
        }

        signed idx = so_count;
        signed general = idx++;
        signed gl_Position = idx++;
        signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
        signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
        signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
        signed gl_FragCoord = (fs->reads_frag_coord &&
                               !(device->quirks & IS_BIFROST))
                              ? (idx++) : -1;

        /* Emit the stream out buffers */

        unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
                                                           ctx->vertex_count);

        for (unsigned i = 0; i < so_count; ++i) {
                if (i < ctx->streamout.num_targets) {
                        panfrost_emit_streamout(batch, &varyings[i],
                                                so->stride[i],
                                                ctx->streamout.offsets[i],
                                                out_count,
                                                ctx->streamout.targets[i]);
                } else {
                        /* Emit a dummy buffer */
                        panfrost_emit_varyings(batch, &varyings[i],
                                               so->stride[i] * 4,
                                               out_count);

                        /* Clear the attribute type */
                        varyings[i].elements &= ~0xF;
                }
        }

        panfrost_emit_varyings(batch, &varyings[general],
                               num_gen_varyings * 16,
                               vertex_count);

        mali_ptr varyings_p;

        /* fp32 vec4 gl_Position */
        varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
                                            sizeof(float) * 4, vertex_count);
        tiler_postfix->position_varying = varyings_p;

        if (panfrost_writes_point_size(ctx)) {
                varyings_p = panfrost_emit_varyings(batch,
                                                    &varyings[gl_PointSize],
                                                    2, vertex_count);
                primitive_size->pointer = varyings_p;
        }

        if (gl_PointCoord >= 0)
                varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;

        if (gl_FrontFacing >= 0)
                varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;

        if (gl_FragCoord >= 0)
                varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;

        assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));

        /* Let's go ahead and link varying meta to the buffer in question, now
         * that that information is available. VARYING_SLOT_POS is mapped to
         * gl_FragCoord for fragment shaders but gl_Position for vertex
         * shaders */

        panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
                                   gl_PointSize, gl_PointCoord,
                                   gl_FrontFacing);

        panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
                                   gl_FragCoord, gl_PointSize,
                                   gl_PointCoord, gl_FrontFacing);

        /* Replace streamout */

        struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
        struct mali_attr_meta *ofs = ovs + vs->varying_count;

        for (unsigned i = 0; i < vs->varying_count; i++) {
                gl_varying_slot loc = vs->varyings_loc[i];

                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
                if (!captured)
                        continue;

                struct pipe_stream_output *o = pan_get_so(so, loc);
                ovs[i].index = o->output_buffer;

                /* Set the type appropriately. TODO: Integer varyings XXX */
                assert(o->stream == 0);
                ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
                                | MALI_NR_CHANNELS(o->num_components);

                if (device->quirks & HAS_SWIZZLES)
                        ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
                else
                        ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);

                /* Link to the fragment */
                signed fs_idx = -1;

                /* Link up */
                for (unsigned j = 0; j < fs->varying_count; ++j) {
                        if (fs->varyings_loc[j] == loc) {
                                fs_idx = j;
                                break;
                        }
                }

                if (fs_idx >= 0) {
                        ofs[fs_idx].index = ovs[i].index;
                        ofs[fs_idx].format = ovs[i].format;
                        ofs[fs_idx].swizzle = ovs[i].swizzle;
                }
        }

        /* Replace point sprite */
        for (unsigned i = 0; i < fs->varying_count; i++) {
                /* If we have a point sprite replacement, handle that here. We
                 * have to translate location first. TODO: Flip y in shader.
                 * We're already keying ... just time crunch .. */

                if (has_point_coord(fs->point_sprite_mask,
                                    fs->varyings_loc[i])) {
                        ofs[i].index = gl_PointCoord;

                        /* Swizzle out the z/w to 0/1 */
                        ofs[i].format = MALI_RG16F;
                        ofs[i].swizzle = panfrost_get_default_swizzle(2);
                }
        }

        /* Fix up unaligned addresses */
        for (unsigned i = 0; i < so_count; ++i) {
                if (varyings[i].elements < MALI_RECORD_SPECIAL)
                        continue;

                unsigned align = (varyings[i].elements & 63);

                /* While we're at it, the SO buffers are linear */

                if (!align) {
                        varyings[i].elements |= MALI_ATTR_LINEAR;
                        continue;
                }

                /* We need to adjust alignment */
                varyings[i].elements &= ~63;
                varyings[i].elements |= MALI_ATTR_LINEAR;
                varyings[i].size += align;

                for (unsigned v = 0; v < vs->varying_count; ++v) {
                        if (ovs[v].index != i)
                                continue;

                        ovs[v].src_offset = vs->varyings[v].src_offset + align;
                }

                for (unsigned f = 0; f < fs->varying_count; ++f) {
                        if (ofs[f].index != i)
                                continue;

                        ofs[f].src_offset = fs->varyings[f].src_offset + align;
                }
        }

        varyings_p = panfrost_upload_transient(batch, varyings,
                                               idx * sizeof(*varyings));
        vertex_postfix->varyings = varyings_p;
        tiler_postfix->varyings = varyings_p;

        vertex_postfix->varying_meta = trans.gpu;
        tiler_postfix->varying_meta = trans.gpu + vs_size;
}

void
panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
                                struct mali_vertex_tiler_prefix *vertex_prefix,
                                struct mali_vertex_tiler_postfix *vertex_postfix,
                                struct mali_vertex_tiler_prefix *tiler_prefix,
                                struct mali_vertex_tiler_postfix *tiler_postfix,
                                union midgard_primitive_size *primitive_size)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *device = pan_device(ctx->base.screen);
        bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
        struct bifrost_payload_vertex bifrost_vertex = {0,};
        struct bifrost_payload_tiler bifrost_tiler = {0,};
        struct midgard_payload_vertex_tiler midgard_vertex = {0,};
        struct midgard_payload_vertex_tiler midgard_tiler = {0,};
        void *vp, *tp;
        size_t vp_size, tp_size;

        if (device->quirks & IS_BIFROST) {
                bifrost_vertex.prefix = *vertex_prefix;
                bifrost_vertex.postfix = *vertex_postfix;
                vp = &bifrost_vertex;
                vp_size = sizeof(bifrost_vertex);

                bifrost_tiler.prefix = *tiler_prefix;
                bifrost_tiler.tiler.primitive_size = *primitive_size;
                bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
                bifrost_tiler.postfix = *tiler_postfix;
                tp = &bifrost_tiler;
                tp_size = sizeof(bifrost_tiler);
        } else {
                midgard_vertex.prefix = *vertex_prefix;
                midgard_vertex.postfix = *vertex_postfix;
                vp = &midgard_vertex;
                vp_size = sizeof(midgard_vertex);

                midgard_tiler.prefix = *tiler_prefix;
                midgard_tiler.postfix = *tiler_postfix;
                midgard_tiler.primitive_size = *primitive_size;
                tp = &midgard_tiler;
                tp_size = sizeof(midgard_tiler);
        }

        if (wallpapering) {
                /* Inject in reverse order, with "predicted" job indices.
                 * THIS IS A HACK XXX */
                panfrost_new_job(batch, JOB_TYPE_TILER, false,
                                 batch->job_index + 2, tp, tp_size, true);
                panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
                                 vp, vp_size, true);
                return;
        }

        /* If rasterizer discard is enabled, only submit the vertex */

        bool rasterizer_discard = ctx->rasterizer &&
                                  ctx->rasterizer->base.rasterizer_discard;

        unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
                                           vp, vp_size, false);

        if (rasterizer_discard)
                return;

        panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
                         false);
}
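
/* Dependency sketch for the normal path above (illustrative): the vertex job
 * is emitted first and panfrost_new_job returns its job index; passing that
 * index when emitting the tiler job makes the tiler wait for vertex shading.
 * The wallpapering path emits in reverse order, so it has to guess the
 * dependency as batch->job_index + 2 -- the hack the comment calls out. */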

/* TODO: stop hardcoding this */
mali_ptr
panfrost_emit_sample_locations(struct panfrost_batch *batch)
{
        uint16_t locations[] = {
                /* The hardcoded table of 96 16-bit sample positions is not
                 * reproduced here. */
        };

        return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
}