panfrost: Clean up panfrost_frag_meta_rasterizer_update
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the bounds of the indices actually referenced by the draw */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
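 /* Illustrative example (hypothetical numbers): indices spanning
  * [100, 227] with index_bias 0 give *vertex_count = 128,
  * offset_start = 100 and offset_bias_correction = -100. */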
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
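 /* The (instance_shift, instance_odd) pair encodes the padded count as
  * padded_count == (2 * odd + 1) << shift, which is what the ctz/shift
  * below extract; e.g. 24 == 0b11000 decomposes as shift = 3, odd = 1. */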
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 return;
554 }
555
556 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
557
558 bool msaa = rast->multisample;
559
560 /* TODO: Sample size */
561 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
562 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
563 fragmeta->depth_units = rast->offset_units * 2.0f;
564 fragmeta->depth_factor = rast->offset_scale;
565
566 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
567
568 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
569 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
570 }
571
572 static void
573 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
574 struct mali_shader_meta *fragmeta)
575 {
576 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
577 int zfunc = PIPE_FUNC_ALWAYS;
578
579 if (!zsa) {
580 struct pipe_stencil_state default_stencil = {
581 .enabled = 0,
582 .func = PIPE_FUNC_ALWAYS,
583 .fail_op = MALI_STENCIL_KEEP,
584 .zfail_op = MALI_STENCIL_KEEP,
585 .zpass_op = MALI_STENCIL_KEEP,
586 .writemask = 0xFF,
587 .valuemask = 0xFF
588 };
589
590 panfrost_make_stencil_state(&default_stencil,
591 &fragmeta->stencil_front);
592 fragmeta->stencil_mask_front = default_stencil.writemask;
593 fragmeta->stencil_back = fragmeta->stencil_front;
594 fragmeta->stencil_mask_back = default_stencil.writemask;
595 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
596 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
597 } else {
598 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
599 zsa->stencil[0].enabled);
600 panfrost_make_stencil_state(&zsa->stencil[0],
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
603 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
604
605 /* If back-stencil is not enabled, use the front values */
606
607 if (zsa->stencil[1].enabled) {
608 panfrost_make_stencil_state(&zsa->stencil[1],
609 &fragmeta->stencil_back);
610 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
611 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
612 } else {
613 fragmeta->stencil_back = fragmeta->stencil_front;
614 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
615 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
616 }
617
618 if (zsa->depth.enabled)
619 zfunc = zsa->depth.func;
620
621 /* Depth state (TODO: Refactor) */
622
623 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
624 zsa->depth.writemask);
625 }
626
627 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
628 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
629 }
630
631 static bool
632 panfrost_fs_required(
633 struct panfrost_shader_state *fs,
634 struct panfrost_blend_final *blend,
635 unsigned rt_count)
636 {
637 /* If we generally have side effects */
638 if (fs->fs_sidefx)
639 return true;
640
641 /* If colour is written we need to execute */
642 for (unsigned i = 0; i < rt_count; ++i) {
643 if (!blend[i].no_colour)
644 return true;
645 }
646
647 /* If depth is written and not implied we need to execute.
648 * TODO: Predicate on Z/S writes being enabled */
649 return (fs->writes_depth || fs->writes_stencil);
650 }
651
652 static void
653 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
654 struct mali_shader_meta *fragmeta,
655 void *rts)
656 {
657 const struct panfrost_device *dev = pan_device(ctx->base.screen);
658 struct panfrost_shader_state *fs;
659 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
660
661 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
662 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
663 !ctx->blend->base.dither);
664
665 /* Get blending setup */
666 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
667
668 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
669 unsigned shader_offset = 0;
670 struct panfrost_bo *shader_bo = NULL;
671
672 for (unsigned c = 0; c < rt_count; ++c)
673 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
674 &shader_offset);
675
676 /* Disable shader execution if we can */
677 if (dev->quirks & MIDGARD_SHADERLESS
678 && !panfrost_fs_required(fs, blend, rt_count)) {
679 fragmeta->shader = 0;
680 fragmeta->attribute_count = 0;
681 fragmeta->varying_count = 0;
682 fragmeta->texture_count = 0;
683 fragmeta->sampler_count = 0;
684
685 /* This feature is not known to work on Bifrost */
686 fragmeta->midgard1.work_count = 1;
687 fragmeta->midgard1.uniform_count = 0;
688 fragmeta->midgard1.uniform_buffer_count = 0;
689 }
690
691 /* If there is a blend shader, work registers are shared. We impose 8
692 * work registers as a limit for blend shaders. Should be lower XXX */
693
694 if (!(dev->quirks & IS_BIFROST)) {
695 for (unsigned c = 0; c < rt_count; ++c) {
696 if (blend[c].is_shader) {
697 fragmeta->midgard1.work_count =
698 MAX2(fragmeta->midgard1.work_count, 8);
699 }
700 }
701 }
702
703 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
704 * copied to the blend_meta appended (by convention), but this is the
705 * field actually read by the hardware. (Or maybe both are read...?).
706 * Specify the last RTi with a blend shader. */
707
708 fragmeta->blend.shader = 0;
709
710 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
711 if (!blend[rt].is_shader)
712 continue;
713
714 fragmeta->blend.shader = blend[rt].shader.gpu |
715 blend[rt].shader.first_tag;
716 break;
717 }
718
719 if (dev->quirks & MIDGARD_SFBD) {
720 /* On platforms with only a single render target, the blend
721 * information lives inside the shader meta itself. We additionally
722 * need to signal CAN_DISCARD for nontrivial blend modes (so
723 * we're able to read back the destination buffer) */
724
725 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
726 blend[0].is_shader);
727
728 if (!blend[0].is_shader) {
729 fragmeta->blend.equation = *blend[0].equation.equation;
730 fragmeta->blend.constant = blend[0].equation.constant;
731 }
732
733 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
734 !blend[0].no_blending || fs->can_discard);
735 return;
736 }
737
738 if (dev->quirks & IS_BIFROST) {
739 bool no_blend = true;
740
741 for (unsigned i = 0; i < rt_count; ++i)
742 no_blend &= (blend[i].no_blending | blend[i].no_colour);
743
744 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
745 !fs->can_discard && !fs->writes_depth && no_blend);
746 }
747
748 /* Additional blend descriptor tacked on for jobs using MFBD */
749
750 for (unsigned i = 0; i < rt_count; ++i) {
751 unsigned flags = 0;
752
753 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
754 flags = 0x200;
755
756 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
757 (ctx->pipe_framebuffer.cbufs[i]) &&
758 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
759
760 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
761 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
762 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
763 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
764 }
765
766 if (dev->quirks & IS_BIFROST) {
767 struct bifrost_blend_rt *brts = rts;
768
769 brts[i].flags = flags;
770
771 if (blend[i].is_shader) {
772 /* The blend shader's address needs to be at
773 * the same top 32 bits as the fragment shader.
774 * TODO: Ensure that's always the case.
775 */
776 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
777 (fs->bo->gpu & (0xffffffffull << 32)));
778 brts[i].shader = blend[i].shader.gpu;
779 brts[i].unk2 = 0x0;
780 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
781 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
782 const struct util_format_description *format_desc;
783 format_desc = util_format_description(format);
784
785 brts[i].equation = *blend[i].equation.equation;
786
787 /* TODO: this is a bit more complicated */
788 brts[i].constant = blend[i].equation.constant;
789
790 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
791
792 /* 0x19 disables blending and forces REPLACE
793 * mode (equivalent to rgb_mode = alpha_mode =
794 * x122, colour mask = 0xF). 0x1a allows
795 * blending. */
796 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
797
798 brts[i].shader_type = fs->blend_types[i];
799 } else {
800 /* Dummy attachment for depth-only */
801 brts[i].unk2 = 0x3;
802 brts[i].shader_type = fs->blend_types[i];
803 }
804 } else {
805 struct midgard_blend_rt *mrts = rts;
806 mrts[i].flags = flags;
807
808 if (blend[i].is_shader) {
809 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
810 } else {
811 mrts[i].blend.equation = *blend[i].equation.equation;
812 mrts[i].blend.constant = blend[i].equation.constant;
813 }
814 }
815 }
816 }
817
818 static void
819 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
820 struct mali_shader_meta *fragmeta,
821 void *rts)
822 {
823 const struct panfrost_device *dev = pan_device(ctx->base.screen);
824 struct panfrost_shader_state *fs;
825
826 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
827
828 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
829 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
830 fragmeta->unknown2_4 = 0x4e0;
831
832 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
833 * is required (independent of 32-bit/64-bit descriptors), or why it's
834 * not used on later GPU revisions. Otherwise, all shader jobs fault on
835 * these earlier chips (perhaps this is a chicken bit of some kind).
836 * More investigation is needed. */
837
838 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
839
840 if (dev->quirks & IS_BIFROST) {
841 /* TODO */
842 } else {
843 /* Depending on whether it's legal to do so in the given shader, we try to
844 * enable early-z testing. TODO: respect e-z force */
845
846 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
847 !fs->can_discard && !fs->writes_global &&
848 !fs->writes_depth && !fs->writes_stencil);
849
850 /* Add the writes Z/S flags if needed. */
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
852 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
853
854 /* Any time texturing is used, derivatives are implicitly calculated,
855 * so we need to enable helper invocations */
856
857 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
858 fs->helper_invocations);
859
860 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
861
862 bool depth_enabled = fs->writes_depth ||
863 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
864
865 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
866 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
867 }
868
869 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
870 panfrost_frag_meta_zsa_update(ctx, fragmeta);
871 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
872 }
873
874 void
875 panfrost_emit_shader_meta(struct panfrost_batch *batch,
876 enum pipe_shader_type st,
877 struct mali_vertex_tiler_postfix *postfix)
878 {
879 struct panfrost_context *ctx = batch->ctx;
880 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
881
882 if (!ss) {
883 postfix->shader = 0;
884 return;
885 }
886
887 struct mali_shader_meta meta;
888
889 panfrost_shader_meta_init(ctx, st, &meta);
890
891 /* Add the shader BO to the batch. */
892 panfrost_batch_add_bo(batch, ss->bo,
893 PAN_BO_ACCESS_PRIVATE |
894 PAN_BO_ACCESS_READ |
895 panfrost_bo_access_for_stage(st));
896
897 mali_ptr shader_ptr;
898
899 if (st == PIPE_SHADER_FRAGMENT) {
900 struct panfrost_device *dev = pan_device(ctx->base.screen);
901 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
902 size_t desc_size = sizeof(meta);
903 void *rts = NULL;
904 struct panfrost_transfer xfer;
905 unsigned rt_size;
906
907 if (dev->quirks & MIDGARD_SFBD)
908 rt_size = 0;
909 else if (dev->quirks & IS_BIFROST)
910 rt_size = sizeof(struct bifrost_blend_rt);
911 else
912 rt_size = sizeof(struct midgard_blend_rt);
913
914 desc_size += rt_size * rt_count;
915
916 if (rt_size)
917 rts = rzalloc_size(ctx, rt_size * rt_count);
918
919 panfrost_frag_shader_meta_init(ctx, &meta, rts);
920
921 xfer = panfrost_allocate_transient(batch, desc_size);
922
923 memcpy(xfer.cpu, &meta, sizeof(meta));
924 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
925
926 if (rt_size)
927 ralloc_free(rts);
928
929 shader_ptr = xfer.gpu;
930 } else {
931 shader_ptr = panfrost_upload_transient(batch, &meta,
932 sizeof(meta));
933 }
934
935 postfix->shader = shader_ptr;
936 }
937
938 static void
939 panfrost_mali_viewport_init(struct panfrost_context *ctx,
940 struct mali_viewport *mvp)
941 {
942 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
943
944 /* Clip bounds are encoded as floats. The viewport itself is encoded as
945 * (somewhat) asymmetric ints. */
946
947 const struct pipe_scissor_state *ss = &ctx->scissor;
948
949 memset(mvp, 0, sizeof(*mvp));
950
951 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
952 * each direction. Clipping to the viewport in theory should work, but
953 * in practice causes issues when we're not explicitly trying to
954 * scissor */
955
956 *mvp = (struct mali_viewport) {
957 .clip_minx = -INFINITY,
958 .clip_miny = -INFINITY,
959 .clip_maxx = INFINITY,
960 .clip_maxy = INFINITY,
961 };
962
963 /* Always scissor to the viewport by default. */
964 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
965 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
966
967 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
968 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
969
970 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
971 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
972
973 /* Apply the scissor test */
974
975 unsigned minx, miny, maxx, maxy;
976
977 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
978 minx = MAX2(ss->minx, vp_minx);
979 miny = MAX2(ss->miny, vp_miny);
980 maxx = MIN2(ss->maxx, vp_maxx);
981 maxy = MIN2(ss->maxy, vp_maxy);
982 } else {
983 minx = vp_minx;
984 miny = vp_miny;
985 maxx = vp_maxx;
986 maxy = vp_maxy;
987 }
988
989 /* Hardware needs the min/max to be strictly ordered, so flip if we
990 * need to. The viewport transformation in the vertex shader will
991 * handle the negatives if we don't */
992
993 if (miny > maxy) {
994 unsigned temp = miny;
995 miny = maxy;
996 maxy = temp;
997 }
998
999 if (minx > maxx) {
1000 unsigned temp = minx;
1001 minx = maxx;
1002 maxx = temp;
1003 }
1004
1005 if (minz > maxz) {
1006 float temp = minz;
1007 minz = maxz;
1008 maxz = temp;
1009 }
1010
1011 /* Clamp to the framebuffer size as a last check */
1012
1013 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1014 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1015
1016 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1017 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1018
1019 /* Upload */
1020
1021 mvp->viewport0[0] = minx;
1022 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1023
1024 mvp->viewport0[1] = miny;
1025 mvp->viewport1[1] = MALI_POSITIVE(maxy);
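 /* MALI_POSITIVE(x) stores x - 1, so viewport1 carries the maximum
  * bound minus one -- the "(somewhat) asymmetric ints" noted at the top
  * of this function. */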
1026
1027 mvp->clip_minz = minz;
1028 mvp->clip_maxz = maxz;
1029 }
1030
1031 void
1032 panfrost_emit_viewport(struct panfrost_batch *batch,
1033 struct mali_vertex_tiler_postfix *tiler_postfix)
1034 {
1035 struct panfrost_context *ctx = batch->ctx;
1036 struct mali_viewport mvp;
1037
1038 panfrost_mali_viewport_init(batch->ctx, &mvp);
1039
1040 /* Update the job, unless we're doing wallpapering (whose lack of
1041 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1042 * just... be faster :) */
1043
1044 if (!ctx->wallpaper_batch)
1045 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1046 mvp.viewport0[1],
1047 mvp.viewport1[0] + 1,
1048 mvp.viewport1[1] + 1);
1049
1050 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1051 sizeof(mvp));
1052 }
1053
1054 static mali_ptr
1055 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1056 enum pipe_shader_type st,
1057 struct panfrost_constant_buffer *buf,
1058 unsigned index)
1059 {
1060 struct pipe_constant_buffer *cb = &buf->cb[index];
1061 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1062
1063 if (rsrc) {
1064 panfrost_batch_add_bo(batch, rsrc->bo,
1065 PAN_BO_ACCESS_SHARED |
1066 PAN_BO_ACCESS_READ |
1067 panfrost_bo_access_for_stage(st));
1068
1069 /* Alignment guaranteed by
1070 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1071 return rsrc->bo->gpu + cb->buffer_offset;
1072 } else if (cb->user_buffer) {
1073 return panfrost_upload_transient(batch,
1074 cb->user_buffer +
1075 cb->buffer_offset,
1076 cb->buffer_size);
1077 } else {
1078 unreachable("No constant buffer");
1079 }
1080 }
1081
1082 struct sysval_uniform {
1083 union {
1084 float f[4];
1085 int32_t i[4];
1086 uint32_t u[4];
1087 uint64_t du[2];
1088 };
1089 };
1090
1091 static void
1092 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1093 struct sysval_uniform *uniform)
1094 {
1095 struct panfrost_context *ctx = batch->ctx;
1096 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1097
1098 uniform->f[0] = vp->scale[0];
1099 uniform->f[1] = vp->scale[1];
1100 uniform->f[2] = vp->scale[2];
1101 }
1102
1103 static void
1104 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1105 struct sysval_uniform *uniform)
1106 {
1107 struct panfrost_context *ctx = batch->ctx;
1108 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1109
1110 uniform->f[0] = vp->translate[0];
1111 uniform->f[1] = vp->translate[1];
1112 uniform->f[2] = vp->translate[2];
1113 }
1114
1115 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1116 enum pipe_shader_type st,
1117 unsigned int sysvalid,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1122 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1123 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1124 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1125
1126 assert(dim);
1127 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1128
1129 if (dim > 1)
1130 uniform->i[1] = u_minify(tex->texture->height0,
1131 tex->u.tex.first_level);
1132
1133 if (dim > 2)
1134 uniform->i[2] = u_minify(tex->texture->depth0,
1135 tex->u.tex.first_level);
1136
1137 if (is_array)
1138 uniform->i[dim] = tex->texture->array_size;
1139 }
1140
1141 static void
1142 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1143 enum pipe_shader_type st,
1144 unsigned ssbo_id,
1145 struct sysval_uniform *uniform)
1146 {
1147 struct panfrost_context *ctx = batch->ctx;
1148
1149 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1150 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1151
1152 /* Compute address */
1153 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1154
1155 panfrost_batch_add_bo(batch, bo,
1156 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1157 panfrost_bo_access_for_stage(st));
1158
1159 /* Upload address and size as sysval */
1160 uniform->du[0] = bo->gpu + sb.buffer_offset;
1161 uniform->u[2] = sb.buffer_size;
1162 }
1163
1164 static void
1165 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1166 enum pipe_shader_type st,
1167 unsigned samp_idx,
1168 struct sysval_uniform *uniform)
1169 {
1170 struct panfrost_context *ctx = batch->ctx;
1171 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1172
1173 uniform->f[0] = sampl->min_lod;
1174 uniform->f[1] = sampl->max_lod;
1175 uniform->f[2] = sampl->lod_bias;
1176
1177 /* Even without any errata, Midgard represents "no mipmapping" as
1178 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1179 * panfrost_create_sampler_state which also explains our choice of
1180 * epsilon value (again to keep behaviour consistent) */
1181
1182 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1183 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1184 }
1185
1186 static void
1187 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1188 struct sysval_uniform *uniform)
1189 {
1190 struct panfrost_context *ctx = batch->ctx;
1191
1192 uniform->u[0] = ctx->compute_grid->grid[0];
1193 uniform->u[1] = ctx->compute_grid->grid[1];
1194 uniform->u[2] = ctx->compute_grid->grid[2];
1195 }
1196
1197 static void
1198 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1199 struct panfrost_shader_state *ss,
1200 enum pipe_shader_type st)
1201 {
1202 struct sysval_uniform *uniforms = (void *)buf;
1203
1204 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1205 int sysval = ss->sysval[i];
1206
1207 switch (PAN_SYSVAL_TYPE(sysval)) {
1208 case PAN_SYSVAL_VIEWPORT_SCALE:
1209 panfrost_upload_viewport_scale_sysval(batch,
1210 &uniforms[i]);
1211 break;
1212 case PAN_SYSVAL_VIEWPORT_OFFSET:
1213 panfrost_upload_viewport_offset_sysval(batch,
1214 &uniforms[i]);
1215 break;
1216 case PAN_SYSVAL_TEXTURE_SIZE:
1217 panfrost_upload_txs_sysval(batch, st,
1218 PAN_SYSVAL_ID(sysval),
1219 &uniforms[i]);
1220 break;
1221 case PAN_SYSVAL_SSBO:
1222 panfrost_upload_ssbo_sysval(batch, st,
1223 PAN_SYSVAL_ID(sysval),
1224 &uniforms[i]);
1225 break;
1226 case PAN_SYSVAL_NUM_WORK_GROUPS:
1227 panfrost_upload_num_work_groups_sysval(batch,
1228 &uniforms[i]);
1229 break;
1230 case PAN_SYSVAL_SAMPLER:
1231 panfrost_upload_sampler_sysval(batch, st,
1232 PAN_SYSVAL_ID(sysval),
1233 &uniforms[i]);
1234 break;
1235 default:
1236 assert(0);
1237 }
1238 }
1239 }
1240
1241 static const void *
1242 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1243 unsigned index)
1244 {
1245 struct pipe_constant_buffer *cb = &buf->cb[index];
1246 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1247
1248 if (rsrc)
1249 return rsrc->bo->cpu;
1250 else if (cb->user_buffer)
1251 return cb->user_buffer;
1252 else
1253 unreachable("No constant buffer");
1254 }
1255
1256 void
1257 panfrost_emit_const_buf(struct panfrost_batch *batch,
1258 enum pipe_shader_type stage,
1259 struct mali_vertex_tiler_postfix *postfix)
1260 {
1261 struct panfrost_context *ctx = batch->ctx;
1262 struct panfrost_shader_variants *all = ctx->shader[stage];
1263
1264 if (!all)
1265 return;
1266
1267 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1268
1269 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1270
1271 /* Uniforms are implicitly UBO #0 */
1272 bool has_uniforms = buf->enabled_mask & (1 << 0);
1273
1274 /* Allocate room for the sysval and the uniforms */
1275 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1276 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1277 size_t size = sys_size + uniform_size;
1278 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1279 size);
1280
1281 /* Upload sysvals requested by the shader */
1282 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1283
1284 /* Upload uniforms */
1285 if (has_uniforms && uniform_size) {
1286 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1287 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1288 }
1289
1290 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1291 * uploaded */
1292
1293 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1294 assert(ubo_count >= 1);
1295
1296 size_t sz = sizeof(uint64_t) * ubo_count;
1297 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1298 int uniform_count = ss->uniform_count;
1299
1300 /* Upload uniforms as a UBO */
1301 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1302
1303 /* The rest are honest-to-goodness UBOs */
1304
1305 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1306 size_t usz = buf->cb[ubo].buffer_size;
1307 bool enabled = buf->enabled_mask & (1 << ubo);
1308 bool empty = usz == 0;
1309
1310 if (!enabled || empty) {
1311 /* Stub out disabled UBOs to catch accesses */
1312 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1313 continue;
1314 }
1315
1316 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1317 buf, ubo);
1318
1319 unsigned bytes_per_field = 16;
1320 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1321 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1322 }
1323
1324 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1325 postfix->uniforms = transfer.gpu;
1326 postfix->uniform_buffers = ubufs;
1327
1328 buf->dirty_mask = 0;
1329 }
1330
1331 void
1332 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1333 const struct pipe_grid_info *info,
1334 struct midgard_payload_vertex_tiler *vtp)
1335 {
1336 struct panfrost_context *ctx = batch->ctx;
1337 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1338 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1339 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1340 128));
1341 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1342 info->grid[2] * 4;
1343 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1344 shared_size,
1345 1);
1346
1347 struct mali_shared_memory shared = {
1348 .shared_memory = bo->gpu,
1349 .shared_workgroup_count =
1350 util_logbase2_ceil(info->grid[0]) +
1351 util_logbase2_ceil(info->grid[1]) +
1352 util_logbase2_ceil(info->grid[2]),
1353 .shared_unk1 = 0x2,
1354 .shared_shift = util_logbase2(single_size) - 1
1355 };
1356
1357 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1358 sizeof(shared));
1359 }
1360
1361 static mali_ptr
1362 panfrost_get_tex_desc(struct panfrost_batch *batch,
1363 enum pipe_shader_type st,
1364 struct panfrost_sampler_view *view)
1365 {
1366 if (!view)
1367 return (mali_ptr) 0;
1368
1369 struct pipe_sampler_view *pview = &view->base;
1370 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1371
1372 /* Add the BO to the job so it's retained until the job is done. */
1373
1374 panfrost_batch_add_bo(batch, rsrc->bo,
1375 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1376 panfrost_bo_access_for_stage(st));
1377
1378 panfrost_batch_add_bo(batch, view->bo,
1379 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1380 panfrost_bo_access_for_stage(st));
1381
1382 return view->bo->gpu;
1383 }
1384
1385 static void
1386 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1387 struct pipe_context *pctx)
1388 {
1389 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1390 if (view->layout != rsrc->layout) {
1391 panfrost_bo_unreference(view->bo);
1392 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1393 }
1394 }
1395
1396 void
1397 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1398 enum pipe_shader_type stage,
1399 struct mali_vertex_tiler_postfix *postfix)
1400 {
1401 struct panfrost_context *ctx = batch->ctx;
1402 struct panfrost_device *device = pan_device(ctx->base.screen);
1403
1404 if (!ctx->sampler_view_count[stage])
1405 return;
1406
1407 if (device->quirks & IS_BIFROST) {
1408 struct bifrost_texture_descriptor *descriptors;
1409
1410 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1411 ctx->sampler_view_count[stage]);
1412
1413 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1414 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1415 struct pipe_sampler_view *pview = &view->base;
1416 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1417 panfrost_update_sampler_view(view, &ctx->base);
1418
1419 /* Add the BOs to the job so they are retained until the job is done. */
1420
1421 panfrost_batch_add_bo(batch, rsrc->bo,
1422 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1423 panfrost_bo_access_for_stage(stage));
1424
1425 panfrost_batch_add_bo(batch, view->bo,
1426 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1427 panfrost_bo_access_for_stage(stage));
1428
1429 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1430 }
1431
1432 postfix->textures = panfrost_upload_transient(batch,
1433 descriptors,
1434 sizeof(struct bifrost_texture_descriptor) *
1435 ctx->sampler_view_count[stage]);
1436
1437 free(descriptors);
1438 } else {
1439 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1440
1441 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1442 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1443
1444 panfrost_update_sampler_view(view, &ctx->base);
1445
1446 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1447 }
1448
1449 postfix->textures = panfrost_upload_transient(batch,
1450 trampolines,
1451 sizeof(uint64_t) *
1452 ctx->sampler_view_count[stage]);
1453 }
1454 }
1455
1456 void
1457 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1458 enum pipe_shader_type stage,
1459 struct mali_vertex_tiler_postfix *postfix)
1460 {
1461 struct panfrost_context *ctx = batch->ctx;
1462 struct panfrost_device *device = pan_device(ctx->base.screen);
1463
1464 if (!ctx->sampler_count[stage])
1465 return;
1466
1467 if (device->quirks & IS_BIFROST) {
1468 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1469 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1470 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1471 transfer_size);
1472 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1473
1474 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1475 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1476
1477 postfix->sampler_descriptor = transfer.gpu;
1478 } else {
1479 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1480 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1481 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1482 transfer_size);
1483 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1484
1485 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1486 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1487
1488 postfix->sampler_descriptor = transfer.gpu;
1489 }
1490 }
1491
1492 void
1493 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1494 struct mali_vertex_tiler_postfix *vertex_postfix)
1495 {
1496 struct panfrost_context *ctx = batch->ctx;
1497
1498 if (!ctx->vertex)
1499 return;
1500
1501 struct panfrost_vertex_state *so = ctx->vertex;
1502
1503 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1504 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1505 sizeof(*so->hw) *
1506 PAN_MAX_ATTRIBUTE);
1507 }
1508
1509 void
1510 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1511 struct mali_vertex_tiler_postfix *vertex_postfix)
1512 {
1513 struct panfrost_context *ctx = batch->ctx;
1514 struct panfrost_vertex_state *so = ctx->vertex;
1515
1516 /* Staged mali_attr, and index into them. i =/= k, depending on the
1517 * vertex buffer mask and instancing. Twice as much room is allocated,
1518 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1519 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1520 unsigned k = 0;
1521
1522 for (unsigned i = 0; i < so->num_elements; ++i) {
1523 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1524 * means duplicating some vertex buffers (who cares? aside from
1525 * maybe some caching implications but I somehow doubt that
1526 * matters) */
1527
1528 struct pipe_vertex_element *elem = &so->pipe[i];
1529 unsigned vbi = elem->vertex_buffer_index;
1530
1531 /* The exception to 1:1 mapping is that we can have multiple
1532 * entries (NPOT divisors), so we fix up anyway */
1533
1534 so->hw[i].index = k;
1535
1536 if (!(ctx->vb_mask & (1 << vbi)))
1537 continue;
1538
1539 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1540 struct panfrost_resource *rsrc;
1541
1542 rsrc = pan_resource(buf->buffer.resource);
1543 if (!rsrc)
1544 continue;
1545
1546 /* Align to 64 bytes by masking off the lower bits. This
1547 * will be adjusted back when we fixup the src_offset in
1548 * mali_attr_meta */
1549
1550 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1551 mali_ptr addr = raw_addr & ~63;
1552 unsigned chopped_addr = raw_addr - addr;
1553
1554 /* Add a dependency of the batch on the vertex buffer */
1555 panfrost_batch_add_bo(batch, rsrc->bo,
1556 PAN_BO_ACCESS_SHARED |
1557 PAN_BO_ACCESS_READ |
1558 PAN_BO_ACCESS_VERTEX_TILER);
1559
1560 /* Set common fields */
1561 attrs[k].elements = addr;
1562 attrs[k].stride = buf->stride;
1563
1564 /* Since we advanced the base pointer, we shrink the buffer
1565 * size */
1566 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1567
1568 /* We need to add the extra size we masked off (for
1569 * correctness) so the data doesn't get clamped away */
1570 attrs[k].size += chopped_addr;
1571
1572 /* For non-instancing make sure we initialize */
1573 attrs[k].shift = attrs[k].extra_flags = 0;
1574
1575 /* Instancing uses a dramatically different code path than
1576 * linear, so dispatch for the actual emission now that the
1577 * common code is finished */
1578
1579 unsigned divisor = elem->instance_divisor;
1580
1581 if (divisor && ctx->instance_count == 1) {
1582 /* Silly corner case where there's a divisor(=1) but
1583 * there's no legitimate instancing. So we want *every*
1584 * attribute to be the same. So set stride to zero so
1585 * we don't go anywhere. */
1586
1587 attrs[k].size = attrs[k].stride + chopped_addr;
1588 attrs[k].stride = 0;
1589 attrs[k++].elements |= MALI_ATTR_LINEAR;
1590 } else if (ctx->instance_count <= 1) {
1591 /* Normal, non-instanced attributes */
1592 attrs[k++].elements |= MALI_ATTR_LINEAR;
1593 } else {
1594 unsigned instance_shift = vertex_postfix->instance_shift;
1595 unsigned instance_odd = vertex_postfix->instance_odd;
1596
1597 k += panfrost_vertex_instanced(ctx->padded_count,
1598 instance_shift,
1599 instance_odd,
1600 divisor, &attrs[k]);
1601 }
1602 }
1603
1604 /* Add special gl_VertexID/gl_InstanceID buffers */
1605
1606 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1607 so->hw[PAN_VERTEX_ID].index = k++;
1608 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1609 so->hw[PAN_INSTANCE_ID].index = k++;
1610
1611 /* Upload whatever we emitted and go */
1612
1613 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1614 k * sizeof(*attrs));
1615 }
1616
1617 static mali_ptr
1618 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1619 unsigned stride, unsigned count)
1620 {
1621 /* Fill out the descriptor */
1622 slot->stride = stride;
1623 slot->size = stride * count;
1624 slot->shift = slot->extra_flags = 0;
1625
1626 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1627 slot->size);
1628
1629 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1630
1631 return transfer.gpu;
1632 }
1633
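 /* Returns the sub-64-byte remainder of a streamout target address.
  * panfrost_emit_streamout() below aligns the base pointer down to 64
  * bytes, so (presumably mirroring the vertex buffer path above) this
  * remainder is added back via the record's src_offset. */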
1634 static unsigned
1635 panfrost_streamout_offset(unsigned stride, unsigned offset,
1636 struct pipe_stream_output_target *target)
1637 {
1638 return (target->buffer_offset + (offset * stride * 4)) & 63;
1639 }
1640
1641 static void
1642 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1643 unsigned stride, unsigned offset, unsigned count,
1644 struct pipe_stream_output_target *target)
1645 {
1646 /* Fill out the descriptor */
1647 slot->stride = stride * 4;
1648 slot->shift = slot->extra_flags = 0;
1649
1650 unsigned max_size = target->buffer_size;
1651 unsigned expected_size = slot->stride * count;
1652
1653 /* Grab the BO and bind it to the batch */
1654 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1655
1656 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1657 * the perspective of the TILER and FRAGMENT.
1658 */
1659 panfrost_batch_add_bo(batch, bo,
1660 PAN_BO_ACCESS_SHARED |
1661 PAN_BO_ACCESS_RW |
1662 PAN_BO_ACCESS_VERTEX_TILER |
1663 PAN_BO_ACCESS_FRAGMENT);
1664
1665 /* We will have an offset applied to get alignment */
1666 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1667 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1668 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1669 }
1670
1671 static bool
1672 has_point_coord(unsigned mask, gl_varying_slot loc)
1673 {
1674 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1675 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1676 else if (loc == VARYING_SLOT_PNTC)
1677 return (mask & (1 << 8));
1678 else
1679 return false;
1680 }
1681
1682 /* Helpers for manipulating stream out information so we can pack varyings
1683 * accordingly. Compute the src_offset for a given captured varying */
1684
1685 static struct pipe_stream_output *
1686 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1687 {
1688 for (unsigned i = 0; i < info->num_outputs; ++i) {
1689 if (info->output[i].register_index == loc)
1690 return &info->output[i];
1691 }
1692
1693 unreachable("Varying not captured");
1694 }
1695
1696 static unsigned
1697 pan_varying_size(enum mali_format fmt)
1698 {
1699 unsigned type = MALI_EXTRACT_TYPE(fmt);
1700 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1701 unsigned bits = MALI_EXTRACT_BITS(fmt);
1702 unsigned bpc = 0;
1703
1704 if (bits == MALI_CHANNEL_FLOAT) {
1705 /* No doubles */
1706 bool fp16 = (type == MALI_FORMAT_SINT);
1707 assert(fp16 || (type == MALI_FORMAT_UNORM));
1708
1709 bpc = fp16 ? 2 : 4;
1710 } else {
1711 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1712
1713 /* See the enums */
1714 bits = 1 << bits;
1715 assert(bits >= 8);
1716 bpc = bits / 8;
1717 }
1718
1719 return bpc * chan;
1720 }
1721
1722 /* Indices for named (non-XFB) varyings that are present. These are packed
1723 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1724 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1725 * of a given special field given a shift S by:
1726 *
1727 * idx = popcount(P & ((1 << S) - 1))
1728 *
1729 * That is, look at all of the varyings that come earlier and count them; the
1730 * count is the new index. Likewise, the total number of special
1731 * buffers required is simply popcount(P)
1732 */
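 /* For example, if general, position and point size are all present,
  * P = 0b0111; the buffer index of PAN_VARY_PSIZ (shift 2) is
  * popcount(P & 0b0011) = 2, and popcount(P) = 3 buffers precede any
  * XFB targets. */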
1733
1734 enum pan_special_varying {
1735 PAN_VARY_GENERAL = 0,
1736 PAN_VARY_POSITION = 1,
1737 PAN_VARY_PSIZ = 2,
1738 PAN_VARY_PNTCOORD = 3,
1739 PAN_VARY_FACE = 4,
1740 PAN_VARY_FRAGCOORD = 5,
1741
1742 /* Keep last */
1743 PAN_VARY_MAX,
1744 };
1745
1746 /* Given a varying, figure out which index it corresponds to */
1747
1748 static inline unsigned
1749 pan_varying_index(unsigned present, enum pan_special_varying v)
1750 {
1751 unsigned mask = (1 << v) - 1;
1752 return util_bitcount(present & mask);
1753 }
1754
1755 /* Get the base offset for XFB buffers, which by convention come after
1756 * everything else. Wrapper function for semantic reasons; by construction this
1757 * is just popcount. */
1758
1759 static inline unsigned
1760 pan_xfb_base(unsigned present)
1761 {
1762 return util_bitcount(present);
1763 }
1764
1765 /* Computes the present mask for varyings so we can start emitting varying records */
1766
1767 static inline unsigned
1768 pan_varying_present(
1769 struct panfrost_shader_state *vs,
1770 struct panfrost_shader_state *fs,
1771 unsigned quirks)
1772 {
1773 /* At the moment we always emit general and position buffers. Not
1774 * strictly necessary but usually harmless */
1775
1776 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1777
1778 /* Enable special buffers by the shader info */
1779
1780 if (vs->writes_point_size)
1781 present |= (1 << PAN_VARY_PSIZ);
1782
1783 if (fs->reads_point_coord)
1784 present |= (1 << PAN_VARY_PNTCOORD);
1785
1786 if (fs->reads_face)
1787 present |= (1 << PAN_VARY_FACE);
1788
1789 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1790 present |= (1 << PAN_VARY_FRAGCOORD);
1791
1792 /* Also, if we have a point sprite, we need a point coord buffer */
1793
1794 for (unsigned i = 0; i < fs->varying_count; i++) {
1795 gl_varying_slot loc = fs->varyings_loc[i];
1796
1797 if (has_point_coord(fs->point_sprite_mask, loc))
1798 present |= (1 << PAN_VARY_PNTCOORD);
1799 }
1800
1801 return present;
1802 }
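
/* For instance (illustrative): a vertex shader that writes gl_PointSize paired
 * with a fragment shader that reads gl_PointCoord yields a present mask
 * covering PAN_VARY_GENERAL, PAN_VARY_POSITION, PAN_VARY_PSIZ and
 * PAN_VARY_PNTCOORD, so pan_xfb_base(present) = 4 and any XFB buffers start at
 * index 4. */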
1803
1804 /* Emitters for varying records */
1805
1806 static struct mali_attr_meta
1807 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1808 unsigned quirks, enum mali_format format,
1809 unsigned offset)
1810 {
1811 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1812
1813 struct mali_attr_meta meta = {
1814 .index = pan_varying_index(present, buf),
1815 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1816 .swizzle = quirks & HAS_SWIZZLES ?
1817 panfrost_get_default_swizzle(nr_channels) :
1818 panfrost_bifrost_swizzle(nr_channels),
1819 .format = format,
1820 .src_offset = offset
1821 };
1822
1823 return meta;
1824 }
1825
1826 /* General varying that is unused */
1827
1828 static struct mali_attr_meta
1829 pan_emit_vary_only(unsigned present, unsigned quirks)
1830 {
1831 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1832 }
1833
1834 /* Special records */
1835
1836 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1837 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1838 [PAN_VARY_PSIZ] = MALI_R16F,
1839 [PAN_VARY_PNTCOORD] = MALI_R16F,
1840 [PAN_VARY_FACE] = MALI_R32I,
1841 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1842 };
1843
1844 static struct mali_attr_meta
1845 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1846 unsigned quirks)
1847 {
1848 assert(buf < PAN_VARY_MAX);
1849 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1850 }
1851
1852 static enum mali_format
1853 pan_xfb_format(enum mali_format format, unsigned nr)
1854 {
1855 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1856 return MALI_R32F | MALI_NR_CHANNELS(nr);
1857 else
1858 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1859 }
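
/* For example (illustrative): capturing three components of a float varying
 * yields MALI_R32F | MALI_NR_CHANNELS(3); integer varyings keep their base
 * type but are likewise widened to 32 bits per channel. */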
1860
1861 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1862 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1863 * value. */
1864
1865 static struct mali_attr_meta
1866 pan_emit_vary_xfb(unsigned present,
1867 unsigned max_xfb,
1868 unsigned *streamout_offsets,
1869 unsigned quirks,
1870 enum mali_format format,
1871 struct pipe_stream_output o)
1872 {
1873         /* Construct a record for the captured varying */
1874 struct mali_attr_meta meta = {
1875 /* XFB buffers come after everything else */
1876 .index = pan_xfb_base(present) + o.output_buffer,
1877
1878                 /* The usual unknown bit */
1879 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1880
1881 /* Override swizzle with number of channels */
1882 .swizzle = quirks & HAS_SWIZZLES ?
1883 panfrost_get_default_swizzle(o.num_components) :
1884 panfrost_bifrost_swizzle(o.num_components),
1885
1886 /* Override number of channels and precision to highp */
1887 .format = pan_xfb_format(format, o.num_components),
1888
1889 /* Apply given offsets together */
1890 .src_offset = (o.dst_offset * 4) /* dwords */
1891 + streamout_offsets[o.output_buffer]
1892 };
1893
1894 return meta;
1895 }
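
/* Worked example of the src_offset computation above (hypothetical numbers):
 * a varying captured at dst_offset = 4 (counted in dwords) into buffer 0 with
 * streamout_offsets[0] = 256 lands at src_offset = 4 * 4 + 256 = 272 bytes. */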
1896
1897 /* Determine if we should capture a varying for XFB. This requires actually
1898  * having a buffer for it. If we don't capture it, we'll fall back to a general
1899 * varying path (linked or unlinked, possibly discarding the write) */
1900
1901 static bool
1902 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1903 unsigned loc, unsigned max_xfb)
1904 {
1905 if (!(xfb->so_mask & (1ll << loc)))
1906 return false;
1907
1908 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1909 return o->output_buffer < max_xfb;
1910 }
1911
1912 /* Higher-level wrapper around all of the above, classifying a varying into one
1913 * of the above types */
1914
1915 static struct mali_attr_meta
1916 panfrost_emit_varying(
1917 struct panfrost_shader_state *stage,
1918 struct panfrost_shader_state *other,
1919 struct panfrost_shader_state *xfb,
1920 unsigned present,
1921 unsigned max_xfb,
1922 unsigned *streamout_offsets,
1923 unsigned quirks,
1924 unsigned *gen_offsets,
1925 enum mali_format *gen_formats,
1926 unsigned *gen_stride,
1927 unsigned idx,
1928 bool should_alloc,
1929 bool is_fragment)
1930 {
1931 gl_varying_slot loc = stage->varyings_loc[idx];
1932 enum mali_format format = stage->varyings[idx];
1933
1934 /* Override format to match linkage */
1935 if (!should_alloc && gen_formats[idx])
1936 format = gen_formats[idx];
1937
1938 if (has_point_coord(stage->point_sprite_mask, loc)) {
1939 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1940 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1941 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1942 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1943 } else if (loc == VARYING_SLOT_POS) {
1944 if (is_fragment)
1945 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1946 else
1947 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1948 } else if (loc == VARYING_SLOT_PSIZ) {
1949 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1950 } else if (loc == VARYING_SLOT_PNTC) {
1951 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1952 } else if (loc == VARYING_SLOT_FACE) {
1953 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1954 }
1955
1956 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1957 signed other_idx = -1;
1958
1959 for (unsigned j = 0; j < other->varying_count; ++j) {
1960 if (other->varyings_loc[j] == loc) {
1961 other_idx = j;
1962 break;
1963 }
1964 }
1965
1966 if (other_idx < 0)
1967 return pan_emit_vary_only(present, quirks);
1968
1969 unsigned offset = gen_offsets[other_idx];
1970
1971 if (should_alloc) {
1972 /* We're linked, so allocate a space via a watermark allocation */
1973 enum mali_format alt = other->varyings[other_idx];
1974
1975 /* Do interpolation at minimum precision */
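                /* e.g. (illustrative) an fp32 output consumed as fp16 ends up packed at 2 bytes per channel */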
1976 unsigned size_main = pan_varying_size(format);
1977 unsigned size_alt = pan_varying_size(alt);
1978 unsigned size = MIN2(size_main, size_alt);
1979
1980 /* If a varying is marked for XFB but not actually captured, we
1981 * should match the format to the format that would otherwise
1982 * be used for XFB, since dEQP checks for invariance here. It's
1983 * unclear if this is required by the spec. */
1984
1985 if (xfb->so_mask & (1ull << loc)) {
1986 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1987 format = pan_xfb_format(format, o->num_components);
1988 size = pan_varying_size(format);
1989 } else if (size == size_alt) {
1990 format = alt;
1991 }
1992
1993 gen_offsets[idx] = *gen_stride;
1994 gen_formats[other_idx] = format;
1995 offset = *gen_stride;
1996 *gen_stride += size;
1997 }
1998
1999 return pan_emit_vary(present, PAN_VARY_GENERAL,
2000 quirks, format, offset);
2001 }
2002
2003 static void
2004 pan_emit_special_input(union mali_attr *varyings,
2005 unsigned present,
2006 enum pan_special_varying v,
2007 mali_ptr addr)
2008 {
2009 if (present & (1 << v)) {
2010 /* Ensure we write exactly once for performance and with fields
2011 * zeroed appropriately to avoid flakes */
2012
2013 union mali_attr s = {
2014 .elements = addr
2015 };
2016
2017 varyings[pan_varying_index(present, v)] = s;
2018 }
2019 }
2020
2021 void
2022 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2023 unsigned vertex_count,
2024 struct mali_vertex_tiler_postfix *vertex_postfix,
2025 struct mali_vertex_tiler_postfix *tiler_postfix,
2026 union midgard_primitive_size *primitive_size)
2027 {
2028 /* Load the shaders */
2029 struct panfrost_context *ctx = batch->ctx;
2030 struct panfrost_device *dev = pan_device(ctx->base.screen);
2031 struct panfrost_shader_state *vs, *fs;
2032 size_t vs_size, fs_size;
2033
2034 /* Allocate the varying descriptor */
2035
2036 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2037 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2038 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2039 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2040
2041 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
2042 vs_size +
2043 fs_size);
2044
2045 struct pipe_stream_output_info *so = &vs->stream_output;
2046 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2047
2048 /* Check if this varying is linked by us. This is the case for
2049 * general-purpose, non-captured varyings. If it is, link it. If it's
2050 * not, use the provided stream out information to determine the
2051 * offset, since it was already linked for us. */
2052
2053 unsigned gen_offsets[32];
2054 enum mali_format gen_formats[32];
2055 memset(gen_offsets, 0, sizeof(gen_offsets));
2056 memset(gen_formats, 0, sizeof(gen_formats));
2057
2058 unsigned gen_stride = 0;
2059 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2060 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2061
2062 unsigned streamout_offsets[32];
2063
2064 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2065 streamout_offsets[i] = panfrost_streamout_offset(
2066 so->stride[i],
2067 ctx->streamout.offsets[i],
2068 ctx->streamout.targets[i]);
2069 }
2070
2071 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2072 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2073
2074 for (unsigned i = 0; i < vs->varying_count; i++) {
2075 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2076 ctx->streamout.num_targets, streamout_offsets,
2077 dev->quirks,
2078 gen_offsets, gen_formats, &gen_stride, i, true, false);
2079 }
2080
2081 for (unsigned i = 0; i < fs->varying_count; i++) {
2082 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2083 ctx->streamout.num_targets, streamout_offsets,
2084 dev->quirks,
2085 gen_offsets, gen_formats, &gen_stride, i, false, true);
2086 }
2087
2088 unsigned xfb_base = pan_xfb_base(present);
2089 struct panfrost_transfer T = panfrost_allocate_transient(batch,
2090 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2091 union mali_attr *varyings = (union mali_attr *) T.cpu;
2092
2093 /* Emit the stream out buffers */
2094
2095 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2096 ctx->vertex_count);
2097
2098 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2099 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2100 so->stride[i],
2101 ctx->streamout.offsets[i],
2102 out_count,
2103 ctx->streamout.targets[i]);
2104 }
2105
2106 panfrost_emit_varyings(batch,
2107 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2108 gen_stride, vertex_count);
2109
2110 /* fp32 vec4 gl_Position */
2111 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2112 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2113 sizeof(float) * 4, vertex_count);
2114
2115 if (present & (1 << PAN_VARY_PSIZ)) {
2116 primitive_size->pointer = panfrost_emit_varyings(batch,
2117 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2118 2, vertex_count);
2119 }
2120
2121 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2122 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2123 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2124
2125 vertex_postfix->varyings = T.gpu;
2126 tiler_postfix->varyings = T.gpu;
2127
2128 vertex_postfix->varying_meta = trans.gpu;
2129 tiler_postfix->varying_meta = trans.gpu + vs_size;
2130 }
2131
2132 void
2133 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2134 struct mali_vertex_tiler_prefix *vertex_prefix,
2135 struct mali_vertex_tiler_postfix *vertex_postfix,
2136 struct mali_vertex_tiler_prefix *tiler_prefix,
2137 struct mali_vertex_tiler_postfix *tiler_postfix,
2138 union midgard_primitive_size *primitive_size)
2139 {
2140 struct panfrost_context *ctx = batch->ctx;
2141 struct panfrost_device *device = pan_device(ctx->base.screen);
2142 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2143 struct bifrost_payload_vertex bifrost_vertex = {0,};
2144 struct bifrost_payload_tiler bifrost_tiler = {0,};
2145 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2146 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2147 void *vp, *tp;
2148 size_t vp_size, tp_size;
2149
2150 if (device->quirks & IS_BIFROST) {
2151 bifrost_vertex.prefix = *vertex_prefix;
2152 bifrost_vertex.postfix = *vertex_postfix;
2153 vp = &bifrost_vertex;
2154 vp_size = sizeof(bifrost_vertex);
2155
2156 bifrost_tiler.prefix = *tiler_prefix;
2157 bifrost_tiler.tiler.primitive_size = *primitive_size;
2158 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2159 bifrost_tiler.postfix = *tiler_postfix;
2160 tp = &bifrost_tiler;
2161 tp_size = sizeof(bifrost_tiler);
2162 } else {
2163 midgard_vertex.prefix = *vertex_prefix;
2164 midgard_vertex.postfix = *vertex_postfix;
2165 vp = &midgard_vertex;
2166 vp_size = sizeof(midgard_vertex);
2167
2168 midgard_tiler.prefix = *tiler_prefix;
2169 midgard_tiler.postfix = *tiler_postfix;
2170 midgard_tiler.primitive_size = *primitive_size;
2171 tp = &midgard_tiler;
2172 tp_size = sizeof(midgard_tiler);
2173 }
2174
2175 if (wallpapering) {
2176 /* Inject in reverse order, with "predicted" job indices.
2177 * THIS IS A HACK XXX */
2178 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2179 batch->job_index + 2, tp, tp_size, true);
2180 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2181 vp, vp_size, true);
2182 return;
2183 }
2184
2185         /* If rasterizer discard is enabled, only submit the vertex job */
2186
2187 bool rasterizer_discard = ctx->rasterizer &&
2188 ctx->rasterizer->base.rasterizer_discard;
2189
2190 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2191 vp, vp_size, false);
2192
2193 if (rasterizer_discard)
2194 return;
2195
2196 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2197 false);
2198 }
2199
2200 /* TODO: stop hardcoding this */
2201 mali_ptr
2202 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2203 {
2204 uint16_t locations[] = {
2205 128, 128,
2206 0, 256,
2207 0, 256,
2208 0, 256,
2209 0, 256,
2210 0, 256,
2211 0, 256,
2212 0, 256,
2213 0, 256,
2214 0, 256,
2215 0, 256,
2216 0, 256,
2217 0, 256,
2218 0, 256,
2219 0, 256,
2220 0, 256,
2221 0, 256,
2222 0, 256,
2223 0, 256,
2224 0, 256,
2225 0, 256,
2226 0, 256,
2227 0, 256,
2228 0, 256,
2229 0, 256,
2230 0, 256,
2231 0, 256,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 128, 128,
2238 0, 0,
2239 0, 0,
2240 0, 0,
2241 0, 0,
2242 0, 0,
2243 0, 0,
2244 0, 0,
2245 0, 0,
2246 0, 0,
2247 0, 0,
2248 0, 0,
2249 0, 0,
2250 0, 0,
2251 0, 0,
2252 0, 0,
2253 };
2254
2255 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2256 }