panfrost: Document Midgard Inf/NaN suppress bit
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
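/* Only the [min_index, max_index] range of vertices is fed through the
 * vertex shader: offset_start rebases vertex fetch to min_index, and
 * offset_bias_correction appears to cancel that rebase when the indices
 * themselves are dereferenced (interpretation, not confirmed). */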
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
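/* Decompose padded_count as (2k + 1) << shift: shift counts the trailing
 * zero bits and k is the remaining odd factor halved. Presumably the
 * hardware wants the instance size expressed as an odd number times a
 * power of two, hence this split. */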
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349
350 /* TODO: This is not conformant on ES3 */
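/* Presumably MALI_SUPPRESS_INF_NAN makes the core flush out Infs/NaNs
 * rather than generate and propagate them as ES3 requires -- hence the
 * conformance caveat above. Guessed from the bit name; not confirmed
 * against hardware documentation. */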
351 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
352
353 meta->midgard1.flags_lo = 0x220;
354 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
355 }
356 }
357
358 static unsigned
359 panfrost_translate_compare_func(enum pipe_compare_func in)
360 {
361 switch (in) {
362 case PIPE_FUNC_NEVER:
363 return MALI_FUNC_NEVER;
364
365 case PIPE_FUNC_LESS:
366 return MALI_FUNC_LESS;
367
368 case PIPE_FUNC_EQUAL:
369 return MALI_FUNC_EQUAL;
370
371 case PIPE_FUNC_LEQUAL:
372 return MALI_FUNC_LEQUAL;
373
374 case PIPE_FUNC_GREATER:
375 return MALI_FUNC_GREATER;
376
377 case PIPE_FUNC_NOTEQUAL:
378 return MALI_FUNC_NOTEQUAL;
379
380 case PIPE_FUNC_GEQUAL:
381 return MALI_FUNC_GEQUAL;
382
383 case PIPE_FUNC_ALWAYS:
384 return MALI_FUNC_ALWAYS;
385
386 default:
387 unreachable("Invalid func");
388 }
389 }
390
391 static unsigned
392 panfrost_translate_stencil_op(enum pipe_stencil_op in)
393 {
394 switch (in) {
395 case PIPE_STENCIL_OP_KEEP:
396 return MALI_STENCIL_KEEP;
397
398 case PIPE_STENCIL_OP_ZERO:
399 return MALI_STENCIL_ZERO;
400
401 case PIPE_STENCIL_OP_REPLACE:
402 return MALI_STENCIL_REPLACE;
403
404 case PIPE_STENCIL_OP_INCR:
405 return MALI_STENCIL_INCR;
406
407 case PIPE_STENCIL_OP_DECR:
408 return MALI_STENCIL_DECR;
409
410 case PIPE_STENCIL_OP_INCR_WRAP:
411 return MALI_STENCIL_INCR_WRAP;
412
413 case PIPE_STENCIL_OP_DECR_WRAP:
414 return MALI_STENCIL_DECR_WRAP;
415
416 case PIPE_STENCIL_OP_INVERT:
417 return MALI_STENCIL_INVERT;
418
419 default:
420 unreachable("Invalid stencil op");
421 }
422 }
423
424 static unsigned
425 translate_tex_wrap(enum pipe_tex_wrap w)
426 {
427 switch (w) {
428 case PIPE_TEX_WRAP_REPEAT:
429 return MALI_WRAP_REPEAT;
430
431 case PIPE_TEX_WRAP_CLAMP:
432 return MALI_WRAP_CLAMP;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
435 return MALI_WRAP_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
438 return MALI_WRAP_CLAMP_TO_BORDER;
439
440 case PIPE_TEX_WRAP_MIRROR_REPEAT:
441 return MALI_WRAP_MIRRORED_REPEAT;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP:
444 return MALI_WRAP_MIRRORED_CLAMP;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
448
449 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
450 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
451
452 default:
453 unreachable("Invalid wrap");
454 }
455 }
456
457 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
458 struct mali_sampler_descriptor *hw)
459 {
460 unsigned func = panfrost_translate_compare_func(cso->compare_func);
461 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
462 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
463 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
464 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
465 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
466 unsigned mip_filter = mip_linear ?
467 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
468 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
469
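/* The compare function appears to be encoded with the operands swapped
 * relative to Gallium's convention, hence panfrost_flip_compare_func in
 * the descriptor below. */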
470 *hw = (struct mali_sampler_descriptor) {
471 .filter_mode = min_filter | mag_filter | mip_filter |
472 normalized,
473 .wrap_s = translate_tex_wrap(cso->wrap_s),
474 .wrap_t = translate_tex_wrap(cso->wrap_t),
475 .wrap_r = translate_tex_wrap(cso->wrap_r),
476 .compare_func = panfrost_flip_compare_func(func),
477 .border_color = {
478 cso->border_color.f[0],
479 cso->border_color.f[1],
480 cso->border_color.f[2],
481 cso->border_color.f[3]
482 },
483 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
484 .max_lod = FIXED_16(cso->max_lod, false),
485 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
486 .seamless_cube_map = cso->seamless_cube_map,
487 };
488
489 /* If necessary, we disable mipmapping in the sampler descriptor by
490 * clamping the LOD as tight as possible (from 0 to epsilon,
491 * essentially -- remember these are fixed point numbers, so
492 * epsilon=1/256) */
493
494 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
495 hw->max_lod = hw->min_lod + 1;
496 }
497
498 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
499 struct bifrost_sampler_descriptor *hw)
500 {
501 *hw = (struct bifrost_sampler_descriptor) {
502 .unk1 = 0x1,
503 .wrap_s = translate_tex_wrap(cso->wrap_s),
504 .wrap_t = translate_tex_wrap(cso->wrap_t),
505 .wrap_r = translate_tex_wrap(cso->wrap_r),
506 .unk8 = 0x8,
507 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
508 .norm_coords = cso->normalized_coords,
509 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
510 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
511 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
512 .max_lod = FIXED_16(cso->max_lod, false),
513 };
514
515 /* If necessary, we disable mipmapping in the sampler descriptor by
516 * clamping the LOD as tight as possible (from 0 to epsilon,
517 * essentially -- remember these are fixed point numbers, so
518 * epsilon=1/256) */
519
520 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
521 hw->max_lod = hw->min_lod + 1;
522 }
523
524 static void
525 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
526 struct mali_stencil_test *out)
527 {
528 out->ref = 0; /* Gallium gets it from elsewhere */
529
530 out->mask = in->valuemask;
531 out->func = panfrost_translate_compare_func(in->func);
532 out->sfail = panfrost_translate_stencil_op(in->fail_op);
533 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
534 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
535 }
536
537 static void
538 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta)
540 {
541 if (!ctx->rasterizer) {
542 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
544 fragmeta->depth_units = 0.0f;
545 fragmeta->depth_factor = 0.0f;
546 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
547 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
548 return;
549 }
550
551 bool msaa = ctx->rasterizer->base.multisample;
552
553 /* TODO: Sample size */
554 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
555 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
556 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
557 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
558
559 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
560
561 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
562 ctx->rasterizer->base.offset_tri);
563 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
564 ctx->rasterizer->base.offset_tri);
565 }
566
567 static void
568 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
569 struct mali_shader_meta *fragmeta)
570 {
571 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
572 int zfunc = PIPE_FUNC_ALWAYS;
573
574 if (!zsa) {
575 struct pipe_stencil_state default_stencil = {
576 .enabled = 0,
577 .func = PIPE_FUNC_ALWAYS,
578 .fail_op = MALI_STENCIL_KEEP,
579 .zfail_op = MALI_STENCIL_KEEP,
580 .zpass_op = MALI_STENCIL_KEEP,
581 .writemask = 0xFF,
582 .valuemask = 0xFF
583 };
584
585 panfrost_make_stencil_state(&default_stencil,
586 &fragmeta->stencil_front);
587 fragmeta->stencil_mask_front = default_stencil.writemask;
588 fragmeta->stencil_back = fragmeta->stencil_front;
589 fragmeta->stencil_mask_back = default_stencil.writemask;
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
591 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
592 } else {
593 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
594 zsa->stencil[0].enabled);
595 panfrost_make_stencil_state(&zsa->stencil[0],
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
598 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
599
600 /* If back-stencil is not enabled, use the front values */
601
602 if (zsa->stencil[1].enabled) {
603 panfrost_make_stencil_state(&zsa->stencil[1],
604 &fragmeta->stencil_back);
605 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
606 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
607 } else {
608 fragmeta->stencil_back = fragmeta->stencil_front;
609 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
610 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
611 }
612
613 if (zsa->depth.enabled)
614 zfunc = zsa->depth.func;
615
616 /* Depth state (TODO: Refactor) */
617
618 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
619 zsa->depth.writemask);
620 }
621
622 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
623 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
624 }
625
626 static bool
627 panfrost_fs_required(
628 struct panfrost_shader_state *fs,
629 struct panfrost_blend_final *blend,
630 unsigned rt_count)
631 {
632 /* If we generally have side effects */
633 if (fs->fs_sidefx)
634 return true;
635
636 /* If colour is written we need to execute */
637 for (unsigned i = 0; i < rt_count; ++i) {
638 if (!blend[i].no_colour)
639 return true;
640 }
641
642 /* If depth is written and not implied we need to execute.
643 * TODO: Predicate on Z/S writes being enabled */
644 return (fs->writes_depth || fs->writes_stencil);
645 }
646
647 static void
648 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
649 struct mali_shader_meta *fragmeta,
650 void *rts)
651 {
652 const struct panfrost_device *dev = pan_device(ctx->base.screen);
653 struct panfrost_shader_state *fs;
654 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
655
656 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
657 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
658 !ctx->blend->base.dither);
659
660 /* Get blending setup */
661 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
662
663 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
664 unsigned shader_offset = 0;
665 struct panfrost_bo *shader_bo = NULL;
666
667 for (unsigned c = 0; c < rt_count; ++c)
668 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
669 &shader_offset);
670
671 /* Disable shader execution if we can */
672 if (dev->quirks & MIDGARD_SHADERLESS
673 && !panfrost_fs_required(fs, blend, rt_count)) {
674 fragmeta->shader = 0;
675 fragmeta->attribute_count = 0;
676 fragmeta->varying_count = 0;
677 fragmeta->texture_count = 0;
678 fragmeta->sampler_count = 0;
679
680 /* This feature is not known to work on Bifrost */
681 fragmeta->midgard1.work_count = 1;
682 fragmeta->midgard1.uniform_count = 0;
683 fragmeta->midgard1.uniform_buffer_count = 0;
684 }
685
686 /* If there is a blend shader, work registers are shared. We impose 8
687 * work registers as a limit for blend shaders. Should be lower XXX */
688
689 if (!(dev->quirks & IS_BIFROST)) {
690 for (unsigned c = 0; c < rt_count; ++c) {
691 if (blend[c].is_shader) {
692 fragmeta->midgard1.work_count =
693 MAX2(fragmeta->midgard1.work_count, 8);
694 }
695 }
696 }
697
698 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
699 * copied to the blend_meta appended (by convention), but this is the
700 * field actually read by the hardware. (Or maybe both are read...?).
701 * Specify the last RTi with a blend shader. */
702
703 fragmeta->blend.shader = 0;
704
705 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
706 if (!blend[rt].is_shader)
707 continue;
708
709 fragmeta->blend.shader = blend[rt].shader.gpu |
710 blend[rt].shader.first_tag;
711 break;
712 }
713
714 if (dev->quirks & MIDGARD_SFBD) {
715 /* On single render target (SFBD) platforms, the blend
716 * information is inside the shader meta itself. We additionally
717 * need to signal CAN_DISCARD for nontrivial blend modes (so
718 * we're able to read back the destination buffer) */
719
720 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
721 blend[0].is_shader);
722
723 if (!blend[0].is_shader) {
724 fragmeta->blend.equation = *blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 !blend[0].no_blending || fs->can_discard);
730 return;
731 }
732
733 /* Additional blend descriptor tacked on for jobs using MFBD */
734
735 for (unsigned i = 0; i < rt_count; ++i) {
736 unsigned flags = 0;
737
738 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
739 flags = 0x200;
740
741 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
742 (ctx->pipe_framebuffer.cbufs[i]) &&
743 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
744
745 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
746 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
747 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
748 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
749 }
750
751 if (dev->quirks & IS_BIFROST) {
752 struct bifrost_blend_rt *brts = rts;
753
754 brts[i].flags = flags;
755
756 if (blend[i].is_shader) {
757 /* The blend shader's address needs to be at
758 * the same top 32 bit as the fragment shader.
759 * TODO: Ensure that's always the case.
760 */
761 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
762 (fs->bo->gpu & (0xffffffffull << 32)));
763 brts[i].shader = blend[i].shader.gpu;
764 brts[i].unk2 = 0x0;
765 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
766 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
767 const struct util_format_description *format_desc;
768 format_desc = util_format_description(format);
769
770 brts[i].equation = *blend[i].equation.equation;
771
772 /* TODO: this is a bit more complicated */
773 brts[i].constant = blend[i].equation.constant;
774
775 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
776
777 /* 0x19 disables blending and forces REPLACE
778 * mode (equivalent to rgb_mode = alpha_mode =
779 * 0x122, colour mask = 0xF). 0x1a allows
780 * blending. */
781 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
782
783 brts[i].shader_type = fs->blend_types[i];
784 } else {
785 /* Dummy attachment for depth-only */
786 brts[i].unk2 = 0x3;
787 brts[i].shader_type = fs->blend_types[i];
788 }
789 } else {
790 struct midgard_blend_rt *mrts = rts;
791 mrts[i].flags = flags;
792
793 if (blend[i].is_shader) {
794 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
795 } else {
796 mrts[i].blend.equation = *blend[i].equation.equation;
797 mrts[i].blend.constant = blend[i].equation.constant;
798 }
799 }
800 }
801 }
802
803 static void
804 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
805 struct mali_shader_meta *fragmeta,
806 void *rts)
807 {
808 const struct panfrost_device *dev = pan_device(ctx->base.screen);
809 struct panfrost_shader_state *fs;
810
811 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
812
813 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
814 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
815 fragmeta->unknown2_4 = 0x4e0;
816
817 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
818 * is required (independent of 32-bit/64-bit descriptors), or why it's
819 * not used on later GPU revisions. Otherwise, all shader jobs fault on
820 * these earlier chips (perhaps this is a chicken bit of some kind).
821 * More investigation is needed. */
822
823 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
824
825 if (dev->quirks & IS_BIFROST) {
826 /* TODO */
827 } else {
828 /* Depending on whether it's legal in the given shader, we try to
829 * enable early-z testing (or forward-pixel kill?) */
830
831 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
832 !fs->can_discard && !fs->writes_depth);
833
834 /* Add the writes Z/S flags if needed. */
835 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
836 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
837
838 /* Any time texturing is used, derivatives are implicitly calculated,
839 * so we need to enable helper invocations */
840
841 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
842 fs->helper_invocations);
843
844 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
845
846 bool depth_enabled = fs->writes_depth ||
847 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
848
849 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
850 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
851 }
852
853 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
854 panfrost_frag_meta_zsa_update(ctx, fragmeta);
855 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
856 }
857
858 void
859 panfrost_emit_shader_meta(struct panfrost_batch *batch,
860 enum pipe_shader_type st,
861 struct mali_vertex_tiler_postfix *postfix)
862 {
863 struct panfrost_context *ctx = batch->ctx;
864 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
865
866 if (!ss) {
867 postfix->shader = 0;
868 return;
869 }
870
871 struct mali_shader_meta meta;
872
873 panfrost_shader_meta_init(ctx, st, &meta);
874
875 /* Add the shader BO to the batch. */
876 panfrost_batch_add_bo(batch, ss->bo,
877 PAN_BO_ACCESS_PRIVATE |
878 PAN_BO_ACCESS_READ |
879 panfrost_bo_access_for_stage(st));
880
881 mali_ptr shader_ptr;
882
883 if (st == PIPE_SHADER_FRAGMENT) {
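/* For fragment shaders, the per-render-target blend descriptors are
 * appended directly after the shader meta in one transient allocation:
 * [mali_shader_meta][RT 0][RT 1]... */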
884 struct panfrost_device *dev = pan_device(ctx->base.screen);
885 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
886 size_t desc_size = sizeof(meta);
887 void *rts = NULL;
888 struct panfrost_transfer xfer;
889 unsigned rt_size;
890
891 if (dev->quirks & MIDGARD_SFBD)
892 rt_size = 0;
893 else if (dev->quirks & IS_BIFROST)
894 rt_size = sizeof(struct bifrost_blend_rt);
895 else
896 rt_size = sizeof(struct midgard_blend_rt);
897
898 desc_size += rt_size * rt_count;
899
900 if (rt_size)
901 rts = rzalloc_size(ctx, rt_size * rt_count);
902
903 panfrost_frag_shader_meta_init(ctx, &meta, rts);
904
905 xfer = panfrost_allocate_transient(batch, desc_size);
906
907 memcpy(xfer.cpu, &meta, sizeof(meta));
908 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
909
910 if (rt_size)
911 ralloc_free(rts);
912
913 shader_ptr = xfer.gpu;
914 } else {
915 shader_ptr = panfrost_upload_transient(batch, &meta,
916 sizeof(meta));
917 }
918
919 postfix->shader = shader_ptr;
920 }
921
922 static void
923 panfrost_mali_viewport_init(struct panfrost_context *ctx,
924 struct mali_viewport *mvp)
925 {
926 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
927
928 /* Clip bounds are encoded as floats. The viewport itself is encoded as
929 * (somewhat) asymmetric ints. */
930
931 const struct pipe_scissor_state *ss = &ctx->scissor;
932
933 memset(mvp, 0, sizeof(*mvp));
934
935 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
936 * each direction. Clipping to the viewport in theory should work, but
937 * in practice causes issues when we're not explicitly trying to
938 * scissor */
939
940 *mvp = (struct mali_viewport) {
941 .clip_minx = -INFINITY,
942 .clip_miny = -INFINITY,
943 .clip_maxx = INFINITY,
944 .clip_maxy = INFINITY,
945 };
946
947 /* Always scissor to the viewport by default. */
948 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
949 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
950
951 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
952 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
953
954 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
955 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
956
957 /* Apply the scissor test */
958
959 unsigned minx, miny, maxx, maxy;
960
961 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
962 minx = MAX2(ss->minx, vp_minx);
963 miny = MAX2(ss->miny, vp_miny);
964 maxx = MIN2(ss->maxx, vp_maxx);
965 maxy = MIN2(ss->maxy, vp_maxy);
966 } else {
967 minx = vp_minx;
968 miny = vp_miny;
969 maxx = vp_maxx;
970 maxy = vp_maxy;
971 }
972
973 /* Hardware needs the min/max to be strictly ordered, so flip if we
974 * need to. The viewport transformation in the vertex shader will
975 * handle the negatives if we don't */
976
977 if (miny > maxy) {
978 unsigned temp = miny;
979 miny = maxy;
980 maxy = temp;
981 }
982
983 if (minx > maxx) {
984 unsigned temp = minx;
985 minx = maxx;
986 maxx = temp;
987 }
988
989 if (minz > maxz) {
990 float temp = minz;
991 minz = maxz;
992 maxz = temp;
993 }
994
995 /* Clamp to the framebuffer size as a last check */
996
997 minx = MIN2(ctx->pipe_framebuffer.width, minx);
998 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
999
1000 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1001 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1002
1003 /* Upload */
1004
1005 mvp->viewport0[0] = minx;
1006 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1007
1008 mvp->viewport0[1] = miny;
1009 mvp->viewport1[1] = MALI_POSITIVE(maxy);
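/* viewport1 holds the inclusive maxima: MALI_POSITIVE(x) encodes x - 1,
 * which is why panfrost_emit_viewport adds the +1 back when growing the
 * batch scissor. */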
1010
1011 mvp->clip_minz = minz;
1012 mvp->clip_maxz = maxz;
1013 }
1014
1015 void
1016 panfrost_emit_viewport(struct panfrost_batch *batch,
1017 struct mali_vertex_tiler_postfix *tiler_postfix)
1018 {
1019 struct panfrost_context *ctx = batch->ctx;
1020 struct mali_viewport mvp;
1021
1022 panfrost_mali_viewport_init(batch->ctx, &mvp);
1023
1024 /* Update the job, unless we're doing wallpapering (whose lack of
1025 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1026 * just... be faster :) */
1027
1028 if (!ctx->wallpaper_batch)
1029 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1030 mvp.viewport0[1],
1031 mvp.viewport1[0] + 1,
1032 mvp.viewport1[1] + 1);
1033
1034 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1035 sizeof(mvp));
1036 }
1037
1038 static mali_ptr
1039 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1040 enum pipe_shader_type st,
1041 struct panfrost_constant_buffer *buf,
1042 unsigned index)
1043 {
1044 struct pipe_constant_buffer *cb = &buf->cb[index];
1045 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1046
1047 if (rsrc) {
1048 panfrost_batch_add_bo(batch, rsrc->bo,
1049 PAN_BO_ACCESS_SHARED |
1050 PAN_BO_ACCESS_READ |
1051 panfrost_bo_access_for_stage(st));
1052
1053 /* Alignment guaranteed by
1054 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1055 return rsrc->bo->gpu + cb->buffer_offset;
1056 } else if (cb->user_buffer) {
1057 return panfrost_upload_transient(batch,
1058 cb->user_buffer +
1059 cb->buffer_offset,
1060 cb->buffer_size);
1061 } else {
1062 unreachable("No constant buffer");
1063 }
1064 }
1065
1066 struct sysval_uniform {
1067 union {
1068 float f[4];
1069 int32_t i[4];
1070 uint32_t u[4];
1071 uint64_t du[2];
1072 };
1073 };
1074
1075 static void
1076 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1077 struct sysval_uniform *uniform)
1078 {
1079 struct panfrost_context *ctx = batch->ctx;
1080 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1081
1082 uniform->f[0] = vp->scale[0];
1083 uniform->f[1] = vp->scale[1];
1084 uniform->f[2] = vp->scale[2];
1085 }
1086
1087 static void
1088 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1089 struct sysval_uniform *uniform)
1090 {
1091 struct panfrost_context *ctx = batch->ctx;
1092 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1093
1094 uniform->f[0] = vp->translate[0];
1095 uniform->f[1] = vp->translate[1];
1096 uniform->f[2] = vp->translate[2];
1097 }
1098
1099 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1100 enum pipe_shader_type st,
1101 unsigned int sysvalid,
1102 struct sysval_uniform *uniform)
1103 {
1104 struct panfrost_context *ctx = batch->ctx;
1105 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1106 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1107 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1108 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1109
1110 assert(dim);
1111 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1112
1113 if (dim > 1)
1114 uniform->i[1] = u_minify(tex->texture->height0,
1115 tex->u.tex.first_level);
1116
1117 if (dim > 2)
1118 uniform->i[2] = u_minify(tex->texture->depth0,
1119 tex->u.tex.first_level);
1120
1121 if (is_array)
1122 uniform->i[dim] = tex->texture->array_size;
1123 }
1124
1125 static void
1126 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1127 enum pipe_shader_type st,
1128 unsigned ssbo_id,
1129 struct sysval_uniform *uniform)
1130 {
1131 struct panfrost_context *ctx = batch->ctx;
1132
1133 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1134 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1135
1136 /* Compute address */
1137 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1138
1139 panfrost_batch_add_bo(batch, bo,
1140 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1141 panfrost_bo_access_for_stage(st));
1142
1143 /* Upload address and size as sysval */
1144 uniform->du[0] = bo->gpu + sb.buffer_offset;
1145 uniform->u[2] = sb.buffer_size;
1146 }
1147
1148 static void
1149 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1150 enum pipe_shader_type st,
1151 unsigned samp_idx,
1152 struct sysval_uniform *uniform)
1153 {
1154 struct panfrost_context *ctx = batch->ctx;
1155 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1156
1157 uniform->f[0] = sampl->min_lod;
1158 uniform->f[1] = sampl->max_lod;
1159 uniform->f[2] = sampl->lod_bias;
1160
1161 /* Even without any errata, Midgard represents "no mipmapping" as
1162 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1163 * panfrost_create_sampler_state which also explains our choice of
1164 * epsilon value (again to keep behaviour consistent) */
1165
1166 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1167 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1168 }
1169
1170 static void
1171 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1172 struct sysval_uniform *uniform)
1173 {
1174 struct panfrost_context *ctx = batch->ctx;
1175
1176 uniform->u[0] = ctx->compute_grid->grid[0];
1177 uniform->u[1] = ctx->compute_grid->grid[1];
1178 uniform->u[2] = ctx->compute_grid->grid[2];
1179 }
1180
1181 static void
1182 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1183 struct panfrost_shader_state *ss,
1184 enum pipe_shader_type st)
1185 {
1186 struct sysval_uniform *uniforms = (void *)buf;
1187
1188 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1189 int sysval = ss->sysval[i];
1190
1191 switch (PAN_SYSVAL_TYPE(sysval)) {
1192 case PAN_SYSVAL_VIEWPORT_SCALE:
1193 panfrost_upload_viewport_scale_sysval(batch,
1194 &uniforms[i]);
1195 break;
1196 case PAN_SYSVAL_VIEWPORT_OFFSET:
1197 panfrost_upload_viewport_offset_sysval(batch,
1198 &uniforms[i]);
1199 break;
1200 case PAN_SYSVAL_TEXTURE_SIZE:
1201 panfrost_upload_txs_sysval(batch, st,
1202 PAN_SYSVAL_ID(sysval),
1203 &uniforms[i]);
1204 break;
1205 case PAN_SYSVAL_SSBO:
1206 panfrost_upload_ssbo_sysval(batch, st,
1207 PAN_SYSVAL_ID(sysval),
1208 &uniforms[i]);
1209 break;
1210 case PAN_SYSVAL_NUM_WORK_GROUPS:
1211 panfrost_upload_num_work_groups_sysval(batch,
1212 &uniforms[i]);
1213 break;
1214 case PAN_SYSVAL_SAMPLER:
1215 panfrost_upload_sampler_sysval(batch, st,
1216 PAN_SYSVAL_ID(sysval),
1217 &uniforms[i]);
1218 break;
1219 default:
1220 assert(0);
1221 }
1222 }
1223 }
1224
1225 static const void *
1226 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1227 unsigned index)
1228 {
1229 struct pipe_constant_buffer *cb = &buf->cb[index];
1230 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1231
1232 if (rsrc)
1233 return rsrc->bo->cpu;
1234 else if (cb->user_buffer)
1235 return cb->user_buffer;
1236 else
1237 unreachable("No constant buffer");
1238 }
1239
1240 void
1241 panfrost_emit_const_buf(struct panfrost_batch *batch,
1242 enum pipe_shader_type stage,
1243 struct mali_vertex_tiler_postfix *postfix)
1244 {
1245 struct panfrost_context *ctx = batch->ctx;
1246 struct panfrost_shader_variants *all = ctx->shader[stage];
1247
1248 if (!all)
1249 return;
1250
1251 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1252
1253 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1254
1255 /* Uniforms are implicitly UBO #0 */
1256 bool has_uniforms = buf->enabled_mask & (1 << 0);
1257
1258 /* Allocate room for the sysval and the uniforms */
1259 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1260 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1261 size_t size = sys_size + uniform_size;
1262 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1263 size);
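/* The transient block holds the sysvals first, with the user uniforms
 * packed immediately after; UBO #0 below points at the start of it. */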
1264
1265 /* Upload sysvals requested by the shader */
1266 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1267
1268 /* Upload uniforms */
1269 if (has_uniforms && uniform_size) {
1270 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1271 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1272 }
1273
1274 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1275 * uploaded */
1276
1277 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1278 assert(ubo_count >= 1);
1279
1280 size_t sz = sizeof(uint64_t) * ubo_count;
1281 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1282 int uniform_count = ss->uniform_count;
1283
1284 /* Upload uniforms as a UBO */
1285 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1286
1287 /* The rest are honest-to-goodness UBOs */
1288
1289 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1290 size_t usz = buf->cb[ubo].buffer_size;
1291 bool enabled = buf->enabled_mask & (1 << ubo);
1292 bool empty = usz == 0;
1293
1294 if (!enabled || empty) {
1295 /* Stub out disabled UBOs to catch accesses */
1296 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1297 continue;
1298 }
1299
1300 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1301 buf, ubo);
1302
1303 unsigned bytes_per_field = 16;
1304 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1305 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1306 }
1307
1308 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1309 postfix->uniforms = transfer.gpu;
1310 postfix->uniform_buffers = ubufs;
1311
1312 buf->dirty_mask = 0;
1313 }
1314
1315 void
1316 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1317 const struct pipe_grid_info *info,
1318 struct midgard_payload_vertex_tiler *vtp)
1319 {
1320 struct panfrost_context *ctx = batch->ctx;
1321 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1322 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1323 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1324 128));
1325 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1326 info->grid[2] * 4;
1327 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1328 shared_size,
1329 1);
1330
1331 struct mali_shared_memory shared = {
1332 .shared_memory = bo->gpu,
1333 .shared_workgroup_count =
1334 util_logbase2_ceil(info->grid[0]) +
1335 util_logbase2_ceil(info->grid[1]) +
1336 util_logbase2_ceil(info->grid[2]),
1337 .shared_unk1 = 0x2,
1338 .shared_shift = util_logbase2(single_size) - 1
1339 };
1340
1341 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1342 sizeof(shared));
1343 }
1344
1345 static mali_ptr
1346 panfrost_get_tex_desc(struct panfrost_batch *batch,
1347 enum pipe_shader_type st,
1348 struct panfrost_sampler_view *view)
1349 {
1350 if (!view)
1351 return (mali_ptr) 0;
1352
1353 struct pipe_sampler_view *pview = &view->base;
1354 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1355
1356 /* Add the BO to the job so it's retained until the job is done. */
1357
1358 panfrost_batch_add_bo(batch, rsrc->bo,
1359 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1360 panfrost_bo_access_for_stage(st));
1361
1362 panfrost_batch_add_bo(batch, view->midgard_bo,
1363 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1364 panfrost_bo_access_for_stage(st));
1365
1366 return view->midgard_bo->gpu;
1367 }
1368
1369 void
1370 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1371 enum pipe_shader_type stage,
1372 struct mali_vertex_tiler_postfix *postfix)
1373 {
1374 struct panfrost_context *ctx = batch->ctx;
1375 struct panfrost_device *device = pan_device(ctx->base.screen);
1376
1377 if (!ctx->sampler_view_count[stage])
1378 return;
1379
1380 if (device->quirks & IS_BIFROST) {
1381 struct bifrost_texture_descriptor *descriptors;
1382
1383 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1384 ctx->sampler_view_count[stage]);
1385
1386 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1387 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1388 struct pipe_sampler_view *pview = &view->base;
1389 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1390
1391 /* Add the BOs to the job so they are retained until the job is done. */
1392
1393 panfrost_batch_add_bo(batch, rsrc->bo,
1394 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1395 panfrost_bo_access_for_stage(stage));
1396
1397 panfrost_batch_add_bo(batch, view->bifrost_bo,
1398 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1399 panfrost_bo_access_for_stage(stage));
1400
1401 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1402 }
1403
1404 postfix->textures = panfrost_upload_transient(batch,
1405 descriptors,
1406 sizeof(struct bifrost_texture_descriptor) *
1407 ctx->sampler_view_count[stage]);
1408
1409 free(descriptors);
1410 } else {
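/* Midgard takes an array of 64-bit pointers ("trampolines") to the
 * individual texture descriptors, rather than inlining the descriptors
 * as on Bifrost above. */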
1411 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1412
1413 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1414 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1415 ctx->sampler_views[stage][i]);
1416
1417 postfix->textures = panfrost_upload_transient(batch,
1418 trampolines,
1419 sizeof(uint64_t) *
1420 ctx->sampler_view_count[stage]);
1421 }
1422 }
1423
1424 void
1425 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1426 enum pipe_shader_type stage,
1427 struct mali_vertex_tiler_postfix *postfix)
1428 {
1429 struct panfrost_context *ctx = batch->ctx;
1430 struct panfrost_device *device = pan_device(ctx->base.screen);
1431
1432 if (!ctx->sampler_count[stage])
1433 return;
1434
1435 if (device->quirks & IS_BIFROST) {
1436 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1437 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1438 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1439 transfer_size);
1440 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1441
1442 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1443 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1444
1445 postfix->sampler_descriptor = transfer.gpu;
1446 } else {
1447 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1448 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1449 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1450 transfer_size);
1451 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1452
1453 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1454 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1455
1456 postfix->sampler_descriptor = transfer.gpu;
1457 }
1458 }
1459
1460 void
1461 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1462 struct mali_vertex_tiler_postfix *vertex_postfix)
1463 {
1464 struct panfrost_context *ctx = batch->ctx;
1465
1466 if (!ctx->vertex)
1467 return;
1468
1469 struct panfrost_vertex_state *so = ctx->vertex;
1470
1471 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1472 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1473 sizeof(*so->hw) *
1474 PAN_MAX_ATTRIBUTE);
1475 }
1476
1477 void
1478 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1479 struct mali_vertex_tiler_postfix *vertex_postfix)
1480 {
1481 struct panfrost_context *ctx = batch->ctx;
1482 struct panfrost_vertex_state *so = ctx->vertex;
1483
1484 /* Staged mali_attr, and index into them. i =/= k, depending on the
1485 * vertex buffer mask and instancing. Twice as much room is allocated,
1486 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1487 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1488 unsigned k = 0;
1489
1490 for (unsigned i = 0; i < so->num_elements; ++i) {
1491 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1492 * means duplicating some vertex buffers (who cares? aside from
1493 * maybe some caching implications but I somehow doubt that
1494 * matters) */
1495
1496 struct pipe_vertex_element *elem = &so->pipe[i];
1497 unsigned vbi = elem->vertex_buffer_index;
1498
1499 /* The exception to 1:1 mapping is that we can have multiple
1500 * entries (NPOT divisors), so we fixup anyways */
1501
1502 so->hw[i].index = k;
1503
1504 if (!(ctx->vb_mask & (1 << vbi)))
1505 continue;
1506
1507 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1508 struct panfrost_resource *rsrc;
1509
1510 rsrc = pan_resource(buf->buffer.resource);
1511 if (!rsrc)
1512 continue;
1513
1514 /* Align to 64 bytes by masking off the lower bits. This
1515 * will be adjusted back when we fixup the src_offset in
1516 * mali_attr_meta */
1517
1518 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1519 mali_ptr addr = raw_addr & ~63;
1520 unsigned chopped_addr = raw_addr - addr;
1521
1522 /* Add a dependency of the batch on the vertex buffer */
1523 panfrost_batch_add_bo(batch, rsrc->bo,
1524 PAN_BO_ACCESS_SHARED |
1525 PAN_BO_ACCESS_READ |
1526 PAN_BO_ACCESS_VERTEX_TILER);
1527
1528 /* Set common fields */
1529 attrs[k].elements = addr;
1530 attrs[k].stride = buf->stride;
1531
1532 /* Since we advanced the base pointer, we shrink the buffer
1533 * size */
1534 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1535
1536 /* We need to add the extra size we masked off (for
1537 * correctness) so the data doesn't get clamped away */
1538 attrs[k].size += chopped_addr;
1539
1540 /* For non-instancing make sure we initialize */
1541 attrs[k].shift = attrs[k].extra_flags = 0;
1542
1543 /* Instancing uses a dramatically different code path than
1544 * linear, so dispatch for the actual emission now that the
1545 * common code is finished */
1546
1547 unsigned divisor = elem->instance_divisor;
1548
1549 if (divisor && ctx->instance_count == 1) {
1550 /* Silly corner case where there's a divisor(=1) but
1551 * there's no legitimate instancing. So we want *every*
1552 * attribute to be the same. So set stride to zero so
1553 * we don't go anywhere. */
1554
1555 attrs[k].size = attrs[k].stride + chopped_addr;
1556 attrs[k].stride = 0;
1557 attrs[k++].elements |= MALI_ATTR_LINEAR;
1558 } else if (ctx->instance_count <= 1) {
1559 /* Normal, non-instanced attributes */
1560 attrs[k++].elements |= MALI_ATTR_LINEAR;
1561 } else {
1562 unsigned instance_shift = vertex_postfix->instance_shift;
1563 unsigned instance_odd = vertex_postfix->instance_odd;
1564
1565 k += panfrost_vertex_instanced(ctx->padded_count,
1566 instance_shift,
1567 instance_odd,
1568 divisor, &attrs[k]);
1569 }
1570 }
1571
1572 /* Add special gl_VertexID/gl_InstanceID buffers */
1573
1574 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1575 so->hw[PAN_VERTEX_ID].index = k++;
1576 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1577 so->hw[PAN_INSTANCE_ID].index = k++;
1578
1579 /* Upload whatever we emitted and go */
1580
1581 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1582 k * sizeof(*attrs));
1583 }
1584
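/* Allocate a transient buffer big enough for `count` elements of the
 * given stride, point the attribute record at it, and return the GPU
 * address so callers can also reference it directly (e.g. as the
 * position varying). */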
1585 static mali_ptr
1586 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1587 unsigned stride, unsigned count)
1588 {
1589 /* Fill out the descriptor */
1590 slot->stride = stride;
1591 slot->size = stride * count;
1592 slot->shift = slot->extra_flags = 0;
1593
1594 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1595 slot->size);
1596
1597 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1598
1599 return transfer.gpu;
1600 }
1601
1602 static void
1603 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1604 unsigned stride, unsigned offset, unsigned count,
1605 struct pipe_stream_output_target *target)
1606 {
1607 /* Fill out the descriptor */
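/* Gallium expresses the stream output stride in dwords, so convert to
 * bytes for the hardware descriptor */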
1608 slot->stride = stride * 4;
1609 slot->shift = slot->extra_flags = 0;
1610
1611 unsigned max_size = target->buffer_size;
1612 unsigned expected_size = slot->stride * count;
1613
1614 slot->size = MIN2(max_size, expected_size);
1615
1616 /* Grab the BO and bind it to the batch */
1617 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1618
1619 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1620 * the perspective of the TILER and FRAGMENT.
1621 */
1622 panfrost_batch_add_bo(batch, bo,
1623 PAN_BO_ACCESS_SHARED |
1624 PAN_BO_ACCESS_RW |
1625 PAN_BO_ACCESS_VERTEX_TILER |
1626 PAN_BO_ACCESS_FRAGMENT);
1627
1628 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1629 slot->elements = addr;
1630 }
1631
1632 /* Given a shader and buffer indices, link varying metadata together */
1633
1634 static bool
1635 is_special_varying(gl_varying_slot loc)
1636 {
1637 switch (loc) {
1638 case VARYING_SLOT_POS:
1639 case VARYING_SLOT_PSIZ:
1640 case VARYING_SLOT_PNTC:
1641 case VARYING_SLOT_FACE:
1642 return true;
1643 default:
1644 return false;
1645 }
1646 }
1647
1648 static void
1649 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1650 signed general, signed gl_Position,
1651 signed gl_PointSize, signed gl_PointCoord,
1652 signed gl_FrontFacing)
1653 {
1654 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1655
1656 for (unsigned i = 0; i < ss->varying_count; ++i) {
1657 gl_varying_slot location = ss->varyings_loc[i];
1658 int index = -1;
1659
1660 switch (location) {
1661 case VARYING_SLOT_POS:
1662 index = gl_Position;
1663 break;
1664 case VARYING_SLOT_PSIZ:
1665 index = gl_PointSize;
1666 break;
1667 case VARYING_SLOT_PNTC:
1668 index = gl_PointCoord;
1669 break;
1670 case VARYING_SLOT_FACE:
1671 index = gl_FrontFacing;
1672 break;
1673 default:
1674 index = general;
1675 break;
1676 }
1677
1678 assert(index >= 0);
1679 out[i].index = index;
1680 }
1681 }
1682
1683 static bool
1684 has_point_coord(unsigned mask, gl_varying_slot loc)
1685 {
1686 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1687 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1688 else if (loc == VARYING_SLOT_PNTC)
1689 return (mask & (1 << 8));
1690 else
1691 return false;
1692 }
1693
1694 /* Helpers for manipulating stream out information so we can pack varyings
1695 * accordingly. Compute the src_offset for a given captured varying */
1696
1697 static struct pipe_stream_output *
1698 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1699 {
1700 for (unsigned i = 0; i < info->num_outputs; ++i) {
1701 if (info->output[i].register_index == loc)
1702 return &info->output[i];
1703 }
1704
1705 unreachable("Varying not captured");
1706 }
1707
1708 void
1709 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1710 unsigned vertex_count,
1711 struct mali_vertex_tiler_postfix *vertex_postfix,
1712 struct mali_vertex_tiler_postfix *tiler_postfix,
1713 union midgard_primitive_size *primitive_size)
1714 {
1715 /* Load the shaders */
1716 struct panfrost_context *ctx = batch->ctx;
1717 struct panfrost_shader_state *vs, *fs;
1718 unsigned int num_gen_varyings = 0;
1719 size_t vs_size, fs_size;
1720
1721 /* Allocate the varying descriptor */
1722
1723 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1724 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1725 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1726 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1727
1728 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1729 vs_size +
1730 fs_size);
1731
1732 struct pipe_stream_output_info *so = &vs->stream_output;
1733
1734 /* Check if this varying is linked by us. This is the case for
1735 * general-purpose, non-captured varyings. If it is, link it. If it's
1736 * not, use the provided stream out information to determine the
1737 * offset, since it was already linked for us. */
1738
1739 for (unsigned i = 0; i < vs->varying_count; i++) {
1740 gl_varying_slot loc = vs->varyings_loc[i];
1741
1742 bool special = is_special_varying(loc);
1743 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1744
1745 if (captured) {
1746 struct pipe_stream_output *o = pan_get_so(so, loc);
1747
1748 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1749 vs->varyings[i].src_offset = dst_offset;
1750 } else if (!special) {
1751 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1752 }
1753 }
1754
1755 /* Conversely, we need to set src_offset for the captured varyings.
1756 * Here, the layout is defined by the stream out info, not us */
1757
1758 /* Link up with fragment varyings */
1759 bool reads_point_coord = fs->reads_point_coord;
1760
1761 for (unsigned i = 0; i < fs->varying_count; i++) {
1762 gl_varying_slot loc = fs->varyings_loc[i];
1763 unsigned src_offset;
1764 signed vs_idx = -1;
1765
1766 /* Link up */
1767 for (unsigned j = 0; j < vs->varying_count; ++j) {
1768 if (vs->varyings_loc[j] == loc) {
1769 vs_idx = j;
1770 break;
1771 }
1772 }
1773
1774 /* Either assign or reuse */
1775 if (vs_idx >= 0)
1776 src_offset = vs->varyings[vs_idx].src_offset;
1777 else
1778 src_offset = 16 * (num_gen_varyings++);
1779
1780 fs->varyings[i].src_offset = src_offset;
1781
1782 if (has_point_coord(fs->point_sprite_mask, loc))
1783 reads_point_coord = true;
1784 }
1785
1786 memcpy(trans.cpu, vs->varyings, vs_size);
1787 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1788
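        /* The metadata is in place; now build the attribute records (buffers)
         * that the metadata's index field points at */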
1789 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1790
1791 /* Figure out how many streamout buffers could be bound */
1792 unsigned so_count = ctx->streamout.num_targets;
1793 for (unsigned i = 0; i < vs->varying_count; i++) {
1794 gl_varying_slot loc = vs->varyings_loc[i];
1795
1796 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1797 if (!captured) continue;
1798
1799 struct pipe_stream_output *o = pan_get_so(so, loc);
1800 so_count = MAX2(so_count, o->output_buffer + 1);
1801 }
1802
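        /* Allocate varying buffer slots: the stream out buffers come first,
         * then the general varying buffer, then one slot per special varying
         * that is actually used (unused specials get index -1) */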
1803 signed idx = so_count;
1804 signed general = idx++;
1805 signed gl_Position = idx++;
1806 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1807 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1808 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1809 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1810
1811 /* Emit the stream out buffers */
1812
1813 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1814 ctx->vertex_count);
1815
1816 for (unsigned i = 0; i < so_count; ++i) {
1817 if (i < ctx->streamout.num_targets) {
1818 panfrost_emit_streamout(batch, &varyings[i],
1819 so->stride[i],
1820 ctx->streamout.offsets[i],
1821 out_count,
1822 ctx->streamout.targets[i]);
1823 } else {
1824 /* Emit a dummy buffer */
1825 panfrost_emit_varyings(batch, &varyings[i],
1826 so->stride[i] * 4,
1827 out_count);
1828
1829 /* Clear the attribute type */
1830 varyings[i].elements &= ~0xF;
1831 }
1832 }
1833
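        /* General varyings are packed at 16 bytes (an fp32 vec4) per slot, as
         * assigned by src_offset above */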
1834 panfrost_emit_varyings(batch, &varyings[general],
1835 num_gen_varyings * 16,
1836 vertex_count);
1837
1838 mali_ptr varyings_p;
1839
1840 /* fp32 vec4 gl_Position */
1841 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1842 sizeof(float) * 4, vertex_count);
1843 tiler_postfix->position_varying = varyings_p;
1844
1845
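        /* 16-bit scalar gl_PointSize, one entry per vertex */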
1846 if (panfrost_writes_point_size(ctx)) {
1847 varyings_p = panfrost_emit_varyings(batch,
1848 &varyings[gl_PointSize],
1849 2, vertex_count);
1850 primitive_size->pointer = varyings_p;
1851 }
1852
1853 if (reads_point_coord)
1854 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1855
1856 if (fs->reads_face)
1857 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1858
1859 if (fs->reads_frag_coord)
1860 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1861
1862 struct panfrost_device *device = pan_device(ctx->base.screen);
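        /* gl_PointCoord isn't wired up for Bifrost in this path yet */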
1863 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1864
1865         /* Let's go ahead and link varying meta to the buffer in question, now
1866          * that that information is available. VARYING_SLOT_POS is mapped to
1867          * gl_FragCoord for fragment shaders but gl_Position for vertex
1868          * shaders */
1869
1870 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1871 gl_PointSize, gl_PointCoord,
1872 gl_FrontFacing);
1873
1874 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1875 gl_FragCoord, gl_PointSize,
1876 gl_PointCoord, gl_FrontFacing);
1877
1878         /* Patch the captured varyings to source from the stream out buffers */
1879
1880 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1881 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1882
1883 for (unsigned i = 0; i < vs->varying_count; i++) {
1884 gl_varying_slot loc = vs->varyings_loc[i];
1885
1886 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1887 if (!captured)
1888 continue;
1889
1890 struct pipe_stream_output *o = pan_get_so(so, loc);
1891 ovs[i].index = o->output_buffer;
1892
1893 assert(o->stream == 0);
1894 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1895 | MALI_NR_CHANNELS(o->num_components);
1896
1897 if (device->quirks & HAS_SWIZZLES)
1898 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1899 else
1900 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1901
1902 /* Link to the fragment */
1903 signed fs_idx = -1;
1904
1905 /* Link up */
1906 for (unsigned j = 0; j < fs->varying_count; ++j) {
1907 if (fs->varyings_loc[j] == loc) {
1908 fs_idx = j;
1909 break;
1910 }
1911 }
1912
1913 if (fs_idx >= 0) {
1914 ofs[fs_idx].index = ovs[i].index;
1915 ofs[fs_idx].format = ovs[i].format;
1916 ofs[fs_idx].swizzle = ovs[i].swizzle;
1917 }
1918 }
1919
1920 /* Replace point sprite */
1921 for (unsigned i = 0; i < fs->varying_count; i++) {
1922                 /* If we have a point sprite replacement, handle that here. We
1923                  * have to translate the location first. TODO: Flip Y in the shader
1924                  * instead; we already key the shader on this, it just isn't done yet */
1925
1926 if (has_point_coord(fs->point_sprite_mask,
1927 fs->varyings_loc[i])) {
1928 ofs[i].index = gl_PointCoord;
1929
1930 /* Swizzle out the z/w to 0/1 */
1931 ofs[i].format = MALI_RG16F;
1932 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1933 }
1934 }
1935
1936         /* Fix up unaligned addresses: records must be 64-byte aligned, so fold any misalignment into each consumer's src_offset */
1937 for (unsigned i = 0; i < so_count; ++i) {
1938 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1939 continue;
1940
1941 unsigned align = (varyings[i].elements & 63);
1942
1943 /* While we're at it, the SO buffers are linear */
1944
1945 if (!align) {
1946 varyings[i].elements |= MALI_ATTR_LINEAR;
1947 continue;
1948 }
1949
1950 /* We need to adjust alignment */
1951 varyings[i].elements &= ~63;
1952 varyings[i].elements |= MALI_ATTR_LINEAR;
1953 varyings[i].size += align;
1954
1955 for (unsigned v = 0; v < vs->varying_count; ++v) {
1956 if (ovs[v].index != i)
1957 continue;
1958
1959 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1960 }
1961
1962 for (unsigned f = 0; f < fs->varying_count; ++f) {
1963 if (ofs[f].index != i)
1964 continue;
1965
1966 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1967 }
1968 }
1969
1970 varyings_p = panfrost_upload_transient(batch, varyings,
1971 idx * sizeof(*varyings));
1972 vertex_postfix->varyings = varyings_p;
1973 tiler_postfix->varyings = varyings_p;
1974
1975 vertex_postfix->varying_meta = trans.gpu;
1976 tiler_postfix->varying_meta = trans.gpu + vs_size;
1977 }
1978
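/* Emit the vertex and tiler jobs for a draw, wrapping the shared
 * prefix/postfix in the GPU-specific payload and handling rasterizer discard
 * and the wallpaper (blit) hack */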
1979 void
1980 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1981 struct mali_vertex_tiler_prefix *vertex_prefix,
1982 struct mali_vertex_tiler_postfix *vertex_postfix,
1983 struct mali_vertex_tiler_prefix *tiler_prefix,
1984 struct mali_vertex_tiler_postfix *tiler_postfix,
1985 union midgard_primitive_size *primitive_size)
1986 {
1987 struct panfrost_context *ctx = batch->ctx;
1988 struct panfrost_device *device = pan_device(ctx->base.screen);
1989 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1990 struct bifrost_payload_vertex bifrost_vertex = {0,};
1991 struct bifrost_payload_tiler bifrost_tiler = {0,};
1992 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1993 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1994 void *vp, *tp;
1995 size_t vp_size, tp_size;
1996
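        /* Bifrost and Midgard use different payload layouts, so copy the
         * common prefix/postfix into the right wrapper before submitting */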
1997 if (device->quirks & IS_BIFROST) {
1998 bifrost_vertex.prefix = *vertex_prefix;
1999 bifrost_vertex.postfix = *vertex_postfix;
2000 vp = &bifrost_vertex;
2001 vp_size = sizeof(bifrost_vertex);
2002
2003 bifrost_tiler.prefix = *tiler_prefix;
2004 bifrost_tiler.tiler.primitive_size = *primitive_size;
2005 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2006 bifrost_tiler.postfix = *tiler_postfix;
2007 tp = &bifrost_tiler;
2008 tp_size = sizeof(bifrost_tiler);
2009 } else {
2010 midgard_vertex.prefix = *vertex_prefix;
2011 midgard_vertex.postfix = *vertex_postfix;
2012 vp = &midgard_vertex;
2013 vp_size = sizeof(midgard_vertex);
2014
2015 midgard_tiler.prefix = *tiler_prefix;
2016 midgard_tiler.postfix = *tiler_postfix;
2017 midgard_tiler.primitive_size = *primitive_size;
2018 tp = &midgard_tiler;
2019 tp_size = sizeof(midgard_tiler);
2020 }
2021
2022 if (wallpapering) {
2023 /* Inject in reverse order, with "predicted" job indices.
2024 * THIS IS A HACK XXX */
2025 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2026 batch->job_index + 2, tp, tp_size, true);
2027 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2028 vp, vp_size, true);
2029 return;
2030 }
2031
2032         /* If rasterizer discard is enabled, only submit the vertex job */
2033
2034 bool rasterizer_discard = ctx->rasterizer &&
2035 ctx->rasterizer->base.rasterizer_discard;
2036
2037 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2038 vp, vp_size, false);
2039
2040 if (rasterizer_discard)
2041 return;
2042
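        /* Otherwise, submit the tiler job, depending on the vertex job's output */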
2043 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2044 false);
2045 }
2046
2047 /* TODO: stop hardcoding this */
2048 mali_ptr
2049 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2050 {
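        /* Sample positions, seemingly in 1/256ths of a pixel, so (128, 128)
         * is the pixel centre; presumably only the single-sample entries are
         * meaningful here, hence the TODO above */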
2051 uint16_t locations[] = {
2052 128, 128,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 128, 128,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 0, 0,
2093 0, 0,
2094 0, 0,
2095 0, 0,
2096 0, 0,
2097 0, 0,
2098 0, 0,
2099 0, 0,
2100 };
2101
2102 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2103 }