panfrost: Fix background showing when using discard
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
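/* On Bifrost, vertex/tiler jobs reference a shared memory descriptor (backed
 * by a batch scratchpad sized from the batch's stack requirements) rather
 * than a framebuffer descriptor; see panfrost_vt_init below. */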
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
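/* On Midgard, the same postfix slot instead points at the framebuffer
 * descriptor (tagged MALI_MFBD on MFBD hardware), which we reserve here if
 * the batch doesn't have one yet. */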
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
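/* Fills out the draw-related fields of the vertex/tiler payloads: draw mode
 * and flags, the index buffer (with min/max bounds), offset/index counts and
 * the instancing encoding, where padded_count is factored as
 * (2k + 1) << shift and stored as instance_shift/instance_odd. */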
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
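/* Folds the rasterizer CSO into the fragment shader descriptor:
 * multisampling enables plus polygon offset (depth bias) units and factor.
 * With no rasterizer bound, MSAA is off and the depth bias is zeroed. */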
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
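/* Folds the depth/stencil/alpha CSO into the fragment shader descriptor,
 * falling back to stencil-test-off / depth-write-off defaults when no ZSA
 * state is bound. */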
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
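/* Folds the blend state into the fragment shader descriptor and, for MFBD
 * and Bifrost, into the per-render-target blend descriptors in 'rts'. Each
 * RT gets either a fixed-function equation/constant or a blend shader
 * pointer. */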
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. XXX: opt */
646
647 if (!(dev->quirks & IS_BIFROST)) {
648 for (unsigned c = 0; c < rt_count; ++c) {
649 if (blend[c].is_shader)
650 fragmeta->midgard1.work_count = 16;
651 }
652 }
653
654 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
655 * copied to the blend_meta appended (by convention), but this is the
656 * field actually read by the hardware. (Or maybe both are read...?).
657 * Specify the last RTi with a blend shader. */
658
659 fragmeta->blend.shader = 0;
660
661 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
662 if (!blend[rt].is_shader)
663 continue;
664
665 fragmeta->blend.shader = blend[rt].shader.gpu |
666 blend[rt].shader.first_tag;
667 break;
668 }
669
670 if (dev->quirks & MIDGARD_SFBD) {
671 /* On single render target (SFBD) platforms, the blend information is
672 * inside the shader meta itself. We additionally need to signal
673 * CAN_DISCARD for nontrivial blend modes (so we're able to read back
674 * the destination buffer) */
675
676 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
677 blend[0].is_shader);
678
679 if (!blend[0].is_shader) {
680 fragmeta->blend.equation = *blend[0].equation.equation;
681 fragmeta->blend.constant = blend[0].equation.constant;
682 }
683
684 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
685 !blend[0].no_blending);
686 return;
687 }
688
689 /* Additional blend descriptor tacked on for jobs using MFBD */
690
691 for (unsigned i = 0; i < rt_count; ++i) {
692 if (dev->quirks & IS_BIFROST) {
693 struct bifrost_blend_rt *brts = rts;
694 struct panfrost_shader_state *fs;
695 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
696
697 brts[i].flags = 0x200;
698 if (blend[i].is_shader) {
699 /* The blend shader's address needs to be at
700 * the same top 32 bit as the fragment shader.
701 * TODO: Ensure that's always the case.
702 */
703 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
704 (fs->bo->gpu & (0xffffffffull << 32)));
705 brts[i].shader = blend[i].shader.gpu;
706 brts[i].unk2 = 0x0;
707 } else {
708 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
709 const struct util_format_description *format_desc;
710 format_desc = util_format_description(format);
711
712 brts[i].equation = *blend[i].equation.equation;
713
714 /* TODO: this is a bit more complicated */
715 brts[i].constant = blend[i].equation.constant;
716
717 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
718 brts[i].unk2 = 0x19;
719
720 brts[i].shader_type = fs->blend_types[i];
721 }
722 } else {
723 struct midgard_blend_rt *mrts = rts;
724
725 mrts[i].flags = 0x200;
726
727 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
728 (ctx->pipe_framebuffer.cbufs[i]) &&
729 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
730
731 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
732 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
733 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
734 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
735
736 if (blend[i].is_shader) {
737 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
738 } else {
739 mrts[i].blend.equation = *blend[i].equation.equation;
740 mrts[i].blend.constant = blend[i].equation.constant;
741 }
742 }
743 }
744 }
745
746 static void
747 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
748 struct mali_shader_meta *fragmeta,
749 void *rts)
750 {
751 const struct panfrost_device *dev = pan_device(ctx->base.screen);
752 struct panfrost_shader_state *fs;
753
754 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
755
756 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
757 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
758 fragmeta->unknown2_4 = 0x4e0;
759
760 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
761 * is required (independent of 32-bit/64-bit descriptors), or why it's
762 * not used on later GPU revisions. Otherwise, all shader jobs fault on
763 * these earlier chips (perhaps this is a chicken bit of some kind).
764 * More investigation is needed. */
765
766 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
767
768 if (dev->quirks & IS_BIFROST) {
769 /* TODO */
770 } else {
771 /* Depending on whether it's legal to do so in the given shader, we try
772 * to enable early-z testing (or forward-pixel kill?) */
773
774 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
775 !fs->can_discard && !fs->writes_depth);
776
777 /* Add the writes Z/S flags if needed. */
778 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
779 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
780
781 /* Any time texturing is used, derivatives are implicitly calculated,
782 * so we need to enable helper invocations */
783
784 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
785 fs->helper_invocations);
786
787 /* CAN_DISCARD should be set if the fragment shader possibly contains a
788 * 'discard' instruction. It is likely this is related to optimizations
789 * related to forward-pixel kill, as per "Mali Performance 3: Is
790 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
791
792 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
793
794 bool depth_enabled = fs->writes_depth ||
795 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
796
797 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
798 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
799 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
800 }
801
802 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
803 panfrost_frag_meta_zsa_update(ctx, fragmeta);
804 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
805 }
806
807 void
808 panfrost_emit_shader_meta(struct panfrost_batch *batch,
809 enum pipe_shader_type st,
810 struct mali_vertex_tiler_postfix *postfix)
811 {
812 struct panfrost_context *ctx = batch->ctx;
813 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
814
815 if (!ss) {
816 postfix->shader = 0;
817 return;
818 }
819
820 struct mali_shader_meta meta;
821
822 panfrost_shader_meta_init(ctx, st, &meta);
823
824 /* Add the shader BO to the batch. */
825 panfrost_batch_add_bo(batch, ss->bo,
826 PAN_BO_ACCESS_PRIVATE |
827 PAN_BO_ACCESS_READ |
828 panfrost_bo_access_for_stage(st));
829
830 mali_ptr shader_ptr;
831
832 if (st == PIPE_SHADER_FRAGMENT) {
833 struct panfrost_device *dev = pan_device(ctx->base.screen);
834 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
835 size_t desc_size = sizeof(meta);
836 void *rts = NULL;
837 struct panfrost_transfer xfer;
838 unsigned rt_size;
839
840 if (dev->quirks & MIDGARD_SFBD)
841 rt_size = 0;
842 else if (dev->quirks & IS_BIFROST)
843 rt_size = sizeof(struct bifrost_blend_rt);
844 else
845 rt_size = sizeof(struct midgard_blend_rt);
846
847 desc_size += rt_size * rt_count;
848
849 if (rt_size)
850 rts = rzalloc_size(ctx, rt_size * rt_count);
851
852 panfrost_frag_shader_meta_init(ctx, &meta, rts);
853
854 xfer = panfrost_allocate_transient(batch, desc_size);
855
856 memcpy(xfer.cpu, &meta, sizeof(meta));
857 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
858
859 if (rt_size)
860 ralloc_free(rts);
861
862 shader_ptr = xfer.gpu;
863 } else {
864 shader_ptr = panfrost_upload_transient(batch, &meta,
865 sizeof(meta));
866 }
867
868 postfix->shader = shader_ptr;
869 }
870
871 static void
872 panfrost_mali_viewport_init(struct panfrost_context *ctx,
873 struct mali_viewport *mvp)
874 {
875 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
876
877 /* Clip bounds are encoded as floats. The viewport itself is encoded as
878 * (somewhat) asymmetric ints. */
879
880 const struct pipe_scissor_state *ss = &ctx->scissor;
881
882 memset(mvp, 0, sizeof(*mvp));
883
884 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
885 * each direction. Clipping to the viewport in theory should work, but
886 * in practice causes issues when we're not explicitly trying to
887 * scissor */
888
889 *mvp = (struct mali_viewport) {
890 .clip_minx = -INFINITY,
891 .clip_miny = -INFINITY,
892 .clip_maxx = INFINITY,
893 .clip_maxy = INFINITY,
894 };
895
896 /* Always scissor to the viewport by default. */
897 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
898 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
899
900 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
901 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
902
903 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
904 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
905
906 /* Apply the scissor test */
907
908 unsigned minx, miny, maxx, maxy;
909
910 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
911 minx = MAX2(ss->minx, vp_minx);
912 miny = MAX2(ss->miny, vp_miny);
913 maxx = MIN2(ss->maxx, vp_maxx);
914 maxy = MIN2(ss->maxy, vp_maxy);
915 } else {
916 minx = vp_minx;
917 miny = vp_miny;
918 maxx = vp_maxx;
919 maxy = vp_maxy;
920 }
921
922 /* Hardware needs the min/max to be strictly ordered, so flip if we
923 * need to. The viewport transformation in the vertex shader will
924 * handle the negatives if we don't */
925
926 if (miny > maxy) {
927 unsigned temp = miny;
928 miny = maxy;
929 maxy = temp;
930 }
931
932 if (minx > maxx) {
933 unsigned temp = minx;
934 minx = maxx;
935 maxx = temp;
936 }
937
938 if (minz > maxz) {
939 float temp = minz;
940 minz = maxz;
941 maxz = temp;
942 }
943
944 /* Clamp to the framebuffer size as a last check */
945
946 minx = MIN2(ctx->pipe_framebuffer.width, minx);
947 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
948
949 miny = MIN2(ctx->pipe_framebuffer.height, miny);
950 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
951
952 /* Upload */
953
954 mvp->viewport0[0] = minx;
955 mvp->viewport1[0] = MALI_POSITIVE(maxx);
956
957 mvp->viewport0[1] = miny;
958 mvp->viewport1[1] = MALI_POSITIVE(maxy);
959
960 mvp->clip_minz = minz;
961 mvp->clip_maxz = maxz;
962 }
963
964 void
965 panfrost_emit_viewport(struct panfrost_batch *batch,
966 struct mali_vertex_tiler_postfix *tiler_postfix)
967 {
968 struct panfrost_context *ctx = batch->ctx;
969 struct mali_viewport mvp;
970
971 panfrost_mali_viewport_init(batch->ctx, &mvp);
972
973 /* Update the job, unless we're doing wallpapering (whose lack of
974 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
975 * just... be faster :) */
976
977 if (!ctx->wallpaper_batch)
978 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
979 mvp.viewport0[1],
980 mvp.viewport1[0] + 1,
981 mvp.viewport1[1] + 1);
982
983 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
984 sizeof(mvp));
985 }
986
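/* Returns a GPU address for a constant buffer: resource-backed UBOs are
 * mapped directly (adding the BO to the batch), while user buffers are
 * uploaded to transient memory for the draw. */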
987 static mali_ptr
988 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
989 enum pipe_shader_type st,
990 struct panfrost_constant_buffer *buf,
991 unsigned index)
992 {
993 struct pipe_constant_buffer *cb = &buf->cb[index];
994 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
995
996 if (rsrc) {
997 panfrost_batch_add_bo(batch, rsrc->bo,
998 PAN_BO_ACCESS_SHARED |
999 PAN_BO_ACCESS_READ |
1000 panfrost_bo_access_for_stage(st));
1001
1002 /* Alignment guaranteed by
1003 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1004 return rsrc->bo->gpu + cb->buffer_offset;
1005 } else if (cb->user_buffer) {
1006 return panfrost_upload_transient(batch,
1007 cb->user_buffer +
1008 cb->buffer_offset,
1009 cb->buffer_size);
1010 } else {
1011 unreachable("No constant buffer");
1012 }
1013 }
1014
1015 struct sysval_uniform {
1016 union {
1017 float f[4];
1018 int32_t i[4];
1019 uint32_t u[4];
1020 uint64_t du[2];
1021 };
1022 };
1023
1024 static void
1025 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1026 struct sysval_uniform *uniform)
1027 {
1028 struct panfrost_context *ctx = batch->ctx;
1029 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1030
1031 uniform->f[0] = vp->scale[0];
1032 uniform->f[1] = vp->scale[1];
1033 uniform->f[2] = vp->scale[2];
1034 }
1035
1036 static void
1037 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1038 struct sysval_uniform *uniform)
1039 {
1040 struct panfrost_context *ctx = batch->ctx;
1041 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1042
1043 uniform->f[0] = vp->translate[0];
1044 uniform->f[1] = vp->translate[1];
1045 uniform->f[2] = vp->translate[2];
1046 }
1047
1048 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1049 enum pipe_shader_type st,
1050 unsigned int sysvalid,
1051 struct sysval_uniform *uniform)
1052 {
1053 struct panfrost_context *ctx = batch->ctx;
1054 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1055 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1056 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1057 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1058
1059 assert(dim);
1060 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1061
1062 if (dim > 1)
1063 uniform->i[1] = u_minify(tex->texture->height0,
1064 tex->u.tex.first_level);
1065
1066 if (dim > 2)
1067 uniform->i[2] = u_minify(tex->texture->depth0,
1068 tex->u.tex.first_level);
1069
1070 if (is_array)
1071 uniform->i[dim] = tex->texture->array_size;
1072 }
1073
1074 static void
1075 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1076 enum pipe_shader_type st,
1077 unsigned ssbo_id,
1078 struct sysval_uniform *uniform)
1079 {
1080 struct panfrost_context *ctx = batch->ctx;
1081
1082 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1083 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1084
1085 /* Compute address */
1086 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1087
1088 panfrost_batch_add_bo(batch, bo,
1089 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1090 panfrost_bo_access_for_stage(st));
1091
1092 /* Upload address and size as sysval */
1093 uniform->du[0] = bo->gpu + sb.buffer_offset;
1094 uniform->u[2] = sb.buffer_size;
1095 }
1096
1097 static void
1098 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1099 enum pipe_shader_type st,
1100 unsigned samp_idx,
1101 struct sysval_uniform *uniform)
1102 {
1103 struct panfrost_context *ctx = batch->ctx;
1104 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1105
1106 uniform->f[0] = sampl->min_lod;
1107 uniform->f[1] = sampl->max_lod;
1108 uniform->f[2] = sampl->lod_bias;
1109
1110 /* Even without any errata, Midgard represents "no mipmapping" as
1111 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1112 * panfrost_create_sampler_state which also explains our choice of
1113 * epsilon value (again to keep behaviour consistent) */
1114
1115 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1116 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1117 }
1118
1119 static void
1120 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1121 struct sysval_uniform *uniform)
1122 {
1123 struct panfrost_context *ctx = batch->ctx;
1124
1125 uniform->u[0] = ctx->compute_grid->grid[0];
1126 uniform->u[1] = ctx->compute_grid->grid[1];
1127 uniform->u[2] = ctx->compute_grid->grid[2];
1128 }
1129
1130 static void
1131 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1132 struct panfrost_shader_state *ss,
1133 enum pipe_shader_type st)
1134 {
1135 struct sysval_uniform *uniforms = (void *)buf;
1136
1137 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1138 int sysval = ss->sysval[i];
1139
1140 switch (PAN_SYSVAL_TYPE(sysval)) {
1141 case PAN_SYSVAL_VIEWPORT_SCALE:
1142 panfrost_upload_viewport_scale_sysval(batch,
1143 &uniforms[i]);
1144 break;
1145 case PAN_SYSVAL_VIEWPORT_OFFSET:
1146 panfrost_upload_viewport_offset_sysval(batch,
1147 &uniforms[i]);
1148 break;
1149 case PAN_SYSVAL_TEXTURE_SIZE:
1150 panfrost_upload_txs_sysval(batch, st,
1151 PAN_SYSVAL_ID(sysval),
1152 &uniforms[i]);
1153 break;
1154 case PAN_SYSVAL_SSBO:
1155 panfrost_upload_ssbo_sysval(batch, st,
1156 PAN_SYSVAL_ID(sysval),
1157 &uniforms[i]);
1158 break;
1159 case PAN_SYSVAL_NUM_WORK_GROUPS:
1160 panfrost_upload_num_work_groups_sysval(batch,
1161 &uniforms[i]);
1162 break;
1163 case PAN_SYSVAL_SAMPLER:
1164 panfrost_upload_sampler_sysval(batch, st,
1165 PAN_SYSVAL_ID(sysval),
1166 &uniforms[i]);
1167 break;
1168 default:
1169 assert(0);
1170 }
1171 }
1172 }
1173
1174 static const void *
1175 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1176 unsigned index)
1177 {
1178 struct pipe_constant_buffer *cb = &buf->cb[index];
1179 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1180
1181 if (rsrc)
1182 return rsrc->bo->cpu;
1183 else if (cb->user_buffer)
1184 return cb->user_buffer;
1185 else
1186 unreachable("No constant buffer");
1187 }
1188
1189 void
1190 panfrost_emit_const_buf(struct panfrost_batch *batch,
1191 enum pipe_shader_type stage,
1192 struct mali_vertex_tiler_postfix *postfix)
1193 {
1194 struct panfrost_context *ctx = batch->ctx;
1195 struct panfrost_shader_variants *all = ctx->shader[stage];
1196
1197 if (!all)
1198 return;
1199
1200 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1201
1202 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1203
1204 /* Uniforms are implicitly UBO #0 */
1205 bool has_uniforms = buf->enabled_mask & (1 << 0);
1206
1207 /* Allocate room for the sysval and the uniforms */
1208 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1209 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1210 size_t size = sys_size + uniform_size;
1211 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1212 size);
1213
1214 /* Upload sysvals requested by the shader */
1215 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1216
1217 /* Upload uniforms */
1218 if (has_uniforms && uniform_size) {
1219 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1220 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1221 }
1222
1223 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1224 * uploaded */
1225
1226 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1227 assert(ubo_count >= 1);
1228
1229 size_t sz = sizeof(uint64_t) * ubo_count;
1230 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1231 int uniform_count = ss->uniform_count;
1232
1233 /* Upload uniforms as a UBO */
1234 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1235
1236 /* The rest are honest-to-goodness UBOs */
1237
1238 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1239 size_t usz = buf->cb[ubo].buffer_size;
1240 bool enabled = buf->enabled_mask & (1 << ubo);
1241 bool empty = usz == 0;
1242
1243 if (!enabled || empty) {
1244 /* Stub out disabled UBOs to catch accesses */
1245 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1246 continue;
1247 }
1248
1249 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1250 buf, ubo);
1251
1252 unsigned bytes_per_field = 16;
1253 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1254 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1255 }
1256
1257 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1258 postfix->uniforms = transfer.gpu;
1259 postfix->uniform_buffers = ubufs;
1260
1261 buf->dirty_mask = 0;
1262 }
1263
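/* Emits the shared memory descriptor for compute jobs: per-workgroup storage
 * is the shader's declared shared size rounded up to a power of two (at
 * least 128 bytes), backed by a BO sized for the whole dispatch grid. */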
1264 void
1265 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1266 const struct pipe_grid_info *info,
1267 struct midgard_payload_vertex_tiler *vtp)
1268 {
1269 struct panfrost_context *ctx = batch->ctx;
1270 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1271 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1272 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1273 128));
1274 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1275 info->grid[2] * 4;
1276 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1277 shared_size,
1278 1);
1279
1280 struct mali_shared_memory shared = {
1281 .shared_memory = bo->gpu,
1282 .shared_workgroup_count =
1283 util_logbase2_ceil(info->grid[0]) +
1284 util_logbase2_ceil(info->grid[1]) +
1285 util_logbase2_ceil(info->grid[2]),
1286 .shared_unk1 = 0x2,
1287 .shared_shift = util_logbase2(single_size) - 1
1288 };
1289
1290 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1291 sizeof(shared));
1292 }
1293
1294 static mali_ptr
1295 panfrost_get_tex_desc(struct panfrost_batch *batch,
1296 enum pipe_shader_type st,
1297 struct panfrost_sampler_view *view)
1298 {
1299 if (!view)
1300 return (mali_ptr) 0;
1301
1302 struct pipe_sampler_view *pview = &view->base;
1303 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1304
1305 /* Add the BO to the job so it's retained until the job is done. */
1306
1307 panfrost_batch_add_bo(batch, rsrc->bo,
1308 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1309 panfrost_bo_access_for_stage(st));
1310
1311 panfrost_batch_add_bo(batch, view->midgard_bo,
1312 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1313 panfrost_bo_access_for_stage(st));
1314
1315 return view->midgard_bo->gpu;
1316 }
1317
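/* Uploads texture descriptors for a stage. Bifrost packs the descriptors
 * themselves into one contiguous transient allocation, while Midgard uploads
 * an array of pointers ("trampolines") to the per-texture descriptors. */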
1318 void
1319 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1320 enum pipe_shader_type stage,
1321 struct mali_vertex_tiler_postfix *postfix)
1322 {
1323 struct panfrost_context *ctx = batch->ctx;
1324 struct panfrost_device *device = pan_device(ctx->base.screen);
1325
1326 if (!ctx->sampler_view_count[stage])
1327 return;
1328
1329 if (device->quirks & IS_BIFROST) {
1330 struct bifrost_texture_descriptor *descriptors;
1331
1332 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1333 ctx->sampler_view_count[stage]);
1334
1335 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1336 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1337 struct pipe_sampler_view *pview = &view->base;
1338 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1339
1340 /* Add the BOs to the job so they are retained until the job is done. */
1341
1342 panfrost_batch_add_bo(batch, rsrc->bo,
1343 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1344 panfrost_bo_access_for_stage(stage));
1345
1346 panfrost_batch_add_bo(batch, view->bifrost_bo,
1347 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1348 panfrost_bo_access_for_stage(stage));
1349
1350 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1351 }
1352
1353 postfix->textures = panfrost_upload_transient(batch,
1354 descriptors,
1355 sizeof(struct bifrost_texture_descriptor) *
1356 ctx->sampler_view_count[stage]);
1357
1358 free(descriptors);
1359 } else {
1360 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1361
1362 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1363 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1364 ctx->sampler_views[stage][i]);
1365
1366 postfix->textures = panfrost_upload_transient(batch,
1367 trampolines,
1368 sizeof(uint64_t) *
1369 ctx->sampler_view_count[stage]);
1370 }
1371 }
1372
1373 void
1374 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1375 enum pipe_shader_type stage,
1376 struct mali_vertex_tiler_postfix *postfix)
1377 {
1378 struct panfrost_context *ctx = batch->ctx;
1379 struct panfrost_device *device = pan_device(ctx->base.screen);
1380
1381 if (!ctx->sampler_count[stage])
1382 return;
1383
1384 if (device->quirks & IS_BIFROST) {
1385 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1386 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1387 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1388 transfer_size);
1389 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1390
1391 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1392 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1393
1394 postfix->sampler_descriptor = transfer.gpu;
1395 } else {
1396 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1397 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1398 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1399 transfer_size);
1400 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1401
1402 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1403 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1404
1405 postfix->sampler_descriptor = transfer.gpu;
1406 }
1407 }
1408
1409 void
1410 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1411 struct mali_vertex_tiler_postfix *vertex_postfix)
1412 {
1413 struct panfrost_context *ctx = batch->ctx;
1414
1415 if (!ctx->vertex)
1416 return;
1417
1418 struct panfrost_vertex_state *so = ctx->vertex;
1419
1420 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1421 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1422 sizeof(*so->hw) *
1423 PAN_MAX_ATTRIBUTE);
1424 }
1425
1426 void
1427 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1428 struct mali_vertex_tiler_postfix *vertex_postfix)
1429 {
1430 struct panfrost_context *ctx = batch->ctx;
1431 struct panfrost_vertex_state *so = ctx->vertex;
1432
1433 /* Staged mali_attr, and index into them. i =/= k, depending on the
1434 * vertex buffer mask and instancing. Twice as much room is allocated,
1435 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1436 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1437 unsigned k = 0;
1438
1439 for (unsigned i = 0; i < so->num_elements; ++i) {
1440 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1441 * means duplicating some vertex buffers (who cares? aside from
1442 * maybe some caching implications but I somehow doubt that
1443 * matters) */
1444
1445 struct pipe_vertex_element *elem = &so->pipe[i];
1446 unsigned vbi = elem->vertex_buffer_index;
1447
1448 /* The exception to 1:1 mapping is that we can have multiple
1449 * entries (NPOT divisors), so we fix up anyway */
1450
1451 so->hw[i].index = k;
1452
1453 if (!(ctx->vb_mask & (1 << vbi)))
1454 continue;
1455
1456 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1457 struct panfrost_resource *rsrc;
1458
1459 rsrc = pan_resource(buf->buffer.resource);
1460 if (!rsrc)
1461 continue;
1462
1463 /* Align to 64 bytes by masking off the lower bits. This
1464 * will be adjusted back when we fixup the src_offset in
1465 * mali_attr_meta */
1466
1467 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1468 mali_ptr addr = raw_addr & ~63;
1469 unsigned chopped_addr = raw_addr - addr;
1470
1471 /* Add a dependency of the batch on the vertex buffer */
1472 panfrost_batch_add_bo(batch, rsrc->bo,
1473 PAN_BO_ACCESS_SHARED |
1474 PAN_BO_ACCESS_READ |
1475 PAN_BO_ACCESS_VERTEX_TILER);
1476
1477 /* Set common fields */
1478 attrs[k].elements = addr;
1479 attrs[k].stride = buf->stride;
1480
1481 /* Since we advanced the base pointer, we shrink the buffer
1482 * size */
1483 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1484
1485 /* We need to add the extra size we masked off (for
1486 * correctness) so the data doesn't get clamped away */
1487 attrs[k].size += chopped_addr;
1488
1489 /* For non-instancing make sure we initialize */
1490 attrs[k].shift = attrs[k].extra_flags = 0;
1491
1492 /* Instancing uses a dramatically different code path than
1493 * linear, so dispatch for the actual emission now that the
1494 * common code is finished */
1495
1496 unsigned divisor = elem->instance_divisor;
1497
1498 if (divisor && ctx->instance_count == 1) {
1499 /* Silly corner case where there's a divisor(=1) but
1500 * there's no legitimate instancing. So we want *every*
1501 * attribute to be the same. So set stride to zero so
1502 * we don't go anywhere. */
1503
1504 attrs[k].size = attrs[k].stride + chopped_addr;
1505 attrs[k].stride = 0;
1506 attrs[k++].elements |= MALI_ATTR_LINEAR;
1507 } else if (ctx->instance_count <= 1) {
1508 /* Normal, non-instanced attributes */
1509 attrs[k++].elements |= MALI_ATTR_LINEAR;
1510 } else {
1511 unsigned instance_shift = vertex_postfix->instance_shift;
1512 unsigned instance_odd = vertex_postfix->instance_odd;
1513
1514 k += panfrost_vertex_instanced(ctx->padded_count,
1515 instance_shift,
1516 instance_odd,
1517 divisor, &attrs[k]);
1518 }
1519 }
1520
1521 /* Add special gl_VertexID/gl_InstanceID buffers */
1522
1523 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1524 so->hw[PAN_VERTEX_ID].index = k++;
1525 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1526 so->hw[PAN_INSTANCE_ID].index = k++;
1527
1528 /* Upload whatever we emitted and go */
1529
1530 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1531 k * sizeof(*attrs));
1532 }
1533
1534 static mali_ptr
1535 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1536 unsigned stride, unsigned count)
1537 {
1538 /* Fill out the descriptor */
1539 slot->stride = stride;
1540 slot->size = stride * count;
1541 slot->shift = slot->extra_flags = 0;
1542
1543 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1544 slot->size);
1545
1546 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1547
1548 return transfer.gpu;
1549 }
1550
1551 static void
1552 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1553 unsigned stride, unsigned offset, unsigned count,
1554 struct pipe_stream_output_target *target)
1555 {
1556 /* Fill out the descriptor */
1557 slot->stride = stride * 4;
1558 slot->shift = slot->extra_flags = 0;
1559
1560 unsigned max_size = target->buffer_size;
1561 unsigned expected_size = slot->stride * count;
1562
1563 slot->size = MIN2(max_size, expected_size);
1564
1565 /* Grab the BO and bind it to the batch */
1566 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1567
1568 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1569 * the perspective of the TILER and FRAGMENT.
1570 */
1571 panfrost_batch_add_bo(batch, bo,
1572 PAN_BO_ACCESS_SHARED |
1573 PAN_BO_ACCESS_RW |
1574 PAN_BO_ACCESS_VERTEX_TILER |
1575 PAN_BO_ACCESS_FRAGMENT);
1576
1577 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1578 slot->elements = addr;
1579 }
1580
1581 /* Given a shader and buffer indices, link varying metadata together */
1582
1583 static bool
1584 is_special_varying(gl_varying_slot loc)
1585 {
1586 switch (loc) {
1587 case VARYING_SLOT_POS:
1588 case VARYING_SLOT_PSIZ:
1589 case VARYING_SLOT_PNTC:
1590 case VARYING_SLOT_FACE:
1591 return true;
1592 default:
1593 return false;
1594 }
1595 }
1596
1597 static void
1598 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1599 signed general, signed gl_Position,
1600 signed gl_PointSize, signed gl_PointCoord,
1601 signed gl_FrontFacing)
1602 {
1603 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1604
1605 for (unsigned i = 0; i < ss->varying_count; ++i) {
1606 gl_varying_slot location = ss->varyings_loc[i];
1607 int index = -1;
1608
1609 switch (location) {
1610 case VARYING_SLOT_POS:
1611 index = gl_Position;
1612 break;
1613 case VARYING_SLOT_PSIZ:
1614 index = gl_PointSize;
1615 break;
1616 case VARYING_SLOT_PNTC:
1617 index = gl_PointCoord;
1618 break;
1619 case VARYING_SLOT_FACE:
1620 index = gl_FrontFacing;
1621 break;
1622 default:
1623 index = general;
1624 break;
1625 }
1626
1627 assert(index >= 0);
1628 out[i].index = index;
1629 }
1630 }
1631
1632 static bool
1633 has_point_coord(unsigned mask, gl_varying_slot loc)
1634 {
1635 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1636 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1637 else if (loc == VARYING_SLOT_PNTC)
1638 return (mask & (1 << 8));
1639 else
1640 return false;
1641 }
1642
1643 /* Helpers for manipulating stream out information so we can pack varyings
1644 * accordingly. Compute the src_offset for a given captured varying */
1645
1646 static struct pipe_stream_output *
1647 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1648 {
1649 for (unsigned i = 0; i < info->num_outputs; ++i) {
1650 if (info->output[i].register_index == loc)
1651 return &info->output[i];
1652 }
1653
1654 unreachable("Varying not captured");
1655 }
1656
1657 /* TODO: Integers */
1658 static enum mali_format
1659 pan_xfb_format(unsigned nr_components)
1660 {
1661 switch (nr_components) {
1662 case 1: return MALI_R32F;
1663 case 2: return MALI_RG32F;
1664 case 3: return MALI_RGB32F;
1665 case 4: return MALI_RGBA32F;
1666 default: unreachable("Invalid format");
1667 }
1668 }
1669
1670 void
1671 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1672 unsigned vertex_count,
1673 struct mali_vertex_tiler_postfix *vertex_postfix,
1674 struct mali_vertex_tiler_postfix *tiler_postfix,
1675 union midgard_primitive_size *primitive_size)
1676 {
1677 /* Load the shaders */
1678 struct panfrost_context *ctx = batch->ctx;
1679 struct panfrost_shader_state *vs, *fs;
1680 unsigned int num_gen_varyings = 0;
1681 size_t vs_size, fs_size;
1682
1683 /* Allocate the varying descriptor */
1684
1685 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1686 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1687 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1688 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1689
1690 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1691 vs_size +
1692 fs_size);
1693
1694 struct pipe_stream_output_info *so = &vs->stream_output;
1695
1696 /* Check if this varying is linked by us. This is the case for
1697 * general-purpose, non-captured varyings. If it is, link it. If it's
1698 * not, use the provided stream out information to determine the
1699 * offset, since it was already linked for us. */
1700
1701 for (unsigned i = 0; i < vs->varying_count; i++) {
1702 gl_varying_slot loc = vs->varyings_loc[i];
1703
1704 bool special = is_special_varying(loc);
1705 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1706
1707 if (captured) {
1708 struct pipe_stream_output *o = pan_get_so(so, loc);
1709
1710 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1711 vs->varyings[i].src_offset = dst_offset;
1712 } else if (!special) {
1713 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1714 }
1715 }
1716
1717 /* Conversely, we need to set src_offset for the captured varyings.
1718 * Here, the layout is defined by the stream out info, not us */
1719
1720 /* Link up with fragment varyings */
1721 bool reads_point_coord = fs->reads_point_coord;
1722
1723 for (unsigned i = 0; i < fs->varying_count; i++) {
1724 gl_varying_slot loc = fs->varyings_loc[i];
1725 unsigned src_offset;
1726 signed vs_idx = -1;
1727
1728 /* Link up */
1729 for (unsigned j = 0; j < vs->varying_count; ++j) {
1730 if (vs->varyings_loc[j] == loc) {
1731 vs_idx = j;
1732 break;
1733 }
1734 }
1735
1736 /* Either assign or reuse */
1737 if (vs_idx >= 0)
1738 src_offset = vs->varyings[vs_idx].src_offset;
1739 else
1740 src_offset = 16 * (num_gen_varyings++);
1741
1742 fs->varyings[i].src_offset = src_offset;
1743
1744 if (has_point_coord(fs->point_sprite_mask, loc))
1745 reads_point_coord = true;
1746 }
1747
1748 memcpy(trans.cpu, vs->varyings, vs_size);
1749 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1750
1751 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1752
1753 /* Figure out how many streamout buffers could be bound */
1754 unsigned so_count = ctx->streamout.num_targets;
1755 for (unsigned i = 0; i < vs->varying_count; i++) {
1756 gl_varying_slot loc = vs->varyings_loc[i];
1757
1758 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1759 if (!captured) continue;
1760
1761 struct pipe_stream_output *o = pan_get_so(so, loc);
1762 so_count = MAX2(so_count, o->output_buffer + 1);
1763 }
1764
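/* Lay out the varying buffer table: one record per streamout buffer, then the
 * general varying buffer, then a dedicated record for gl_Position, followed by
 * the optional point size / point coord / front facing / frag coord specials */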
1765 signed idx = so_count;
1766 signed general = idx++;
1767 signed gl_Position = idx++;
1768 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1769 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1770 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1771 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1772
1773 /* Emit the stream out buffers */
1774
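/* out_count is the number of vertices the draw emits into the streamout
 * buffers, accounting for the primitive type (e.g. a 5-vertex triangle strip
 * stores 3 primitives * 3 vertices = 9 outputs) */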
1775 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1776 ctx->vertex_count);
1777
1778 for (unsigned i = 0; i < so_count; ++i) {
1779 if (i < ctx->streamout.num_targets) {
1780 panfrost_emit_streamout(batch, &varyings[i],
1781 so->stride[i],
1782 ctx->streamout.offsets[i],
1783 out_count,
1784 ctx->streamout.targets[i]);
1785 } else {
1786 /* Emit a dummy buffer */
1787 panfrost_emit_varyings(batch, &varyings[i],
1788 so->stride[i] * 4,
1789 out_count);
1790
1791 /* Clear the attribute type */
1792 varyings[i].elements &= ~0xF;
1793 }
1794 }
1795
1796 panfrost_emit_varyings(batch, &varyings[general],
1797 num_gen_varyings * 16,
1798 vertex_count);
1799
1800 mali_ptr varyings_p;
1801
1802 /* fp32 vec4 gl_Position */
1803 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1804 sizeof(float) * 4, vertex_count);
1805 tiler_postfix->position_varying = varyings_p;
1806
1807
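/* gl_PointSize is presumably one 16-bit float per vertex (hence the 2-byte
 * stride below), consumed by the tiler through primitive_size->pointer */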
1808 if (panfrost_writes_point_size(ctx)) {
1809 varyings_p = panfrost_emit_varyings(batch,
1810 &varyings[gl_PointSize],
1811 2, vertex_count);
1812 primitive_size->pointer = varyings_p;
1813 }
1814
1815 if (reads_point_coord)
1816 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1817
1818 if (fs->reads_face)
1819 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1820
1821 if (fs->reads_frag_coord)
1822 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1823
1824 struct panfrost_device *device = pan_device(ctx->base.screen);
1825 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
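/* Point sprites are not wired up for Bifrost at this point, hence the assert
 * above: reads_point_coord must not be set on Bifrost */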
1826
1827 /* Let's go ahead and link varying meta to the buffer in question, now
1828 * that that information is available. VARYING_SLOT_POS is mapped to
1829 * gl_FragCoord for fragment shaders but to gl_Position for vertex
1830 * shaders */
1831
1832 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1833 gl_PointSize, gl_PointCoord,
1834 gl_FrontFacing);
1835
1836 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1837 gl_FragCoord, gl_PointSize,
1838 gl_PointCoord, gl_FrontFacing);
1839
1840 /* Patch the records of captured varyings to point at their streamout buffers */
1841
1842 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1843 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1844
1845 for (unsigned i = 0; i < vs->varying_count; i++) {
1846 gl_varying_slot loc = vs->varyings_loc[i];
1847
1848 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1849 if (!captured)
1850 continue;
1851
1852 struct pipe_stream_output *o = pan_get_so(so, loc);
1853 ovs[i].index = o->output_buffer;
1854
1855 /* Set the type appropriately. TODO: Integer varyings XXX */
1856 assert(o->stream == 0);
1857 ovs[i].format = pan_xfb_format(o->num_components);
1858
1859 if (device->quirks & HAS_SWIZZLES)
1860 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1861 else
1862 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1863
1864 /* Link to the fragment */
1865 signed fs_idx = -1;
1866
1867 /* Link up */
1868 for (unsigned j = 0; j < fs->varying_count; ++j) {
1869 if (fs->varyings_loc[j] == loc) {
1870 fs_idx = j;
1871 break;
1872 }
1873 }
1874
1875 if (fs_idx >= 0) {
1876 ofs[fs_idx].index = ovs[i].index;
1877 ofs[fs_idx].format = ovs[i].format;
1878 ofs[fs_idx].swizzle = ovs[i].swizzle;
1879 }
1880 }
1881
1882 /* Replace point sprite */
1883 for (unsigned i = 0; i < fs->varying_count; i++) {
1884 /* If we have a point sprite replacement, handle that here. We
1885 * have to translate the location first. TODO: flip y in the shader
1886 * instead; we're already keying the shader, this is just a time crunch */
1887
1888 if (has_point_coord(fs->point_sprite_mask,
1889 fs->varyings_loc[i])) {
1890 ofs[i].index = gl_PointCoord;
1891
1892 /* Swizzle out the z/w to 0/1 */
1893 ofs[i].format = MALI_RG16F;
1894 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1895 }
1896 }
1897
1898 /* Fix up unaligned addresses */
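/* The low bits of each record encode the attribute type, so buffer addresses
 * must be 64-byte aligned, while streamout offsets are only dword-aligned.
 * When a captured buffer lands off-alignment, round its base down, mark it
 * linear, and push the residue into the src_offset of every varying reading
 * from it (e.g. a base ending in 0x30 becomes 0x00 with src_offset += 0x30) */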
1899 for (unsigned i = 0; i < so_count; ++i) {
1900 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1901 continue;
1902
1903 unsigned align = (varyings[i].elements & 63);
1904
1905 /* While we're at it, the SO buffers are linear */
1906
1907 if (!align) {
1908 varyings[i].elements |= MALI_ATTR_LINEAR;
1909 continue;
1910 }
1911
1912 /* We need to adjust alignment */
1913 varyings[i].elements &= ~63;
1914 varyings[i].elements |= MALI_ATTR_LINEAR;
1915 varyings[i].size += align;
1916
1917 for (unsigned v = 0; v < vs->varying_count; ++v) {
1918 if (ovs[v].index != i)
1919 continue;
1920
1921 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1922 }
1923
1924 for (unsigned f = 0; f < fs->varying_count; ++f) {
1925 if (ofs[f].index != i)
1926 continue;
1927
1928 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1929 }
1930 }
1931
1932 varyings_p = panfrost_upload_transient(batch, varyings,
1933 idx * sizeof(*varyings));
1934 vertex_postfix->varyings = varyings_p;
1935 tiler_postfix->varyings = varyings_p;
1936
1937 vertex_postfix->varying_meta = trans.gpu;
1938 tiler_postfix->varying_meta = trans.gpu + vs_size;
1939 }
1940
1941 void
1942 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1943 struct mali_vertex_tiler_prefix *vertex_prefix,
1944 struct mali_vertex_tiler_postfix *vertex_postfix,
1945 struct mali_vertex_tiler_prefix *tiler_prefix,
1946 struct mali_vertex_tiler_postfix *tiler_postfix,
1947 union midgard_primitive_size *primitive_size)
1948 {
1949 struct panfrost_context *ctx = batch->ctx;
1950 struct panfrost_device *device = pan_device(ctx->base.screen);
1951 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1952 struct bifrost_payload_vertex bifrost_vertex = {0,};
1953 struct bifrost_payload_tiler bifrost_tiler = {0,};
1954 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1955 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1956 void *vp, *tp;
1957 size_t vp_size, tp_size;
1958
1959 if (device->quirks & IS_BIFROST) {
1960 bifrost_vertex.prefix = *vertex_prefix;
1961 bifrost_vertex.postfix = *vertex_postfix;
1962 vp = &bifrost_vertex;
1963 vp_size = sizeof(bifrost_vertex);
1964
1965 bifrost_tiler.prefix = *tiler_prefix;
1966 bifrost_tiler.tiler.primitive_size = *primitive_size;
1967 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1968 bifrost_tiler.postfix = *tiler_postfix;
1969 tp = &bifrost_tiler;
1970 tp_size = sizeof(bifrost_tiler);
1971 } else {
1972 midgard_vertex.prefix = *vertex_prefix;
1973 midgard_vertex.postfix = *vertex_postfix;
1974 vp = &midgard_vertex;
1975 vp_size = sizeof(midgard_vertex);
1976
1977 midgard_tiler.prefix = *tiler_prefix;
1978 midgard_tiler.postfix = *tiler_postfix;
1979 midgard_tiler.primitive_size = *primitive_size;
1980 tp = &midgard_tiler;
1981 tp_size = sizeof(midgard_tiler);
1982 }
1983
1984 if (wallpapering) {
1985 /* Inject in reverse order, with "predicted" job indices.
1986 * THIS IS A HACK XXX */
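/* The tiler job is pushed first but must depend on the vertex job pushed
 * after it; batch->job_index + 2 is presumably the index that vertex job
 * will end up with, hence "predicted" */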
1987 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1988 batch->job_index + 2, tp, tp_size, true);
1989 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1990 vp, vp_size, true);
1991 return;
1992 }
1993
1994 /* If rasterizer discard is enabled, only submit the vertex job */
1995
1996 bool rasterizer_discard = ctx->rasterizer &&
1997 ctx->rasterizer->base.rasterizer_discard;
1998
1999 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2000 vp, vp_size, false);
2001
2002 if (rasterizer_discard)
2003 return;
2004
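/* Otherwise, the tiler job consumes the vertex job's output, so the vertex
 * job's index is passed as its dependency */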
2005 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2006 false);
2007 }
2008
2009 /* TODO: stop hardcoding this */
2010 mali_ptr
2011 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2012 {
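/* Presumably fixed-point positions in 1/256ths of a pixel, so (128, 128) is
 * the pixel centre; only the single-sample entry looks meaningful here, hence
 * the TODO above */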
2013 uint16_t locations[] = {
2014 128, 128,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 128, 128,
2047 0, 0,
2048 0, 0,
2049 0, 0,
2050 0, 0,
2051 0, 0,
2052 0, 0,
2053 0, 0,
2054 0, 0,
2055 0, 0,
2056 0, 0,
2057 0, 0,
2058 0, 0,
2059 0, 0,
2060 0, 0,
2061 0, 0,
2062 };
2063
2064 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2065 }