panfrost: Add Bifrost texture trampoline BO to batch
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
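/* A hedged note: the padded count returned above is of the form (2k + 1) << shift,
 * so the shift/odd split computed below presumably lets the hardware divide by the
 * padded count cheaply when deriving instance IDs. */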
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. XXX: opt */
646
647 for (unsigned c = 0; c < rt_count; ++c) {
648 if (blend[c].is_shader)
649 fragmeta->midgard1.work_count = 16;
650 }
651
652 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
653 * copied to the blend_meta appended (by convention), but this is the
654 * field actually read by the hardware. (Or maybe both are read...?).
655 * Specify the last RTi with a blend shader. */
656
657 fragmeta->blend.shader = 0;
658
659 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
660 if (!blend[rt].is_shader)
661 continue;
662
663 fragmeta->blend.shader = blend[rt].shader.gpu |
664 blend[rt].shader.first_tag;
665 break;
666 }
667
668 if (dev->quirks & MIDGARD_SFBD) {
669 /* On platforms with only a single render target (SFBD), the blend
670 * information lives inside the shader meta itself. We additionally
671 * need to signal CAN_DISCARD for nontrivial blend modes (so
672 * we're able to read back the destination buffer) */
673
674 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
675 blend[0].is_shader);
676
677 if (!blend[0].is_shader) {
678 fragmeta->blend.equation = *blend[0].equation.equation;
679 fragmeta->blend.constant = blend[0].equation.constant;
680 }
681
682 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
683 !blend[0].no_blending);
684 return;
685 }
686
687 /* Additional blend descriptor tacked on for jobs using MFBD */
688
689 for (unsigned i = 0; i < rt_count; ++i) {
690 if (dev->quirks & IS_BIFROST) {
691 struct bifrost_blend_rt *brts = rts;
692 struct panfrost_shader_state *fs;
693 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
694
695 brts[i].flags = 0x200;
696 if (blend[i].is_shader) {
697 /* The blend shader's address needs to have
698 * the same top 32 bits as the fragment shader.
699 * TODO: Ensure that's always the case.
700 */
701 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
702 (fs->bo->gpu & (0xffffffffull << 32)));
703 brts[i].shader = blend[i].shader.gpu;
704 brts[i].unk2 = 0x0;
705 } else {
706 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
707 const struct util_format_description *format_desc;
708 format_desc = util_format_description(format);
709
710 brts[i].equation = *blend[i].equation.equation;
711
712 /* TODO: this is a bit more complicated */
713 brts[i].constant = blend[i].equation.constant;
714
715 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
716 brts[i].unk2 = 0x19;
717
718 brts[i].shader_type = fs->blend_types[i];
719 }
720 } else {
721 struct midgard_blend_rt *mrts = rts;
722
723 mrts[i].flags = 0x200;
724
725 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
726 (ctx->pipe_framebuffer.cbufs[i]) &&
727 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
728
729 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
730 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
731 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
732 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
733
734 if (blend[i].is_shader) {
735 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
736 } else {
737 mrts[i].blend.equation = *blend[i].equation.equation;
738 mrts[i].blend.constant = blend[i].equation.constant;
739 }
740 }
741 }
742 }
743
744 static void
745 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
746 struct mali_shader_meta *fragmeta,
747 void *rts)
748 {
749 const struct panfrost_device *dev = pan_device(ctx->base.screen);
750 struct panfrost_shader_state *fs;
751
752 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
753
754 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
755 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
756 fragmeta->unknown2_4 = 0x4e0;
757
758 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
759 * is required (independent of 32-bit/64-bit descriptors), or why it's
760 * not used on later GPU revisions. Otherwise, all shader jobs fault on
761 * these earlier chips (perhaps this is a chicken bit of some kind).
762 * More investigation is needed. */
763
764 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
765
766 /* Depending on whether it's legal to do so in the given shader, we try to
767 * enable early-z testing (or forward-pixel kill?) */
768
769 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
770 !fs->can_discard && !fs->writes_depth);
771
772 /* Add the writes Z/S flags if needed. */
773 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
774 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
775
776 /* Any time texturing is used, derivatives are implicitly calculated,
777 * so we need to enable helper invocations */
778
779 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
780 fs->helper_invocations);
781
782 /* CAN_DISCARD should be set if the fragment shader possibly contains a
783 * 'discard' instruction. It is likely related to optimizations
784 * around forward-pixel kill, as per "Mali Performance 3: Is
785 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
786
787 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
788 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
789
790 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
791 panfrost_frag_meta_zsa_update(ctx, fragmeta);
792 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
793 }
794
795 void
796 panfrost_emit_shader_meta(struct panfrost_batch *batch,
797 enum pipe_shader_type st,
798 struct mali_vertex_tiler_postfix *postfix)
799 {
800 struct panfrost_context *ctx = batch->ctx;
801 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
802
803 if (!ss) {
804 postfix->shader = 0;
805 return;
806 }
807
808 struct mali_shader_meta meta;
809
810 panfrost_shader_meta_init(ctx, st, &meta);
811
812 /* Add the shader BO to the batch. */
813 panfrost_batch_add_bo(batch, ss->bo,
814 PAN_BO_ACCESS_PRIVATE |
815 PAN_BO_ACCESS_READ |
816 panfrost_bo_access_for_stage(st));
817
818 mali_ptr shader_ptr;
819
820 if (st == PIPE_SHADER_FRAGMENT) {
821 struct panfrost_device *dev = pan_device(ctx->base.screen);
822 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
823 size_t desc_size = sizeof(meta);
824 void *rts = NULL;
825 struct panfrost_transfer xfer;
826 unsigned rt_size;
827
828 if (dev->quirks & MIDGARD_SFBD)
829 rt_size = 0;
830 else if (dev->quirks & IS_BIFROST)
831 rt_size = sizeof(struct bifrost_blend_rt);
832 else
833 rt_size = sizeof(struct midgard_blend_rt);
834
835 desc_size += rt_size * rt_count;
836
837 if (rt_size)
838 rts = rzalloc_size(ctx, rt_size * rt_count);
839
840 panfrost_frag_shader_meta_init(ctx, &meta, rts);
841
842 xfer = panfrost_allocate_transient(batch, desc_size);
843
844 memcpy(xfer.cpu, &meta, sizeof(meta));
845 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
846
847 if (rt_size)
848 ralloc_free(rts);
849
850 shader_ptr = xfer.gpu;
851 } else {
852 shader_ptr = panfrost_upload_transient(batch, &meta,
853 sizeof(meta));
854 }
855
856 postfix->shader = shader_ptr;
857 }
858
859 static void
860 panfrost_mali_viewport_init(struct panfrost_context *ctx,
861 struct mali_viewport *mvp)
862 {
863 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
864
865 /* Clip bounds are encoded as floats. The viewport itself is encoded as
866 * (somewhat) asymmetric ints. */
867
868 const struct pipe_scissor_state *ss = &ctx->scissor;
869
870 memset(mvp, 0, sizeof(*mvp));
871
872 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
873 * each direction. Clipping to the viewport in theory should work, but
874 * in practice causes issues when we're not explicitly trying to
875 * scissor */
876
877 *mvp = (struct mali_viewport) {
878 .clip_minx = -INFINITY,
879 .clip_miny = -INFINITY,
880 .clip_maxx = INFINITY,
881 .clip_maxy = INFINITY,
882 };
883
884 /* Always scissor to the viewport by default. */
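/* (The viewport transform is screen = translate + scale * ndc, so the
 * screen-space extents are translate +/- |scale|.) */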
885 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
886 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
887
888 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
889 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
890
891 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
892 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
893
894 /* Apply the scissor test */
895
896 unsigned minx, miny, maxx, maxy;
897
898 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
899 minx = MAX2(ss->minx, vp_minx);
900 miny = MAX2(ss->miny, vp_miny);
901 maxx = MIN2(ss->maxx, vp_maxx);
902 maxy = MIN2(ss->maxy, vp_maxy);
903 } else {
904 minx = vp_minx;
905 miny = vp_miny;
906 maxx = vp_maxx;
907 maxy = vp_maxy;
908 }
909
910 /* Hardware needs the min/max to be strictly ordered, so flip if we
911 * need to. The viewport transformation in the vertex shader will
912 * handle the negatives if we don't */
913
914 if (miny > maxy) {
915 unsigned temp = miny;
916 miny = maxy;
917 maxy = temp;
918 }
919
920 if (minx > maxx) {
921 unsigned temp = minx;
922 minx = maxx;
923 maxx = temp;
924 }
925
926 if (minz > maxz) {
927 float temp = minz;
928 minz = maxz;
929 maxz = temp;
930 }
931
932 /* Clamp to the framebuffer size as a last check */
933
934 minx = MIN2(ctx->pipe_framebuffer.width, minx);
935 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
936
937 miny = MIN2(ctx->pipe_framebuffer.height, miny);
938 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
939
940 /* Upload */
941
942 mvp->viewport0[0] = minx;
943 mvp->viewport1[0] = MALI_POSITIVE(maxx);
944
945 mvp->viewport0[1] = miny;
946 mvp->viewport1[1] = MALI_POSITIVE(maxy);
947
948 mvp->clip_minz = minz;
949 mvp->clip_maxz = maxz;
950 }
951
952 void
953 panfrost_emit_viewport(struct panfrost_batch *batch,
954 struct mali_vertex_tiler_postfix *tiler_postfix)
955 {
956 struct panfrost_context *ctx = batch->ctx;
957 struct mali_viewport mvp;
958
959 panfrost_mali_viewport_init(batch->ctx, &mvp);
960
961 /* Update the job, unless we're doing wallpapering (whose lack of
962 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
963 * just... be faster :) */
964
965 if (!ctx->wallpaper_batch)
966 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
967 mvp.viewport0[1],
968 mvp.viewport1[0] + 1,
969 mvp.viewport1[1] + 1);
970
971 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
972 sizeof(mvp));
973 }
974
975 static mali_ptr
976 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
977 enum pipe_shader_type st,
978 struct panfrost_constant_buffer *buf,
979 unsigned index)
980 {
981 struct pipe_constant_buffer *cb = &buf->cb[index];
982 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
983
984 if (rsrc) {
985 panfrost_batch_add_bo(batch, rsrc->bo,
986 PAN_BO_ACCESS_SHARED |
987 PAN_BO_ACCESS_READ |
988 panfrost_bo_access_for_stage(st));
989
990 /* Alignment guaranteed by
991 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
992 return rsrc->bo->gpu + cb->buffer_offset;
993 } else if (cb->user_buffer) {
994 return panfrost_upload_transient(batch,
995 cb->user_buffer +
996 cb->buffer_offset,
997 cb->buffer_size);
998 } else {
999 unreachable("No constant buffer");
1000 }
1001 }
1002
1003 struct sysval_uniform {
1004 union {
1005 float f[4];
1006 int32_t i[4];
1007 uint32_t u[4];
1008 uint64_t du[2];
1009 };
1010 };
1011
1012 static void
1013 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1014 struct sysval_uniform *uniform)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1018
1019 uniform->f[0] = vp->scale[0];
1020 uniform->f[1] = vp->scale[1];
1021 uniform->f[2] = vp->scale[2];
1022 }
1023
1024 static void
1025 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1026 struct sysval_uniform *uniform)
1027 {
1028 struct panfrost_context *ctx = batch->ctx;
1029 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1030
1031 uniform->f[0] = vp->translate[0];
1032 uniform->f[1] = vp->translate[1];
1033 uniform->f[2] = vp->translate[2];
1034 }
1035
1036 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1037 enum pipe_shader_type st,
1038 unsigned int sysvalid,
1039 struct sysval_uniform *uniform)
1040 {
1041 struct panfrost_context *ctx = batch->ctx;
1042 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1043 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1044 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1045 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1046
1047 assert(dim);
1048 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1049
1050 if (dim > 1)
1051 uniform->i[1] = u_minify(tex->texture->height0,
1052 tex->u.tex.first_level);
1053
1054 if (dim > 2)
1055 uniform->i[2] = u_minify(tex->texture->depth0,
1056 tex->u.tex.first_level);
1057
1058 if (is_array)
1059 uniform->i[dim] = tex->texture->array_size;
1060 }
1061
1062 static void
1063 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1064 enum pipe_shader_type st,
1065 unsigned ssbo_id,
1066 struct sysval_uniform *uniform)
1067 {
1068 struct panfrost_context *ctx = batch->ctx;
1069
1070 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1071 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1072
1073 /* Compute address */
1074 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1075
1076 panfrost_batch_add_bo(batch, bo,
1077 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1078 panfrost_bo_access_for_stage(st));
1079
1080 /* Upload address and size as sysval */
1081 uniform->du[0] = bo->gpu + sb.buffer_offset;
1082 uniform->u[2] = sb.buffer_size;
1083 }
1084
1085 static void
1086 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1087 enum pipe_shader_type st,
1088 unsigned samp_idx,
1089 struct sysval_uniform *uniform)
1090 {
1091 struct panfrost_context *ctx = batch->ctx;
1092 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1093
1094 uniform->f[0] = sampl->min_lod;
1095 uniform->f[1] = sampl->max_lod;
1096 uniform->f[2] = sampl->lod_bias;
1097
1098 /* Even without any errata, Midgard represents "no mipmapping" as
1099 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1100 * panfrost_create_sampler_state which also explains our choice of
1101 * epsilon value (again to keep behaviour consistent) */
1102
1103 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1104 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1105 }
1106
1107 static void
1108 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1109 struct sysval_uniform *uniform)
1110 {
1111 struct panfrost_context *ctx = batch->ctx;
1112
1113 uniform->u[0] = ctx->compute_grid->grid[0];
1114 uniform->u[1] = ctx->compute_grid->grid[1];
1115 uniform->u[2] = ctx->compute_grid->grid[2];
1116 }
1117
1118 static void
1119 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1120 struct panfrost_shader_state *ss,
1121 enum pipe_shader_type st)
1122 {
1123 struct sysval_uniform *uniforms = (void *)buf;
1124
1125 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1126 int sysval = ss->sysval[i];
1127
1128 switch (PAN_SYSVAL_TYPE(sysval)) {
1129 case PAN_SYSVAL_VIEWPORT_SCALE:
1130 panfrost_upload_viewport_scale_sysval(batch,
1131 &uniforms[i]);
1132 break;
1133 case PAN_SYSVAL_VIEWPORT_OFFSET:
1134 panfrost_upload_viewport_offset_sysval(batch,
1135 &uniforms[i]);
1136 break;
1137 case PAN_SYSVAL_TEXTURE_SIZE:
1138 panfrost_upload_txs_sysval(batch, st,
1139 PAN_SYSVAL_ID(sysval),
1140 &uniforms[i]);
1141 break;
1142 case PAN_SYSVAL_SSBO:
1143 panfrost_upload_ssbo_sysval(batch, st,
1144 PAN_SYSVAL_ID(sysval),
1145 &uniforms[i]);
1146 break;
1147 case PAN_SYSVAL_NUM_WORK_GROUPS:
1148 panfrost_upload_num_work_groups_sysval(batch,
1149 &uniforms[i]);
1150 break;
1151 case PAN_SYSVAL_SAMPLER:
1152 panfrost_upload_sampler_sysval(batch, st,
1153 PAN_SYSVAL_ID(sysval),
1154 &uniforms[i]);
1155 break;
1156 default:
1157 assert(0);
1158 }
1159 }
1160 }
1161
1162 static const void *
1163 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1164 unsigned index)
1165 {
1166 struct pipe_constant_buffer *cb = &buf->cb[index];
1167 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1168
1169 if (rsrc)
1170 return rsrc->bo->cpu;
1171 else if (cb->user_buffer)
1172 return cb->user_buffer;
1173 else
1174 unreachable("No constant buffer");
1175 }
1176
1177 void
1178 panfrost_emit_const_buf(struct panfrost_batch *batch,
1179 enum pipe_shader_type stage,
1180 struct mali_vertex_tiler_postfix *postfix)
1181 {
1182 struct panfrost_context *ctx = batch->ctx;
1183 struct panfrost_shader_variants *all = ctx->shader[stage];
1184
1185 if (!all)
1186 return;
1187
1188 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1189
1190 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1191
1192 /* Uniforms are implicitly UBO #0 */
1193 bool has_uniforms = buf->enabled_mask & (1 << 0);
1194
1195 /* Allocate room for the sysval and the uniforms */
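/* (Each sysval occupies a full vec4, i.e. 16 bytes.) */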
1196 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1197 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1198 size_t size = sys_size + uniform_size;
1199 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1200 size);
1201
1202 /* Upload sysvals requested by the shader */
1203 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1204
1205 /* Upload uniforms */
1206 if (has_uniforms && uniform_size) {
1207 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1208 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1209 }
1210
1211 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1212 * uploaded */
1213
1214 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1215 assert(ubo_count >= 1);
1216
1217 size_t sz = sizeof(uint64_t) * ubo_count;
1218 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1219 int uniform_count = ss->uniform_count;
1220
1221 /* Upload uniforms as a UBO */
1222 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1223
1224 /* The rest are honest-to-goodness UBOs */
1225
1226 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1227 size_t usz = buf->cb[ubo].buffer_size;
1228 bool enabled = buf->enabled_mask & (1 << ubo);
1229 bool empty = usz == 0;
1230
1231 if (!enabled || empty) {
1232 /* Stub out disabled UBOs to catch accesses */
1233 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1234 continue;
1235 }
1236
1237 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1238 buf, ubo);
1239
1240 unsigned bytes_per_field = 16;
1241 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1242 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1243 }
1244
1245 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1246 postfix->uniforms = transfer.gpu;
1247 postfix->uniform_buffers = ubufs;
1248
1249 buf->dirty_mask = 0;
1250 }
1251
1252 void
1253 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1254 const struct pipe_grid_info *info,
1255 struct midgard_payload_vertex_tiler *vtp)
1256 {
1257 struct panfrost_context *ctx = batch->ctx;
1258 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1259 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1260 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1261 128));
1262 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1263 info->grid[2] * 4;
1264 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1265 shared_size,
1266 1);
1267
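/* Descriptive note: the workgroup count is encoded below as a sum of
 * per-dimension ceil(log2) values, and the per-workgroup shared size as
 * a shift. */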
1268 struct mali_shared_memory shared = {
1269 .shared_memory = bo->gpu,
1270 .shared_workgroup_count =
1271 util_logbase2_ceil(info->grid[0]) +
1272 util_logbase2_ceil(info->grid[1]) +
1273 util_logbase2_ceil(info->grid[2]),
1274 .shared_unk1 = 0x2,
1275 .shared_shift = util_logbase2(single_size) - 1
1276 };
1277
1278 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1279 sizeof(shared));
1280 }
1281
1282 static mali_ptr
1283 panfrost_get_tex_desc(struct panfrost_batch *batch,
1284 enum pipe_shader_type st,
1285 struct panfrost_sampler_view *view)
1286 {
1287 if (!view)
1288 return (mali_ptr) 0;
1289
1290 struct pipe_sampler_view *pview = &view->base;
1291 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1292
1293 /* Add the BO to the job so it's retained until the job is done. */
1294
1295 panfrost_batch_add_bo(batch, rsrc->bo,
1296 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1297 panfrost_bo_access_for_stage(st));
1298
1299 panfrost_batch_add_bo(batch, view->midgard_bo,
1300 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1301 panfrost_bo_access_for_stage(st));
1302
1303 return view->midgard_bo->gpu;
1304 }
1305
1306 void
1307 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1308 enum pipe_shader_type stage,
1309 struct mali_vertex_tiler_postfix *postfix)
1310 {
1311 struct panfrost_context *ctx = batch->ctx;
1312 struct panfrost_device *device = pan_device(ctx->base.screen);
1313
1314 if (!ctx->sampler_view_count[stage])
1315 return;
1316
1317 if (device->quirks & IS_BIFROST) {
1318 struct bifrost_texture_descriptor *descriptors;
1319
1320 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1321 ctx->sampler_view_count[stage]);
1322
1323 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1324 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1325 struct pipe_sampler_view *pview = &view->base;
1326 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1327
1328 /* Add the BOs to the job so they are retained until the job is done. */
1329
1330 panfrost_batch_add_bo(batch, rsrc->bo,
1331 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1332 panfrost_bo_access_for_stage(stage));
1333
1334 panfrost_batch_add_bo(batch, view->bifrost_bo,
1335 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1336 panfrost_bo_access_for_stage(stage));
1337
1338 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1339 }
1340
1341 postfix->textures = panfrost_upload_transient(batch,
1342 descriptors,
1343 sizeof(struct bifrost_texture_descriptor) *
1344 ctx->sampler_view_count[stage]);
1345
1346 free(descriptors);
1347 } else {
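/* Midgard references textures through an indirection: an array of GPU
 * pointers ("trampolines"), one per sampler view, each pointing at that
 * view's texture descriptor. */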
1348 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1349
1350 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1351 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1352 ctx->sampler_views[stage][i]);
1353
1354 postfix->textures = panfrost_upload_transient(batch,
1355 trampolines,
1356 sizeof(uint64_t) *
1357 ctx->sampler_view_count[stage]);
1358 }
1359 }
1360
1361 void
1362 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1363 enum pipe_shader_type stage,
1364 struct mali_vertex_tiler_postfix *postfix)
1365 {
1366 struct panfrost_context *ctx = batch->ctx;
1367 struct panfrost_device *device = pan_device(ctx->base.screen);
1368
1369 if (!ctx->sampler_count[stage])
1370 return;
1371
1372 if (device->quirks & IS_BIFROST) {
1373 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1374 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1375 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1376 transfer_size);
1377 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1378
1379 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1380 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1381
1382 postfix->sampler_descriptor = transfer.gpu;
1383 } else {
1384 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1385 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1386 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1387 transfer_size);
1388 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1389
1390 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1391 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1392
1393 postfix->sampler_descriptor = transfer.gpu;
1394 }
1395 }
1396
1397 void
1398 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1399 struct mali_vertex_tiler_postfix *vertex_postfix)
1400 {
1401 struct panfrost_context *ctx = batch->ctx;
1402
1403 if (!ctx->vertex)
1404 return;
1405
1406 struct panfrost_vertex_state *so = ctx->vertex;
1407
1408 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1409 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1410 sizeof(*so->hw) *
1411 PAN_MAX_ATTRIBUTE);
1412 }
1413
1414 void
1415 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1416 struct mali_vertex_tiler_postfix *vertex_postfix)
1417 {
1418 struct panfrost_context *ctx = batch->ctx;
1419 struct panfrost_vertex_state *so = ctx->vertex;
1420
1421 /* Staged mali_attr, and index into them. i =/= k, depending on the
1422 * vertex buffer mask and instancing. Twice as much room is allocated,
1423 * for a worst case of NPOT_DIVIDEs, which each take up an extra slot */
1424 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1425 unsigned k = 0;
1426
1427 for (unsigned i = 0; i < so->num_elements; ++i) {
1428 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1429 * means duplicating some vertex buffers (who cares? aside from
1430 * maybe some caching implications but I somehow doubt that
1431 * matters) */
1432
1433 struct pipe_vertex_element *elem = &so->pipe[i];
1434 unsigned vbi = elem->vertex_buffer_index;
1435
1436 /* The exception to 1:1 mapping is that we can have multiple
1437 * entries (NPOT divisors), so we fixup anyways */
1438
1439 so->hw[i].index = k;
1440
1441 if (!(ctx->vb_mask & (1 << vbi)))
1442 continue;
1443
1444 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1445 struct panfrost_resource *rsrc;
1446
1447 rsrc = pan_resource(buf->buffer.resource);
1448 if (!rsrc)
1449 continue;
1450
1451 /* Align to 64 bytes by masking off the lower bits. This
1452 * will be adjusted back when we fixup the src_offset in
1453 * mali_attr_meta */
1454
1455 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1456 mali_ptr addr = raw_addr & ~63;
1457 unsigned chopped_addr = raw_addr - addr;
1458
1459 /* Add a dependency of the batch on the vertex buffer */
1460 panfrost_batch_add_bo(batch, rsrc->bo,
1461 PAN_BO_ACCESS_SHARED |
1462 PAN_BO_ACCESS_READ |
1463 PAN_BO_ACCESS_VERTEX_TILER);
1464
1465 /* Set common fields */
1466 attrs[k].elements = addr;
1467 attrs[k].stride = buf->stride;
1468
1469 /* Since we advanced the base pointer, we shrink the buffer
1470 * size */
1471 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1472
1473 /* We need to add the extra size we masked off (for
1474 * correctness) so the data doesn't get clamped away */
1475 attrs[k].size += chopped_addr;
1476
1477 /* For non-instancing make sure we initialize */
1478 attrs[k].shift = attrs[k].extra_flags = 0;
1479
1480 /* Instancing uses a dramatically different code path than
1481 * linear, so dispatch for the actual emission now that the
1482 * common code is finished */
1483
1484 unsigned divisor = elem->instance_divisor;
1485
1486 if (divisor && ctx->instance_count == 1) {
1487 /* Silly corner case where there's a divisor(=1) but
1488 * there's no legitimate instancing. So we want *every*
1489 * attribute to be the same. So set stride to zero so
1490 * we don't go anywhere. */
1491
1492 attrs[k].size = attrs[k].stride + chopped_addr;
1493 attrs[k].stride = 0;
1494 attrs[k++].elements |= MALI_ATTR_LINEAR;
1495 } else if (ctx->instance_count <= 1) {
1496 /* Normal, non-instanced attributes */
1497 attrs[k++].elements |= MALI_ATTR_LINEAR;
1498 } else {
1499 unsigned instance_shift = vertex_postfix->instance_shift;
1500 unsigned instance_odd = vertex_postfix->instance_odd;
1501
1502 k += panfrost_vertex_instanced(ctx->padded_count,
1503 instance_shift,
1504 instance_odd,
1505 divisor, &attrs[k]);
1506 }
1507 }
1508
1509 /* Add special gl_VertexID/gl_InstanceID buffers */
1510
1511 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1512 so->hw[PAN_VERTEX_ID].index = k++;
1513 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1514 so->hw[PAN_INSTANCE_ID].index = k++;
1515
1516 /* Upload whatever we emitted and go */
1517
1518 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1519 k * sizeof(*attrs));
1520 }
1521
1522 static mali_ptr
1523 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1524 unsigned stride, unsigned count)
1525 {
1526 /* Fill out the descriptor */
1527 slot->stride = stride;
1528 slot->size = stride * count;
1529 slot->shift = slot->extra_flags = 0;
1530
1531 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1532 slot->size);
1533
1534 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1535
1536 return transfer.gpu;
1537 }
1538
1539 static void
1540 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1541 unsigned stride, unsigned offset, unsigned count,
1542 struct pipe_stream_output_target *target)
1543 {
1544 /* Fill out the descriptor */
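/* Gallium expresses stream-output strides in dwords; the hardware slot
 * wants bytes, hence the multiply by 4. */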
1545 slot->stride = stride * 4;
1546 slot->shift = slot->extra_flags = 0;
1547
1548 unsigned max_size = target->buffer_size;
1549 unsigned expected_size = slot->stride * count;
1550
1551 slot->size = MIN2(max_size, expected_size);
1552
1553 /* Grab the BO and bind it to the batch */
1554 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1555
1556 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1557 * the perspective of the TILER and FRAGMENT.
1558 */
1559 panfrost_batch_add_bo(batch, bo,
1560 PAN_BO_ACCESS_SHARED |
1561 PAN_BO_ACCESS_RW |
1562 PAN_BO_ACCESS_VERTEX_TILER |
1563 PAN_BO_ACCESS_FRAGMENT);
1564
1565 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1566 slot->elements = addr;
1567 }
1568
1569 /* Given a shader and buffer indices, link varying metadata together */
1570
1571 static bool
1572 is_special_varying(gl_varying_slot loc)
1573 {
1574 switch (loc) {
1575 case VARYING_SLOT_POS:
1576 case VARYING_SLOT_PSIZ:
1577 case VARYING_SLOT_PNTC:
1578 case VARYING_SLOT_FACE:
1579 return true;
1580 default:
1581 return false;
1582 }
1583 }
1584
1585 static void
1586 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1587 signed general, signed gl_Position,
1588 signed gl_PointSize, signed gl_PointCoord,
1589 signed gl_FrontFacing)
1590 {
1591 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1592
1593 for (unsigned i = 0; i < ss->varying_count; ++i) {
1594 gl_varying_slot location = ss->varyings_loc[i];
1595 int index = -1;
1596
1597 switch (location) {
1598 case VARYING_SLOT_POS:
1599 index = gl_Position;
1600 break;
1601 case VARYING_SLOT_PSIZ:
1602 index = gl_PointSize;
1603 break;
1604 case VARYING_SLOT_PNTC:
1605 index = gl_PointCoord;
1606 break;
1607 case VARYING_SLOT_FACE:
1608 index = gl_FrontFacing;
1609 break;
1610 default:
1611 index = general;
1612 break;
1613 }
1614
1615 assert(index >= 0);
1616 out[i].index = index;
1617 }
1618 }
1619
1620 static bool
1621 has_point_coord(unsigned mask, gl_varying_slot loc)
1622 {
1623 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1624 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1625 else if (loc == VARYING_SLOT_PNTC)
1626 return (mask & (1 << 8));
1627 else
1628 return false;
1629 }
1630
1631 /* Helpers for manipulating stream out information so we can pack varyings
1632 * accordingly. Compute the src_offset for a given captured varying */
1633
1634 static struct pipe_stream_output *
1635 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1636 {
1637 for (unsigned i = 0; i < info->num_outputs; ++i) {
1638 if (info->output[i].register_index == loc)
1639 return &info->output[i];
1640 }
1641
1642 unreachable("Varying not captured");
1643 }
1644
1645 /* TODO: Integers */
1646 static enum mali_format
1647 pan_xfb_format(unsigned nr_components)
1648 {
1649 switch (nr_components) {
1650 case 1: return MALI_R32F;
1651 case 2: return MALI_RG32F;
1652 case 3: return MALI_RGB32F;
1653 case 4: return MALI_RGBA32F;
1654 default: unreachable("Invalid format");
1655 }
1656 }
1657
1658 void
1659 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1660 unsigned vertex_count,
1661 struct mali_vertex_tiler_postfix *vertex_postfix,
1662 struct mali_vertex_tiler_postfix *tiler_postfix,
1663 union midgard_primitive_size *primitive_size)
1664 {
1665 /* Load the shaders */
1666 struct panfrost_context *ctx = batch->ctx;
1667 struct panfrost_shader_state *vs, *fs;
1668 unsigned int num_gen_varyings = 0;
1669 size_t vs_size, fs_size;
1670
1671 /* Allocate the varying descriptor */
1672
1673 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1674 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1675 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1676 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1677
1678 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1679 vs_size +
1680 fs_size);
1681
1682 struct pipe_stream_output_info *so = &vs->stream_output;
1683
1684 /* Check if this varying is linked by us. This is the case for
1685 * general-purpose, non-captured varyings. If it is, link it. If it's
1686 * not, use the provided stream out information to determine the
1687 * offset, since it was already linked for us. */
1688
1689 for (unsigned i = 0; i < vs->varying_count; i++) {
1690 gl_varying_slot loc = vs->varyings_loc[i];
1691
1692 bool special = is_special_varying(loc);
1693 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1694
1695 if (captured) {
1696 struct pipe_stream_output *o = pan_get_so(so, loc);
1697
1698 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1699 vs->varyings[i].src_offset = dst_offset;
1700 } else if (!special) {
1701 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1702 }
1703 }
1704
1705 /* Conversely, we need to set src_offset for the captured varyings.
1706 * Here, the layout is defined by the stream out info, not us */
1707
1708 /* Link up with fragment varyings */
1709 bool reads_point_coord = fs->reads_point_coord;
1710
1711 for (unsigned i = 0; i < fs->varying_count; i++) {
1712 gl_varying_slot loc = fs->varyings_loc[i];
1713 unsigned src_offset;
1714 signed vs_idx = -1;
1715
1716 /* Link up */
1717 for (unsigned j = 0; j < vs->varying_count; ++j) {
1718 if (vs->varyings_loc[j] == loc) {
1719 vs_idx = j;
1720 break;
1721 }
1722 }
1723
1724 /* Either assign or reuse */
1725 if (vs_idx >= 0)
1726 src_offset = vs->varyings[vs_idx].src_offset;
1727 else
1728 src_offset = 16 * (num_gen_varyings++);
1729
1730 fs->varyings[i].src_offset = src_offset;
1731
1732 if (has_point_coord(fs->point_sprite_mask, loc))
1733 reads_point_coord = true;
1734 }
1735
1736 memcpy(trans.cpu, vs->varyings, vs_size);
1737 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1738
1739 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1740
1741 /* Figure out how many streamout buffers could be bound */
1742 unsigned so_count = ctx->streamout.num_targets;
1743 for (unsigned i = 0; i < vs->varying_count; i++) {
1744 gl_varying_slot loc = vs->varyings_loc[i];
1745
1746 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1747 if (!captured) continue;
1748
1749 struct pipe_stream_output *o = pan_get_so(so, loc);
1750 so_count = MAX2(so_count, o->output_buffer + 1);
1751 }
1752
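/* Assign varying buffer indices: stream-output buffers come first, then the
 * general varying buffer, then one buffer per special varying actually used. */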
1753 signed idx = so_count;
1754 signed general = idx++;
1755 signed gl_Position = idx++;
1756 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1757 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1758 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1759 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1760
1761 /* Emit the stream out buffers */
1762
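/* Transform feedback writes one vertex per vertex of the decomposed
 * primitives, which can exceed the raw draw vertex count (strips,
 * fans), so size the buffers for that. */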
1763 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1764 ctx->vertex_count);
1765
1766 for (unsigned i = 0; i < so_count; ++i) {
1767 if (i < ctx->streamout.num_targets) {
1768 panfrost_emit_streamout(batch, &varyings[i],
1769 so->stride[i],
1770 ctx->streamout.offsets[i],
1771 out_count,
1772 ctx->streamout.targets[i]);
1773 } else {
1774 /* Emit a dummy buffer */
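/* No target is bound at this index, but a captured varying may
 * still reference the record, so allocate a throwaway buffer to
 * keep the pointer valid. */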
1775 panfrost_emit_varyings(batch, &varyings[i],
1776 so->stride[i] * 4,
1777 out_count);
1778
1779 /* Clear the attribute type */
1780 varyings[i].elements &= ~0xF;
1781 }
1782 }
1783
1784 panfrost_emit_varyings(batch, &varyings[general],
1785 num_gen_varyings * 16,
1786 vertex_count);
1787
1788 mali_ptr varyings_p;
1789
1790 /* fp32 vec4 gl_Position */
1791 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1792 sizeof(float) * 4, vertex_count);
1793 tiler_postfix->position_varying = varyings_p;
1794
1795
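/* 2 bytes per vertex for gl_PointSize, presumably a half-float,
 * alongside the fp32 vec4 allocated for gl_Position above. */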
1796 if (panfrost_writes_point_size(ctx)) {
1797 varyings_p = panfrost_emit_varyings(batch,
1798 &varyings[gl_PointSize],
1799 2, vertex_count);
1800 primitive_size->pointer = varyings_p;
1801 }
1802
1803 if (reads_point_coord)
1804 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1805
1806 if (fs->reads_face)
1807 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1808
1809 if (fs->reads_frag_coord)
1810 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1811
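/* None of these special records are expected on Bifrost, presumably
 * because this path is not wired up for it yet; the assert below
 * documents that assumption. */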
1812 struct panfrost_device *device = pan_device(ctx->base.screen);
1813 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1814
1815 /* Let's go ahead and link varying meta to the buffer in question, now
1816 * that that information is available. VARYING_SLOT_POS is mapped to
1817 * gl_FragCoord for fragment shaders but gl_Position for vertex
1818 * shaders. */
1819
1820 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1821 gl_PointSize, gl_PointCoord,
1822 gl_FrontFacing);
1823
1824 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1825 gl_FragCoord, gl_PointSize,
1826 gl_PointCoord, gl_FrontFacing);
1827
1828 /* Replace streamout */
1829
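/* Captured varyings must read from the transform feedback buffer they
 * were written to rather than from the general varying buffer, so
 * rewrite the buffer index, format and swizzle in the uploaded records
 * (ovs for the vertex copy, ofs for the fragment copy). */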
1830 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1831 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1832
1833 for (unsigned i = 0; i < vs->varying_count; i++) {
1834 gl_varying_slot loc = vs->varyings_loc[i];
1835
1836 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1837 if (!captured)
1838 continue;
1839
1840 struct pipe_stream_output *o = pan_get_so(so, loc);
1841 ovs[i].index = o->output_buffer;
1842
1843 /* Set the type appropriately. TODO: Integer varyings XXX */
1844 assert(o->stream == 0);
1845 ovs[i].format = pan_xfb_format(o->num_components);
1846
1847 if (device->quirks & HAS_SWIZZLES)
1848 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1849 else
1850 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1851
1852 /* Link to the fragment */
1853 signed fs_idx = -1;
1854
1855 /* Link up */
1856 for (unsigned j = 0; j < fs->varying_count; ++j) {
1857 if (fs->varyings_loc[j] == loc) {
1858 fs_idx = j;
1859 break;
1860 }
1861 }
1862
1863 if (fs_idx >= 0) {
1864 ofs[fs_idx].index = ovs[i].index;
1865 ofs[fs_idx].format = ovs[i].format;
1866 ofs[fs_idx].swizzle = ovs[i].swizzle;
1867 }
1868 }
1869
1870 /* Replace point sprite */
1871 for (unsigned i = 0; i < fs->varying_count; i++) {
1872 /* If we have a point sprite replacement, handle that here. We
1873 * have to translate the location first. TODO: flip y in the shader;
1874 * we already key the variant, the flip is just deferred for lack of time. */
1875
1876 if (has_point_coord(fs->point_sprite_mask,
1877 fs->varyings_loc[i])) {
1878 ofs[i].index = gl_PointCoord;
1879
1880 /* Swizzle out the z/w to 0/1 */
1881 ofs[i].format = MALI_RG16F;
1882 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1883 }
1884 }
1885
1886 /* Fix up unaligned addresses */
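/* The stream-out offsets handed to us need not be 64-byte aligned, but
 * the low bits of a record's address apparently carry its addressing
 * mode, so round the base down and fold the misalignment into each
 * referencing varying's src_offset instead. */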
1887 for (unsigned i = 0; i < so_count; ++i) {
1888 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1889 continue;
1890
1891 unsigned align = (varyings[i].elements & 63);
1892
1893 /* While we're at it, the SO buffers are linear */
1894
1895 if (!align) {
1896 varyings[i].elements |= MALI_ATTR_LINEAR;
1897 continue;
1898 }
1899
1900 /* We need to adjust alignment */
1901 varyings[i].elements &= ~63;
1902 varyings[i].elements |= MALI_ATTR_LINEAR;
1903 varyings[i].size += align;
1904
1905 for (unsigned v = 0; v < vs->varying_count; ++v) {
1906 if (ovs[v].index != i)
1907 continue;
1908
1909 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1910 }
1911
1912 for (unsigned f = 0; f < fs->varying_count; ++f) {
1913 if (ofs[f].index != i)
1914 continue;
1915
1916 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1917 }
1918 }
1919
1920 varyings_p = panfrost_upload_transient(batch, varyings,
1921 idx * sizeof(*varyings));
1922 vertex_postfix->varyings = varyings_p;
1923 tiler_postfix->varyings = varyings_p;
1924
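/* The descriptor allocation holds the VS records followed by the FS
 * records, so the vertex job reads from the start and the tiler job
 * from vs_size bytes in. */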
1925 vertex_postfix->varying_meta = trans.gpu;
1926 tiler_postfix->varying_meta = trans.gpu + vs_size;
1927 }
1928
1929 void
1930 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1931 struct mali_vertex_tiler_prefix *vertex_prefix,
1932 struct mali_vertex_tiler_postfix *vertex_postfix,
1933 struct mali_vertex_tiler_prefix *tiler_prefix,
1934 struct mali_vertex_tiler_postfix *tiler_postfix,
1935 union midgard_primitive_size *primitive_size)
1936 {
1937 struct panfrost_context *ctx = batch->ctx;
1938 struct panfrost_device *device = pan_device(ctx->base.screen);
1939 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1940 struct bifrost_payload_vertex bifrost_vertex = {0,};
1941 struct bifrost_payload_tiler bifrost_tiler = {0,};
1942 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1943 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1944 void *vp, *tp;
1945 size_t vp_size, tp_size;
1946
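/* Bifrost and Midgard use different vertex/tiler payload layouts;
 * build whichever one applies and point vp/tp at it so the job
 * submission below stays layout-agnostic. */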
1947 if (device->quirks & IS_BIFROST) {
1948 bifrost_vertex.prefix = *vertex_prefix;
1949 bifrost_vertex.postfix = *vertex_postfix;
1950 vp = &bifrost_vertex;
1951 vp_size = sizeof(bifrost_vertex);
1952
1953 bifrost_tiler.prefix = *tiler_prefix;
1954 bifrost_tiler.tiler.primitive_size = *primitive_size;
1955 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1956 bifrost_tiler.postfix = *tiler_postfix;
1957 tp = &bifrost_tiler;
1958 tp_size = sizeof(bifrost_tiler);
1959 } else {
1960 midgard_vertex.prefix = *vertex_prefix;
1961 midgard_vertex.postfix = *vertex_postfix;
1962 vp = &midgard_vertex;
1963 vp_size = sizeof(midgard_vertex);
1964
1965 midgard_tiler.prefix = *tiler_prefix;
1966 midgard_tiler.postfix = *tiler_postfix;
1967 midgard_tiler.primitive_size = *primitive_size;
1968 tp = &midgard_tiler;
1969 tp_size = sizeof(midgard_tiler);
1970 }
1971
1972 if (wallpapering) {
1973 /* Inject in reverse order, with "predicted" job indices.
1974 * THIS IS A HACK XXX */
1975 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1976 batch->job_index + 2, tp, tp_size, true);
1977 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1978 vp, vp_size, true);
1979 return;
1980 }
1981
1982 /* If rasterizer discard is enabled, only submit the vertex job */
1983
1984 bool rasterizer_discard = ctx->rasterizer &&
1985 ctx->rasterizer->base.rasterizer_discard;
1986
1987 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1988 vp, vp_size, false);
1989
1990 if (rasterizer_discard)
1991 return;
1992
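/* The tiler job consumes the vertex job's output, so pass the vertex
 * job index as its dependency. */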
1993 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1994 false);
1995 }
1996
1997 /* TODO: stop hardcoding this */
1998 mali_ptr
1999 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2000 {
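/* 96 16-bit values uploaded verbatim; presumably (x, y) sample
 * position pairs on a 0..256 scale with (128, 128) the pixel
 * centre, though the exact layout is undocumented here. */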
2001 uint16_t locations[] = {
2002 128, 128,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 128, 128,
2035 0, 0,
2036 0, 0,
2037 0, 0,
2038 0, 0,
2039 0, 0,
2040 0, 0,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 0, 0,
2045 0, 0,
2046 0, 0,
2047 0, 0,
2048 0, 0,
2049 0, 0,
2050 };
2051
2052 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2053 }