panfrost: Fix gl_PointSize out of GL_POINTS
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
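/* (MALI_MFBD lands in the low bits of the GPU address, presumably marking
 * the descriptor as the multi-target MFBD layout rather than SFBD) */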
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
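/* When the vertex shader does not write gl_PointSize, the primitive size is
 * a constant taken from the rasterizer state: point_size for point primitives
 * and line_width otherwise. When it does write gl_PointSize, the size is
 * instead sourced per-vertex from a varying (see MALI_DRAW_VARYING_SIZE and
 * panfrost_emit_varying_descriptor), so the constant is left untouched. */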
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
 189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
 190 * good for the duration of the draw (transient), though it may last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
 267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
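 /* Only [min_index, max_index] is referenced, so that is how many vertex
 * shader invocations we launch; offset_start positions the attribute reads
 * and offset_bias_correction (-min_index) presumably rebases fetched indices
 * so that min_index maps back to invocation zero. */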
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
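 /* instance_shift/instance_odd express the padded count as an odd number
 * times a power of two: padded = (2 * odd + 1) << shift. For example,
 * padded = 12 gives shift = ctz(12) = 2, odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 = 12. */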
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 return;
554 }
555
556 bool msaa = ctx->rasterizer->base.multisample;
557
558 /* TODO: Sample size */
559 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
560 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
561 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
562 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
563
 564 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
565
566 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
567 ctx->rasterizer->base.offset_tri);
568 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
569 ctx->rasterizer->base.offset_tri);
570 }
571
572 static void
573 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
574 struct mali_shader_meta *fragmeta)
575 {
576 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
577 int zfunc = PIPE_FUNC_ALWAYS;
578
579 if (!zsa) {
580 struct pipe_stencil_state default_stencil = {
581 .enabled = 0,
582 .func = PIPE_FUNC_ALWAYS,
583 .fail_op = MALI_STENCIL_KEEP,
584 .zfail_op = MALI_STENCIL_KEEP,
585 .zpass_op = MALI_STENCIL_KEEP,
586 .writemask = 0xFF,
587 .valuemask = 0xFF
588 };
589
590 panfrost_make_stencil_state(&default_stencil,
591 &fragmeta->stencil_front);
592 fragmeta->stencil_mask_front = default_stencil.writemask;
593 fragmeta->stencil_back = fragmeta->stencil_front;
594 fragmeta->stencil_mask_back = default_stencil.writemask;
595 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
596 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
597 } else {
598 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
599 zsa->stencil[0].enabled);
600 panfrost_make_stencil_state(&zsa->stencil[0],
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
603 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
604
605 /* If back-stencil is not enabled, use the front values */
606
607 if (zsa->stencil[1].enabled) {
608 panfrost_make_stencil_state(&zsa->stencil[1],
609 &fragmeta->stencil_back);
610 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
611 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
612 } else {
613 fragmeta->stencil_back = fragmeta->stencil_front;
614 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
615 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
616 }
617
618 if (zsa->depth.enabled)
619 zfunc = zsa->depth.func;
620
621 /* Depth state (TODO: Refactor) */
622
623 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
624 zsa->depth.writemask);
625 }
626
627 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
628 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
629 }
630
631 static bool
632 panfrost_fs_required(
633 struct panfrost_shader_state *fs,
634 struct panfrost_blend_final *blend,
635 unsigned rt_count)
636 {
637 /* If we generally have side effects */
638 if (fs->fs_sidefx)
639 return true;
640
641 /* If colour is written we need to execute */
642 for (unsigned i = 0; i < rt_count; ++i) {
643 if (!blend[i].no_colour)
644 return true;
645 }
646
647 /* If depth is written and not implied we need to execute.
648 * TODO: Predicate on Z/S writes being enabled */
649 return (fs->writes_depth || fs->writes_stencil);
650 }
651
652 static void
653 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
654 struct mali_shader_meta *fragmeta,
655 void *rts)
656 {
657 const struct panfrost_device *dev = pan_device(ctx->base.screen);
658 struct panfrost_shader_state *fs;
659 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
660
661 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
662 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
663 !ctx->blend->base.dither);
664
665 /* Get blending setup */
666 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
667
668 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
669 unsigned shader_offset = 0;
670 struct panfrost_bo *shader_bo = NULL;
671
672 for (unsigned c = 0; c < rt_count; ++c)
673 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
674 &shader_offset);
675
676 /* Disable shader execution if we can */
677 if (dev->quirks & MIDGARD_SHADERLESS
678 && !panfrost_fs_required(fs, blend, rt_count)) {
679 fragmeta->shader = 0;
680 fragmeta->attribute_count = 0;
681 fragmeta->varying_count = 0;
682 fragmeta->texture_count = 0;
683 fragmeta->sampler_count = 0;
684
685 /* This feature is not known to work on Bifrost */
686 fragmeta->midgard1.work_count = 1;
687 fragmeta->midgard1.uniform_count = 0;
688 fragmeta->midgard1.uniform_buffer_count = 0;
689 }
690
691 /* If there is a blend shader, work registers are shared. We impose 8
692 * work registers as a limit for blend shaders. Should be lower XXX */
693
694 if (!(dev->quirks & IS_BIFROST)) {
695 for (unsigned c = 0; c < rt_count; ++c) {
696 if (blend[c].is_shader) {
697 fragmeta->midgard1.work_count =
698 MAX2(fragmeta->midgard1.work_count, 8);
699 }
700 }
701 }
702
703 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
704 * copied to the blend_meta appended (by convention), but this is the
705 * field actually read by the hardware. (Or maybe both are read...?).
706 * Specify the last RTi with a blend shader. */
707
708 fragmeta->blend.shader = 0;
709
710 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
711 if (!blend[rt].is_shader)
712 continue;
713
714 fragmeta->blend.shader = blend[rt].shader.gpu |
715 blend[rt].shader.first_tag;
716 break;
717 }
718
719 if (dev->quirks & MIDGARD_SFBD) {
 720 /* On platforms with only a single render target (SFBD), the blend
 721 * information is stored inside the shader meta itself. We additionally
722 * need to signal CAN_DISCARD for nontrivial blend modes (so
723 * we're able to read back the destination buffer) */
724
725 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
726 blend[0].is_shader);
727
728 if (!blend[0].is_shader) {
729 fragmeta->blend.equation = *blend[0].equation.equation;
730 fragmeta->blend.constant = blend[0].equation.constant;
731 }
732
733 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
734 !blend[0].no_blending || fs->can_discard);
735 return;
736 }
737
738 if (dev->quirks & IS_BIFROST) {
739 bool no_blend = true;
740
741 for (unsigned i = 0; i < rt_count; ++i)
742 no_blend &= (blend[i].no_blending | blend[i].no_colour);
743
744 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
745 !fs->can_discard && !fs->writes_depth && no_blend);
746 }
747
748 /* Additional blend descriptor tacked on for jobs using MFBD */
749
750 for (unsigned i = 0; i < rt_count; ++i) {
751 unsigned flags = 0;
752
753 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
754 flags = 0x200;
755
756 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
757 (ctx->pipe_framebuffer.cbufs[i]) &&
758 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
759
760 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
761 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
762 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
763 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
764 }
765
766 if (dev->quirks & IS_BIFROST) {
767 struct bifrost_blend_rt *brts = rts;
768
769 brts[i].flags = flags;
770
771 if (blend[i].is_shader) {
772 /* The blend shader's address needs to be at
773 * the same top 32 bit as the fragment shader.
774 * TODO: Ensure that's always the case.
775 */
776 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
777 (fs->bo->gpu & (0xffffffffull << 32)));
778 brts[i].shader = blend[i].shader.gpu;
779 brts[i].unk2 = 0x0;
780 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
781 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
782 const struct util_format_description *format_desc;
783 format_desc = util_format_description(format);
784
785 brts[i].equation = *blend[i].equation.equation;
786
787 /* TODO: this is a bit more complicated */
788 brts[i].constant = blend[i].equation.constant;
789
790 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
791
792 /* 0x19 disables blending and forces REPLACE
793 * mode (equivalent to rgb_mode = alpha_mode =
794 * x122, colour mask = 0xF). 0x1a allows
795 * blending. */
796 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
797
798 brts[i].shader_type = fs->blend_types[i];
799 } else {
800 /* Dummy attachment for depth-only */
801 brts[i].unk2 = 0x3;
802 brts[i].shader_type = fs->blend_types[i];
803 }
804 } else {
805 struct midgard_blend_rt *mrts = rts;
806 mrts[i].flags = flags;
807
808 if (blend[i].is_shader) {
809 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
810 } else {
811 mrts[i].blend.equation = *blend[i].equation.equation;
812 mrts[i].blend.constant = blend[i].equation.constant;
813 }
814 }
815 }
816 }
817
818 static void
819 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
820 struct mali_shader_meta *fragmeta,
821 void *rts)
822 {
823 const struct panfrost_device *dev = pan_device(ctx->base.screen);
824 struct panfrost_shader_state *fs;
825
826 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
827
828 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
829 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
830 fragmeta->unknown2_4 = 0x4e0;
831
832 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
833 * is required (independent of 32-bit/64-bit descriptors), or why it's
834 * not used on later GPU revisions. Otherwise, all shader jobs fault on
835 * these earlier chips (perhaps this is a chicken bit of some kind).
836 * More investigation is needed. */
837
838 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
839
840 if (dev->quirks & IS_BIFROST) {
841 /* TODO */
842 } else {
 843 /* Depending on whether it's legal to do so in the given shader, we try to
844 * enable early-z testing. TODO: respect e-z force */
845
846 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
847 !fs->can_discard && !fs->writes_depth && !fs->writes_global);
848
849 /* Add the writes Z/S flags if needed. */
850 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
851 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
852
853 /* Any time texturing is used, derivatives are implicitly calculated,
854 * so we need to enable helper invocations */
855
856 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
857 fs->helper_invocations);
858
859 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
860
861 bool depth_enabled = fs->writes_depth ||
862 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
863
864 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
865 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
866 }
867
868 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
869 panfrost_frag_meta_zsa_update(ctx, fragmeta);
870 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
871 }
872
873 void
874 panfrost_emit_shader_meta(struct panfrost_batch *batch,
875 enum pipe_shader_type st,
876 struct mali_vertex_tiler_postfix *postfix)
877 {
878 struct panfrost_context *ctx = batch->ctx;
879 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
880
881 if (!ss) {
882 postfix->shader = 0;
883 return;
884 }
885
886 struct mali_shader_meta meta;
887
888 panfrost_shader_meta_init(ctx, st, &meta);
889
890 /* Add the shader BO to the batch. */
891 panfrost_batch_add_bo(batch, ss->bo,
892 PAN_BO_ACCESS_PRIVATE |
893 PAN_BO_ACCESS_READ |
894 panfrost_bo_access_for_stage(st));
895
896 mali_ptr shader_ptr;
897
898 if (st == PIPE_SHADER_FRAGMENT) {
899 struct panfrost_device *dev = pan_device(ctx->base.screen);
900 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
901 size_t desc_size = sizeof(meta);
902 void *rts = NULL;
903 struct panfrost_transfer xfer;
904 unsigned rt_size;
905
906 if (dev->quirks & MIDGARD_SFBD)
907 rt_size = 0;
908 else if (dev->quirks & IS_BIFROST)
909 rt_size = sizeof(struct bifrost_blend_rt);
910 else
911 rt_size = sizeof(struct midgard_blend_rt);
912
913 desc_size += rt_size * rt_count;
914
915 if (rt_size)
916 rts = rzalloc_size(ctx, rt_size * rt_count);
917
918 panfrost_frag_shader_meta_init(ctx, &meta, rts);
919
920 xfer = panfrost_allocate_transient(batch, desc_size);
921
922 memcpy(xfer.cpu, &meta, sizeof(meta));
923 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
924
925 if (rt_size)
926 ralloc_free(rts);
927
928 shader_ptr = xfer.gpu;
929 } else {
930 shader_ptr = panfrost_upload_transient(batch, &meta,
931 sizeof(meta));
932 }
933
934 postfix->shader = shader_ptr;
935 }
936
937 static void
938 panfrost_mali_viewport_init(struct panfrost_context *ctx,
939 struct mali_viewport *mvp)
940 {
941 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
942
943 /* Clip bounds are encoded as floats. The viewport itself is encoded as
944 * (somewhat) asymmetric ints. */
945
946 const struct pipe_scissor_state *ss = &ctx->scissor;
947
948 memset(mvp, 0, sizeof(*mvp));
949
950 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
951 * each direction. Clipping to the viewport in theory should work, but
952 * in practice causes issues when we're not explicitly trying to
953 * scissor */
954
955 *mvp = (struct mali_viewport) {
956 .clip_minx = -INFINITY,
957 .clip_miny = -INFINITY,
958 .clip_maxx = INFINITY,
959 .clip_maxy = INFINITY,
960 };
961
962 /* Always scissor to the viewport by default. */
963 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
964 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
965
966 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
967 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
968
969 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
970 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
971
972 /* Apply the scissor test */
973
974 unsigned minx, miny, maxx, maxy;
975
976 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
977 minx = MAX2(ss->minx, vp_minx);
978 miny = MAX2(ss->miny, vp_miny);
979 maxx = MIN2(ss->maxx, vp_maxx);
980 maxy = MIN2(ss->maxy, vp_maxy);
981 } else {
982 minx = vp_minx;
983 miny = vp_miny;
984 maxx = vp_maxx;
985 maxy = vp_maxy;
986 }
987
988 /* Hardware needs the min/max to be strictly ordered, so flip if we
989 * need to. The viewport transformation in the vertex shader will
990 * handle the negatives if we don't */
991
992 if (miny > maxy) {
993 unsigned temp = miny;
994 miny = maxy;
995 maxy = temp;
996 }
997
998 if (minx > maxx) {
999 unsigned temp = minx;
1000 minx = maxx;
1001 maxx = temp;
1002 }
1003
1004 if (minz > maxz) {
1005 float temp = minz;
1006 minz = maxz;
1007 maxz = temp;
1008 }
1009
1010 /* Clamp to the framebuffer size as a last check */
1011
1012 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1013 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1014
1015 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1016 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1017
1018 /* Upload */
1019
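 /* viewport1 holds the inclusive maximum; MALI_POSITIVE biases it by -1,
 * which panfrost_emit_viewport undoes (+1) when unioning the batch scissor. */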
1020 mvp->viewport0[0] = minx;
1021 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1022
1023 mvp->viewport0[1] = miny;
1024 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1025
1026 mvp->clip_minz = minz;
1027 mvp->clip_maxz = maxz;
1028 }
1029
1030 void
1031 panfrost_emit_viewport(struct panfrost_batch *batch,
1032 struct mali_vertex_tiler_postfix *tiler_postfix)
1033 {
1034 struct panfrost_context *ctx = batch->ctx;
1035 struct mali_viewport mvp;
1036
1037 panfrost_mali_viewport_init(batch->ctx, &mvp);
1038
1039 /* Update the job, unless we're doing wallpapering (whose lack of
1040 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1041 * just... be faster :) */
1042
1043 if (!ctx->wallpaper_batch)
1044 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1045 mvp.viewport0[1],
1046 mvp.viewport1[0] + 1,
1047 mvp.viewport1[1] + 1);
1048
1049 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1050 sizeof(mvp));
1051 }
1052
1053 static mali_ptr
1054 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1055 enum pipe_shader_type st,
1056 struct panfrost_constant_buffer *buf,
1057 unsigned index)
1058 {
1059 struct pipe_constant_buffer *cb = &buf->cb[index];
1060 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1061
1062 if (rsrc) {
1063 panfrost_batch_add_bo(batch, rsrc->bo,
1064 PAN_BO_ACCESS_SHARED |
1065 PAN_BO_ACCESS_READ |
1066 panfrost_bo_access_for_stage(st));
1067
 1068 /* Alignment guaranteed by
1069 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1070 return rsrc->bo->gpu + cb->buffer_offset;
1071 } else if (cb->user_buffer) {
1072 return panfrost_upload_transient(batch,
1073 cb->user_buffer +
1074 cb->buffer_offset,
1075 cb->buffer_size);
1076 } else {
1077 unreachable("No constant buffer");
1078 }
1079 }
1080
1081 struct sysval_uniform {
1082 union {
1083 float f[4];
1084 int32_t i[4];
1085 uint32_t u[4];
1086 uint64_t du[2];
1087 };
1088 };
1089
1090 static void
1091 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1092 struct sysval_uniform *uniform)
1093 {
1094 struct panfrost_context *ctx = batch->ctx;
1095 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1096
1097 uniform->f[0] = vp->scale[0];
1098 uniform->f[1] = vp->scale[1];
1099 uniform->f[2] = vp->scale[2];
1100 }
1101
1102 static void
1103 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1104 struct sysval_uniform *uniform)
1105 {
1106 struct panfrost_context *ctx = batch->ctx;
1107 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1108
1109 uniform->f[0] = vp->translate[0];
1110 uniform->f[1] = vp->translate[1];
1111 uniform->f[2] = vp->translate[2];
1112 }
1113
1114 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1115 enum pipe_shader_type st,
1116 unsigned int sysvalid,
1117 struct sysval_uniform *uniform)
1118 {
1119 struct panfrost_context *ctx = batch->ctx;
1120 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1121 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1122 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1123 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1124
1125 assert(dim);
1126 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1127
1128 if (dim > 1)
1129 uniform->i[1] = u_minify(tex->texture->height0,
1130 tex->u.tex.first_level);
1131
1132 if (dim > 2)
1133 uniform->i[2] = u_minify(tex->texture->depth0,
1134 tex->u.tex.first_level);
1135
1136 if (is_array)
1137 uniform->i[dim] = tex->texture->array_size;
1138 }
1139
1140 static void
1141 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1142 enum pipe_shader_type st,
1143 unsigned ssbo_id,
1144 struct sysval_uniform *uniform)
1145 {
1146 struct panfrost_context *ctx = batch->ctx;
1147
1148 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1149 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1150
1151 /* Compute address */
1152 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1153
1154 panfrost_batch_add_bo(batch, bo,
1155 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1156 panfrost_bo_access_for_stage(st));
1157
1158 /* Upload address and size as sysval */
1159 uniform->du[0] = bo->gpu + sb.buffer_offset;
1160 uniform->u[2] = sb.buffer_size;
1161 }
1162
1163 static void
1164 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1165 enum pipe_shader_type st,
1166 unsigned samp_idx,
1167 struct sysval_uniform *uniform)
1168 {
1169 struct panfrost_context *ctx = batch->ctx;
1170 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1171
1172 uniform->f[0] = sampl->min_lod;
1173 uniform->f[1] = sampl->max_lod;
1174 uniform->f[2] = sampl->lod_bias;
1175
1176 /* Even without any errata, Midgard represents "no mipmapping" as
1177 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1178 * panfrost_create_sampler_state which also explains our choice of
1179 * epsilon value (again to keep behaviour consistent) */
1180
1181 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1182 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1183 }
1184
1185 static void
1186 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1187 struct sysval_uniform *uniform)
1188 {
1189 struct panfrost_context *ctx = batch->ctx;
1190
1191 uniform->u[0] = ctx->compute_grid->grid[0];
1192 uniform->u[1] = ctx->compute_grid->grid[1];
1193 uniform->u[2] = ctx->compute_grid->grid[2];
1194 }
1195
1196 static void
1197 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1198 struct panfrost_shader_state *ss,
1199 enum pipe_shader_type st)
1200 {
1201 struct sysval_uniform *uniforms = (void *)buf;
1202
1203 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1204 int sysval = ss->sysval[i];
1205
1206 switch (PAN_SYSVAL_TYPE(sysval)) {
1207 case PAN_SYSVAL_VIEWPORT_SCALE:
1208 panfrost_upload_viewport_scale_sysval(batch,
1209 &uniforms[i]);
1210 break;
1211 case PAN_SYSVAL_VIEWPORT_OFFSET:
1212 panfrost_upload_viewport_offset_sysval(batch,
1213 &uniforms[i]);
1214 break;
1215 case PAN_SYSVAL_TEXTURE_SIZE:
1216 panfrost_upload_txs_sysval(batch, st,
1217 PAN_SYSVAL_ID(sysval),
1218 &uniforms[i]);
1219 break;
1220 case PAN_SYSVAL_SSBO:
1221 panfrost_upload_ssbo_sysval(batch, st,
1222 PAN_SYSVAL_ID(sysval),
1223 &uniforms[i]);
1224 break;
1225 case PAN_SYSVAL_NUM_WORK_GROUPS:
1226 panfrost_upload_num_work_groups_sysval(batch,
1227 &uniforms[i]);
1228 break;
1229 case PAN_SYSVAL_SAMPLER:
1230 panfrost_upload_sampler_sysval(batch, st,
1231 PAN_SYSVAL_ID(sysval),
1232 &uniforms[i]);
1233 break;
1234 default:
1235 assert(0);
1236 }
1237 }
1238 }
1239
1240 static const void *
1241 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1242 unsigned index)
1243 {
1244 struct pipe_constant_buffer *cb = &buf->cb[index];
1245 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1246
1247 if (rsrc)
1248 return rsrc->bo->cpu;
1249 else if (cb->user_buffer)
1250 return cb->user_buffer;
1251 else
1252 unreachable("No constant buffer");
1253 }
1254
1255 void
1256 panfrost_emit_const_buf(struct panfrost_batch *batch,
1257 enum pipe_shader_type stage,
1258 struct mali_vertex_tiler_postfix *postfix)
1259 {
1260 struct panfrost_context *ctx = batch->ctx;
1261 struct panfrost_shader_variants *all = ctx->shader[stage];
1262
1263 if (!all)
1264 return;
1265
1266 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1267
1268 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1269
1270 /* Uniforms are implicitly UBO #0 */
1271 bool has_uniforms = buf->enabled_mask & (1 << 0);
1272
1273 /* Allocate room for the sysval and the uniforms */
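 /* Each sysval occupies one 16-byte vec4 slot (see struct sysval_uniform) */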
1274 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1275 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1276 size_t size = sys_size + uniform_size;
1277 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1278 size);
1279
1280 /* Upload sysvals requested by the shader */
1281 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1282
1283 /* Upload uniforms */
1284 if (has_uniforms && uniform_size) {
1285 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1286 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1287 }
1288
1289 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1290 * uploaded */
1291
1292 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1293 assert(ubo_count >= 1);
1294
1295 size_t sz = sizeof(uint64_t) * ubo_count;
1296 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1297 int uniform_count = ss->uniform_count;
1298
1299 /* Upload uniforms as a UBO */
1300 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1301
1302 /* The rest are honest-to-goodness UBOs */
1303
1304 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1305 size_t usz = buf->cb[ubo].buffer_size;
1306 bool enabled = buf->enabled_mask & (1 << ubo);
1307 bool empty = usz == 0;
1308
1309 if (!enabled || empty) {
1310 /* Stub out disabled UBOs to catch accesses */
1311 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1312 continue;
1313 }
1314
1315 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1316 buf, ubo);
1317
1318 unsigned bytes_per_field = 16;
1319 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1320 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1321 }
1322
1323 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1324 postfix->uniforms = transfer.gpu;
1325 postfix->uniform_buffers = ubufs;
1326
1327 buf->dirty_mask = 0;
1328 }
1329
1330 void
1331 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1332 const struct pipe_grid_info *info,
1333 struct midgard_payload_vertex_tiler *vtp)
1334 {
1335 struct panfrost_context *ctx = batch->ctx;
1336 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1337 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
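 /* The per-workgroup shared size is rounded up to a power of two (at least
 * 128 bytes) so it can be encoded as the shared_shift below. */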
1338 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1339 128));
1340 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1341 info->grid[2] * 4;
1342 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1343 shared_size,
1344 1);
1345
1346 struct mali_shared_memory shared = {
1347 .shared_memory = bo->gpu,
1348 .shared_workgroup_count =
1349 util_logbase2_ceil(info->grid[0]) +
1350 util_logbase2_ceil(info->grid[1]) +
1351 util_logbase2_ceil(info->grid[2]),
1352 .shared_unk1 = 0x2,
1353 .shared_shift = util_logbase2(single_size) - 1
1354 };
1355
1356 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1357 sizeof(shared));
1358 }
1359
1360 static mali_ptr
1361 panfrost_get_tex_desc(struct panfrost_batch *batch,
1362 enum pipe_shader_type st,
1363 struct panfrost_sampler_view *view)
1364 {
1365 if (!view)
1366 return (mali_ptr) 0;
1367
1368 struct pipe_sampler_view *pview = &view->base;
1369 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1370
1371 /* Add the BO to the job so it's retained until the job is done. */
1372
1373 panfrost_batch_add_bo(batch, rsrc->bo,
1374 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1375 panfrost_bo_access_for_stage(st));
1376
1377 panfrost_batch_add_bo(batch, view->midgard_bo,
1378 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1379 panfrost_bo_access_for_stage(st));
1380
1381 return view->midgard_bo->gpu;
1382 }
1383
1384 void
1385 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1386 enum pipe_shader_type stage,
1387 struct mali_vertex_tiler_postfix *postfix)
1388 {
1389 struct panfrost_context *ctx = batch->ctx;
1390 struct panfrost_device *device = pan_device(ctx->base.screen);
1391
1392 if (!ctx->sampler_view_count[stage])
1393 return;
1394
1395 if (device->quirks & IS_BIFROST) {
1396 struct bifrost_texture_descriptor *descriptors;
1397
1398 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1399 ctx->sampler_view_count[stage]);
1400
1401 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1402 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1403 struct pipe_sampler_view *pview = &view->base;
1404 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1405
1406 /* Add the BOs to the job so they are retained until the job is done. */
1407
1408 panfrost_batch_add_bo(batch, rsrc->bo,
1409 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1410 panfrost_bo_access_for_stage(stage));
1411
1412 panfrost_batch_add_bo(batch, view->bifrost_bo,
1413 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1414 panfrost_bo_access_for_stage(stage));
1415
1416 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1417 }
1418
1419 postfix->textures = panfrost_upload_transient(batch,
1420 descriptors,
1421 sizeof(struct bifrost_texture_descriptor) *
1422 ctx->sampler_view_count[stage]);
1423
1424 free(descriptors);
1425 } else {
1426 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1427
1428 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1429 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1430 ctx->sampler_views[stage][i]);
1431
1432 postfix->textures = panfrost_upload_transient(batch,
1433 trampolines,
1434 sizeof(uint64_t) *
1435 ctx->sampler_view_count[stage]);
1436 }
1437 }
1438
1439 void
1440 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1441 enum pipe_shader_type stage,
1442 struct mali_vertex_tiler_postfix *postfix)
1443 {
1444 struct panfrost_context *ctx = batch->ctx;
1445 struct panfrost_device *device = pan_device(ctx->base.screen);
1446
1447 if (!ctx->sampler_count[stage])
1448 return;
1449
1450 if (device->quirks & IS_BIFROST) {
1451 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1452 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1453 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1454 transfer_size);
1455 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1456
1457 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1458 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1459
1460 postfix->sampler_descriptor = transfer.gpu;
1461 } else {
1462 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1463 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1464 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1465 transfer_size);
1466 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1467
1468 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1469 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1470
1471 postfix->sampler_descriptor = transfer.gpu;
1472 }
1473 }
1474
1475 void
1476 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1477 struct mali_vertex_tiler_postfix *vertex_postfix)
1478 {
1479 struct panfrost_context *ctx = batch->ctx;
1480
1481 if (!ctx->vertex)
1482 return;
1483
1484 struct panfrost_vertex_state *so = ctx->vertex;
1485
1486 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1487 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1488 sizeof(*so->hw) *
1489 PAN_MAX_ATTRIBUTE);
1490 }
1491
1492 void
1493 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1494 struct mali_vertex_tiler_postfix *vertex_postfix)
1495 {
1496 struct panfrost_context *ctx = batch->ctx;
1497 struct panfrost_vertex_state *so = ctx->vertex;
1498
1499 /* Staged mali_attr, and index into them. i =/= k, depending on the
1500 * vertex buffer mask and instancing. Twice as much room is allocated,
 1501 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1502 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1503 unsigned k = 0;
1504
1505 for (unsigned i = 0; i < so->num_elements; ++i) {
1506 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1507 * means duplicating some vertex buffers (who cares? aside from
1508 * maybe some caching implications but I somehow doubt that
1509 * matters) */
1510
1511 struct pipe_vertex_element *elem = &so->pipe[i];
1512 unsigned vbi = elem->vertex_buffer_index;
1513
1514 /* The exception to 1:1 mapping is that we can have multiple
1515 * entries (NPOT divisors), so we fixup anyways */
1516
1517 so->hw[i].index = k;
1518
1519 if (!(ctx->vb_mask & (1 << vbi)))
1520 continue;
1521
1522 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1523 struct panfrost_resource *rsrc;
1524
1525 rsrc = pan_resource(buf->buffer.resource);
1526 if (!rsrc)
1527 continue;
1528
1529 /* Align to 64 bytes by masking off the lower bits. This
1530 * will be adjusted back when we fixup the src_offset in
1531 * mali_attr_meta */
1532
1533 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1534 mali_ptr addr = raw_addr & ~63;
1535 unsigned chopped_addr = raw_addr - addr;
1536
1537 /* Add a dependency of the batch on the vertex buffer */
1538 panfrost_batch_add_bo(batch, rsrc->bo,
1539 PAN_BO_ACCESS_SHARED |
1540 PAN_BO_ACCESS_READ |
1541 PAN_BO_ACCESS_VERTEX_TILER);
1542
1543 /* Set common fields */
1544 attrs[k].elements = addr;
1545 attrs[k].stride = buf->stride;
1546
1547 /* Since we advanced the base pointer, we shrink the buffer
1548 * size */
1549 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1550
1551 /* We need to add the extra size we masked off (for
1552 * correctness) so the data doesn't get clamped away */
1553 attrs[k].size += chopped_addr;
1554
1555 /* For non-instancing make sure we initialize */
1556 attrs[k].shift = attrs[k].extra_flags = 0;
1557
1558 /* Instancing uses a dramatically different code path than
1559 * linear, so dispatch for the actual emission now that the
1560 * common code is finished */
1561
1562 unsigned divisor = elem->instance_divisor;
1563
1564 if (divisor && ctx->instance_count == 1) {
1565 /* Silly corner case where there's a divisor(=1) but
1566 * there's no legitimate instancing. So we want *every*
1567 * attribute to be the same. So set stride to zero so
1568 * we don't go anywhere. */
1569
1570 attrs[k].size = attrs[k].stride + chopped_addr;
1571 attrs[k].stride = 0;
1572 attrs[k++].elements |= MALI_ATTR_LINEAR;
1573 } else if (ctx->instance_count <= 1) {
1574 /* Normal, non-instanced attributes */
1575 attrs[k++].elements |= MALI_ATTR_LINEAR;
1576 } else {
1577 unsigned instance_shift = vertex_postfix->instance_shift;
1578 unsigned instance_odd = vertex_postfix->instance_odd;
1579
1580 k += panfrost_vertex_instanced(ctx->padded_count,
1581 instance_shift,
1582 instance_odd,
1583 divisor, &attrs[k]);
1584 }
1585 }
1586
1587 /* Add special gl_VertexID/gl_InstanceID buffers */
1588
1589 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1590 so->hw[PAN_VERTEX_ID].index = k++;
1591 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1592 so->hw[PAN_INSTANCE_ID].index = k++;
1593
1594 /* Upload whatever we emitted and go */
1595
1596 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1597 k * sizeof(*attrs));
1598 }
1599
1600 static mali_ptr
1601 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1602 unsigned stride, unsigned count)
1603 {
1604 /* Fill out the descriptor */
1605 slot->stride = stride;
1606 slot->size = stride * count;
1607 slot->shift = slot->extra_flags = 0;
1608
1609 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1610 slot->size);
1611
1612 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1613
1614 return transfer.gpu;
1615 }
1616
1617 static void
1618 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1619 unsigned stride, unsigned offset, unsigned count,
1620 struct pipe_stream_output_target *target)
1621 {
1622 /* Fill out the descriptor */
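 /* pipe_stream_output_info strides are counted in dwords; convert to bytes */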
1623 slot->stride = stride * 4;
1624 slot->shift = slot->extra_flags = 0;
1625
1626 unsigned max_size = target->buffer_size;
1627 unsigned expected_size = slot->stride * count;
1628
1629 slot->size = MIN2(max_size, expected_size);
1630
1631 /* Grab the BO and bind it to the batch */
1632 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1633
1634 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1635 * the perspective of the TILER and FRAGMENT.
1636 */
1637 panfrost_batch_add_bo(batch, bo,
1638 PAN_BO_ACCESS_SHARED |
1639 PAN_BO_ACCESS_RW |
1640 PAN_BO_ACCESS_VERTEX_TILER |
1641 PAN_BO_ACCESS_FRAGMENT);
1642
1643 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1644 slot->elements = addr;
1645 }
1646
1647 /* Given a shader and buffer indices, link varying metadata together */
1648
1649 static bool
1650 is_special_varying(gl_varying_slot loc)
1651 {
1652 switch (loc) {
1653 case VARYING_SLOT_POS:
1654 case VARYING_SLOT_PSIZ:
1655 case VARYING_SLOT_PNTC:
1656 case VARYING_SLOT_FACE:
1657 return true;
1658 default:
1659 return false;
1660 }
1661 }
1662
1663 static void
1664 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1665 signed general, signed gl_Position,
1666 signed gl_PointSize, signed gl_PointCoord,
1667 signed gl_FrontFacing)
1668 {
1669 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1670
1671 for (unsigned i = 0; i < ss->varying_count; ++i) {
1672 gl_varying_slot location = ss->varyings_loc[i];
1673 int index = -1;
1674
1675 switch (location) {
1676 case VARYING_SLOT_POS:
1677 index = gl_Position;
1678 break;
1679 case VARYING_SLOT_PSIZ:
1680 index = gl_PointSize;
1681 break;
1682 case VARYING_SLOT_PNTC:
1683 index = gl_PointCoord;
1684 break;
1685 case VARYING_SLOT_FACE:
1686 index = gl_FrontFacing;
1687 break;
1688 default:
1689 index = general;
1690 break;
1691 }
1692
1693 assert(index >= 0);
1694 out[i].index = index;
1695 }
1696 }
1697
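/* point_sprite_mask: bits 0-7 flag the generic texcoords TEX0..TEX7 that are
 * replaced with point sprite coordinates, bit 8 flags gl_PointCoord itself */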
1698 static bool
1699 has_point_coord(unsigned mask, gl_varying_slot loc)
1700 {
1701 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1702 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1703 else if (loc == VARYING_SLOT_PNTC)
1704 return (mask & (1 << 8));
1705 else
1706 return false;
1707 }
1708
1709 /* Helpers for manipulating stream out information so we can pack varyings
1710 * accordingly. Compute the src_offset for a given captured varying */
1711
1712 static struct pipe_stream_output *
1713 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1714 {
1715 for (unsigned i = 0; i < info->num_outputs; ++i) {
1716 if (info->output[i].register_index == loc)
1717 return &info->output[i];
1718 }
1719
1720 unreachable("Varying not captured");
1721 }
1722
1723 void
1724 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1725 unsigned vertex_count,
1726 struct mali_vertex_tiler_postfix *vertex_postfix,
1727 struct mali_vertex_tiler_postfix *tiler_postfix,
1728 union midgard_primitive_size *primitive_size)
1729 {
1730 /* Load the shaders */
1731 struct panfrost_context *ctx = batch->ctx;
1732 struct panfrost_device *device = pan_device(ctx->base.screen);
1733 struct panfrost_shader_state *vs, *fs;
1734 unsigned int num_gen_varyings = 0;
1735 size_t vs_size, fs_size;
1736
1737 /* Allocate the varying descriptor */
1738
1739 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1740 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1741 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1742 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1743
1744 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1745 vs_size +
1746 fs_size);
1747
1748 struct pipe_stream_output_info *so = &vs->stream_output;
1749
1750 /* Check if this varying is linked by us. This is the case for
1751 * general-purpose, non-captured varyings. If it is, link it. If it's
1752 * not, use the provided stream out information to determine the
1753 * offset, since it was already linked for us. */
1754
1755 for (unsigned i = 0; i < vs->varying_count; i++) {
1756 gl_varying_slot loc = vs->varyings_loc[i];
1757
1758 bool special = is_special_varying(loc);
1759 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1760
1761 if (captured) {
1762 struct pipe_stream_output *o = pan_get_so(so, loc);
1763
1764 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1765 vs->varyings[i].src_offset = dst_offset;
1766 } else if (!special) {
1767 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1768 }
1769 }
1770
1771         /* For the captured varyings, src_offset was already set above from the
1772          * stream out info: that layout is defined by the stream out info, not us */
1773
1774 /* Link up with fragment varyings */
1775 bool reads_point_coord = fs->reads_point_coord;
1776
1777 for (unsigned i = 0; i < fs->varying_count; i++) {
1778 gl_varying_slot loc = fs->varyings_loc[i];
1779 unsigned src_offset;
1780 signed vs_idx = -1;
1781
1782 /* Link up */
1783 for (unsigned j = 0; j < vs->varying_count; ++j) {
1784 if (vs->varyings_loc[j] == loc) {
1785 vs_idx = j;
1786 break;
1787 }
1788 }
1789
1790 /* Either assign or reuse */
1791 if (vs_idx >= 0)
1792 src_offset = vs->varyings[vs_idx].src_offset;
1793 else
1794 src_offset = 16 * (num_gen_varyings++);
1795
1796 fs->varyings[i].src_offset = src_offset;
1797
1798 if (has_point_coord(fs->point_sprite_mask, loc))
1799 reads_point_coord = true;
1800 }
1801
1802 memcpy(trans.cpu, vs->varyings, vs_size);
1803 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1804
1805 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1806
1807 /* Figure out how many streamout buffers could be bound */
1808 unsigned so_count = ctx->streamout.num_targets;
1809 for (unsigned i = 0; i < vs->varying_count; i++) {
1810 gl_varying_slot loc = vs->varyings_loc[i];
1811
1812 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1813 if (!captured) continue;
1814
1815 struct pipe_stream_output *o = pan_get_so(so, loc);
1816 so_count = MAX2(so_count, o->output_buffer + 1);
1817 }
1818
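        /* Varying buffer layout: the stream out buffers come first (one per
         * output buffer), then the general varying buffer, then one buffer
         * per special varying actually in use */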
1819 signed idx = so_count;
1820 signed general = idx++;
1821 signed gl_Position = idx++;
1822 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1823 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1824 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1825 signed gl_FragCoord = (fs->reads_frag_coord &&
1826 !(device->quirks & IS_BIFROST))
1827 ? (idx++) : -1;
1828
1829 /* Emit the stream out buffers */
1830
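        /* Stream out captures whole (decomposed) primitives, so the number of
         * captured vertices may differ from the draw's vertex count */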
1831 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1832 ctx->vertex_count);
1833
1834 for (unsigned i = 0; i < so_count; ++i) {
1835 if (i < ctx->streamout.num_targets) {
1836 panfrost_emit_streamout(batch, &varyings[i],
1837 so->stride[i],
1838 ctx->streamout.offsets[i],
1839 out_count,
1840 ctx->streamout.targets[i]);
1841 } else {
1842 /* Emit a dummy buffer */
1843 panfrost_emit_varyings(batch, &varyings[i],
1844 so->stride[i] * 4,
1845 out_count);
1846
1847 /* Clear the attribute type */
1848 varyings[i].elements &= ~0xF;
1849 }
1850 }
1851
1852 panfrost_emit_varyings(batch, &varyings[general],
1853 num_gen_varyings * 16,
1854 vertex_count);
1855
1856 mali_ptr varyings_p;
1857
1858 /* fp32 vec4 gl_Position */
1859 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1860 sizeof(float) * 4, vertex_count);
1861 tiler_postfix->position_varying = varyings_p;
1862 
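        /* One 16-bit gl_PointSize per vertex, allocated only when we are
         * actually drawing points */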
1864 if (panfrost_writes_point_size(ctx)) {
1865 varyings_p = panfrost_emit_varyings(batch,
1866 &varyings[gl_PointSize],
1867 2, vertex_count);
1868 primitive_size->pointer = varyings_p;
1869 }
1870
1871 if (gl_PointCoord >= 0)
1872 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1873
1874 if (gl_FrontFacing >= 0)
1875 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1876
1877 if (gl_FragCoord >= 0)
1878 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1879
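        /* Reading gl_PointCoord is not wired up for Bifrost here, hence the
         * assert */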
1880 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1881
1882 /* Let's go ahead and link varying meta to the buffer in question, now
1883 * that that information is available. VARYING_SLOT_POS is mapped to
1884          * gl_FragCoord for fragment shaders but gl_Position for vertex
1885          * shaders */
1886
1887 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1888 gl_PointSize, gl_PointCoord,
1889 gl_FrontFacing);
1890
1891 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1892 gl_FragCoord, gl_PointSize,
1893 gl_PointCoord, gl_FrontFacing);
1894
1895 /* Replace streamout */
1896
1897 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1898 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1899
1900 for (unsigned i = 0; i < vs->varying_count; i++) {
1901 gl_varying_slot loc = vs->varyings_loc[i];
1902
1903 /* If we write gl_PointSize from the vertex shader but don't
1904 * consume it, no memory will be allocated for it, so if we
1905 * attempted to write anyway we would dereference a NULL
1906 * pointer on the GPU. Midgard seems fine with this; Bifrost
1907 * faults. */
1908
1909 if (loc == VARYING_SLOT_PSIZ && !panfrost_writes_point_size(ctx))
1910 ovs[i].format = MALI_VARYING_DISCARD;
1911
1912 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1913 if (!captured)
1914 continue;
1915
1916 struct pipe_stream_output *o = pan_get_so(so, loc);
1917 ovs[i].index = o->output_buffer;
1918
1919 assert(o->stream == 0);
1920 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1921 | MALI_NR_CHANNELS(o->num_components);
1922
1923 if (device->quirks & HAS_SWIZZLES)
1924 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1925 else
1926 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1927
1928 /* Link to the fragment */
1929 signed fs_idx = -1;
1930
1931 /* Link up */
1932 for (unsigned j = 0; j < fs->varying_count; ++j) {
1933 if (fs->varyings_loc[j] == loc) {
1934 fs_idx = j;
1935 break;
1936 }
1937 }
1938
1939 if (fs_idx >= 0) {
1940 ofs[fs_idx].index = ovs[i].index;
1941 ofs[fs_idx].format = ovs[i].format;
1942 ofs[fs_idx].swizzle = ovs[i].swizzle;
1943 }
1944 }
1945
1946 /* Replace point sprite */
1947 for (unsigned i = 0; i < fs->varying_count; i++) {
1948 /* If we have a point sprite replacement, handle that here. We
1949                  * have to translate location first. TODO: Flip y in the shader
1950                  * instead; we're already keying, it's just a time crunch. */
1951
1952 if (has_point_coord(fs->point_sprite_mask,
1953 fs->varyings_loc[i])) {
1954 ofs[i].index = gl_PointCoord;
1955
1956 /* Swizzle out the z/w to 0/1 */
1957 ofs[i].format = MALI_RG16F;
1958 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1959 }
1960 }
1961
1962 /* Fix up unaligned addresses */
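        /* The low six bits of a stream out record's address carry the
         * addressing mode (MALI_ATTR_LINEAR here), so any misalignment has to
         * be stripped off and folded into the referencing records'
         * src_offsets instead */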
1963 for (unsigned i = 0; i < so_count; ++i) {
1964 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1965 continue;
1966
1967 unsigned align = (varyings[i].elements & 63);
1968
1969 /* While we're at it, the SO buffers are linear */
1970
1971 if (!align) {
1972 varyings[i].elements |= MALI_ATTR_LINEAR;
1973 continue;
1974 }
1975
1976 /* We need to adjust alignment */
1977 varyings[i].elements &= ~63;
1978 varyings[i].elements |= MALI_ATTR_LINEAR;
1979 varyings[i].size += align;
1980
1981 for (unsigned v = 0; v < vs->varying_count; ++v) {
1982 if (ovs[v].index != i)
1983 continue;
1984
1985 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1986 }
1987
1988 for (unsigned f = 0; f < fs->varying_count; ++f) {
1989 if (ofs[f].index != i)
1990 continue;
1991
1992 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1993 }
1994 }
1995
1996 varyings_p = panfrost_upload_transient(batch, varyings,
1997 idx * sizeof(*varyings));
1998 vertex_postfix->varyings = varyings_p;
1999 tiler_postfix->varyings = varyings_p;
2000
2001 vertex_postfix->varying_meta = trans.gpu;
2002 tiler_postfix->varying_meta = trans.gpu + vs_size;
2003 }
2004
2005 void
2006 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2007 struct mali_vertex_tiler_prefix *vertex_prefix,
2008 struct mali_vertex_tiler_postfix *vertex_postfix,
2009 struct mali_vertex_tiler_prefix *tiler_prefix,
2010 struct mali_vertex_tiler_postfix *tiler_postfix,
2011 union midgard_primitive_size *primitive_size)
2012 {
2013 struct panfrost_context *ctx = batch->ctx;
2014 struct panfrost_device *device = pan_device(ctx->base.screen);
2015 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2016 struct bifrost_payload_vertex bifrost_vertex = {0,};
2017 struct bifrost_payload_tiler bifrost_tiler = {0,};
2018 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2019 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2020 void *vp, *tp;
2021 size_t vp_size, tp_size;
2022
2023 if (device->quirks & IS_BIFROST) {
2024 bifrost_vertex.prefix = *vertex_prefix;
2025 bifrost_vertex.postfix = *vertex_postfix;
2026 vp = &bifrost_vertex;
2027 vp_size = sizeof(bifrost_vertex);
2028
2029 bifrost_tiler.prefix = *tiler_prefix;
2030 bifrost_tiler.tiler.primitive_size = *primitive_size;
2031 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2032 bifrost_tiler.postfix = *tiler_postfix;
2033 tp = &bifrost_tiler;
2034 tp_size = sizeof(bifrost_tiler);
2035 } else {
2036 midgard_vertex.prefix = *vertex_prefix;
2037 midgard_vertex.postfix = *vertex_postfix;
2038 vp = &midgard_vertex;
2039 vp_size = sizeof(midgard_vertex);
2040
2041 midgard_tiler.prefix = *tiler_prefix;
2042 midgard_tiler.postfix = *tiler_postfix;
2043 midgard_tiler.primitive_size = *primitive_size;
2044 tp = &midgard_tiler;
2045 tp_size = sizeof(midgard_tiler);
2046 }
2047
2048 if (wallpapering) {
2049 /* Inject in reverse order, with "predicted" job indices.
2050 * THIS IS A HACK XXX */
2051 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2052 batch->job_index + 2, tp, tp_size, true);
2053 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2054 vp, vp_size, true);
2055 return;
2056 }
2057
2058         /* If rasterizer discard is enabled, only submit the vertex job */
2059
2060 bool rasterizer_discard = ctx->rasterizer &&
2061 ctx->rasterizer->base.rasterizer_discard;
2062
2063 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2064 vp, vp_size, false);
2065
2066 if (rasterizer_discard)
2067 return;
2068
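        /* Otherwise submit the tiler job too; it depends on (runs after) the
         * vertex job */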
2069 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2070 false);
2071 }
2072
2073 /* TODO: stop hardcoding this */
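/* Hard-coded sample location table, uploaded verbatim as 48 (x, y) uint16_t
 * pairs; assuming 8-bit subpixel units, (128, 128) is the pixel centre */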
2074 mali_ptr
2075 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2076 {
2077 uint16_t locations[] = {
2078 128, 128,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 128, 128,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 0, 0,
2121 0, 0,
2122 0, 0,
2123 0, 0,
2124 0, 0,
2125 0, 0,
2126 };
2127
2128 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2129 }