panfrost: Fix transform feedback types
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
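/* Bifrost path: point the postfix's shared_memory field at a transient
 * mali_shared_memory descriptor that references the batch's scratchpad BO
 * (sized from the batch's stack requirements). */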
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
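/* Midgard path: the same postfix field instead carries a pointer to the
 * framebuffer descriptor (SFBD or MFBD). Transient space for it is reserved
 * on first use; MFBD pointers are tagged, presumably so the descriptor type
 * can be distinguished later. */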
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
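/* Common setup for the vertex/tiler prefix and postfix of a job: zero both
 * structures, set the gl_enables baseline, and attach either shared memory
 * (Bifrost) or the framebuffer pointer (Midgard). Rasterizer and occlusion
 * query state is only filled in when called for the fragment stage. */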
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it may last longer.
191 * Also get the bounds on the index buffer for the range accessed by the
192 * draw. We do these operations together because there are natural
193 * optimizations which require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
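/* Fill in the draw-related fields of the vertex/tiler prefix and postfix:
 * draw mode and flags, the index buffer (with min/max bounds) for indexed
 * draws, offset_start and the offset bias correction, and the padded vertex
 * count encoded as an instance shift/odd pair for instanced draws. */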
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
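/* Initialise the shader descriptor from the compiled shader state: the
 * entrypoint (ORed with its first tag), attribute/varying/texture/sampler
 * counts, and per-architecture fields whose exact meaning is not fully
 * understood (hence the unkN/magic constants below). */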
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
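/* Does the fragment shader actually need to run? It does if it has side
 * effects, if any render target's colour is written, or if it writes
 * depth/stencil; otherwise it may be skipped (see MIDGARD_SHADERLESS). */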
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
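/* Gather per-render-target blend state and encode it into the fragment
 * shader descriptor. On SFBD the single blend equation/shader lives inside
 * the shader meta; on MFBD and Bifrost it goes into the blend RT descriptors
 * appended after it (the rts argument). Also nulls out the shader fields
 * when panfrost_fs_required says execution can be skipped. */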
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 if (dev->quirks & IS_BIFROST) {
734 struct bifrost_blend_rt *brts = rts;
735
736 brts[i].flags = 0x200;
737 if (blend[i].is_shader) {
738 /* The blend shader's address needs to have
739 * the same top 32 bits as the fragment shader's.
740 * TODO: Ensure that's always the case.
741 */
742 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
743 (fs->bo->gpu & (0xffffffffull << 32)));
744 brts[i].shader = blend[i].shader.gpu;
745 brts[i].unk2 = 0x0;
746 } else {
747 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
748 const struct util_format_description *format_desc;
749 format_desc = util_format_description(format);
750
751 brts[i].equation = *blend[i].equation.equation;
752
753 /* TODO: this is a bit more complicated */
754 brts[i].constant = blend[i].equation.constant;
755
756 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
757 brts[i].unk2 = 0x19;
758
759 brts[i].shader_type = fs->blend_types[i];
760 }
761 } else {
762 struct midgard_blend_rt *mrts = rts;
763
764 if (!blend[i].no_colour) {
765 mrts[i].flags = 0x200;
766
767 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
768 (ctx->pipe_framebuffer.cbufs[i]) &&
769 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
770
771 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
772 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
773 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
774 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
775 }
776
777 if (blend[i].is_shader) {
778 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
779 } else {
780 mrts[i].blend.equation = *blend[i].equation.equation;
781 mrts[i].blend.constant = blend[i].equation.constant;
782 }
783 }
784 }
785 }
786
787 static void
788 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
789 struct mali_shader_meta *fragmeta,
790 void *rts)
791 {
792 const struct panfrost_device *dev = pan_device(ctx->base.screen);
793 struct panfrost_shader_state *fs;
794
795 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
796
797 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
798 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
799 fragmeta->unknown2_4 = 0x4e0;
800
801 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
802 * is required (independent of 32-bit/64-bit descriptors), or why it's
803 * not used on later GPU revisions. Otherwise, all shader jobs fault on
804 * these earlier chips (perhaps this is a chicken bit of some kind).
805 * More investigation is needed. */
806
807 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
808
809 if (dev->quirks & IS_BIFROST) {
810 /* TODO */
811 } else {
812 /* Depending on whether it's legal in the given shader, we try to
813 * enable early-z testing (or forward-pixel kill?) */
814
815 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
816 !fs->can_discard && !fs->writes_depth);
817
818 /* Add the writes Z/S flags if needed. */
819 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
820 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
821
822 /* Any time texturing is used, derivatives are implicitly calculated,
823 * so we need to enable helper invocations */
824
825 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
826 fs->helper_invocations);
827
828 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
829
830 bool depth_enabled = fs->writes_depth ||
831 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
832
833 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
834 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
835 }
836
837 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
838 panfrost_frag_meta_zsa_update(ctx, fragmeta);
839 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
840 }
841
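/* Upload the shader descriptor for a stage and point the postfix at it. For
 * fragment shaders, the per-render-target blend descriptors (Midgard MFBD or
 * Bifrost) are appended immediately after the mali_shader_meta in the same
 * transient allocation. */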
842 void
843 panfrost_emit_shader_meta(struct panfrost_batch *batch,
844 enum pipe_shader_type st,
845 struct mali_vertex_tiler_postfix *postfix)
846 {
847 struct panfrost_context *ctx = batch->ctx;
848 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
849
850 if (!ss) {
851 postfix->shader = 0;
852 return;
853 }
854
855 struct mali_shader_meta meta;
856
857 panfrost_shader_meta_init(ctx, st, &meta);
858
859 /* Add the shader BO to the batch. */
860 panfrost_batch_add_bo(batch, ss->bo,
861 PAN_BO_ACCESS_PRIVATE |
862 PAN_BO_ACCESS_READ |
863 panfrost_bo_access_for_stage(st));
864
865 mali_ptr shader_ptr;
866
867 if (st == PIPE_SHADER_FRAGMENT) {
868 struct panfrost_device *dev = pan_device(ctx->base.screen);
869 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
870 size_t desc_size = sizeof(meta);
871 void *rts = NULL;
872 struct panfrost_transfer xfer;
873 unsigned rt_size;
874
875 if (dev->quirks & MIDGARD_SFBD)
876 rt_size = 0;
877 else if (dev->quirks & IS_BIFROST)
878 rt_size = sizeof(struct bifrost_blend_rt);
879 else
880 rt_size = sizeof(struct midgard_blend_rt);
881
882 desc_size += rt_size * rt_count;
883
884 if (rt_size)
885 rts = rzalloc_size(ctx, rt_size * rt_count);
886
887 panfrost_frag_shader_meta_init(ctx, &meta, rts);
888
889 xfer = panfrost_allocate_transient(batch, desc_size);
890
891 memcpy(xfer.cpu, &meta, sizeof(meta));
892 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
893
894 if (rt_size)
895 ralloc_free(rts);
896
897 shader_ptr = xfer.gpu;
898 } else {
899 shader_ptr = panfrost_upload_transient(batch, &meta,
900 sizeof(meta));
901 }
902
903 postfix->shader = shader_ptr;
904 }
905
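/* Build a mali_viewport from Gallium viewport/scissor state: clip bounds are
 * left at +/- infinity, while the integer viewport rectangle is the viewport
 * intersected with the scissor (when rasterizer scissoring is enabled),
 * reordered so min <= max and clamped to the framebuffer size. */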
906 static void
907 panfrost_mali_viewport_init(struct panfrost_context *ctx,
908 struct mali_viewport *mvp)
909 {
910 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
911
912 /* Clip bounds are encoded as floats. The viewport itself is encoded as
913 * (somewhat) asymmetric ints. */
914
915 const struct pipe_scissor_state *ss = &ctx->scissor;
916
917 memset(mvp, 0, sizeof(*mvp));
918
919 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
920 * each direction. Clipping to the viewport in theory should work, but
921 * in practice causes issues when we're not explicitly trying to
922 * scissor */
923
924 *mvp = (struct mali_viewport) {
925 .clip_minx = -INFINITY,
926 .clip_miny = -INFINITY,
927 .clip_maxx = INFINITY,
928 .clip_maxy = INFINITY,
929 };
930
931 /* Always scissor to the viewport by default. */
932 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
933 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
934
935 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
936 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
937
938 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
939 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
940
941 /* Apply the scissor test */
942
943 unsigned minx, miny, maxx, maxy;
944
945 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
946 minx = MAX2(ss->minx, vp_minx);
947 miny = MAX2(ss->miny, vp_miny);
948 maxx = MIN2(ss->maxx, vp_maxx);
949 maxy = MIN2(ss->maxy, vp_maxy);
950 } else {
951 minx = vp_minx;
952 miny = vp_miny;
953 maxx = vp_maxx;
954 maxy = vp_maxy;
955 }
956
957 /* Hardware needs the min/max to be strictly ordered, so flip if we
958 * need to. The viewport transformation in the vertex shader will
959 * handle the negatives if we don't */
960
961 if (miny > maxy) {
962 unsigned temp = miny;
963 miny = maxy;
964 maxy = temp;
965 }
966
967 if (minx > maxx) {
968 unsigned temp = minx;
969 minx = maxx;
970 maxx = temp;
971 }
972
973 if (minz > maxz) {
974 float temp = minz;
975 minz = maxz;
976 maxz = temp;
977 }
978
979 /* Clamp to the framebuffer size as a last check */
980
981 minx = MIN2(ctx->pipe_framebuffer.width, minx);
982 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
983
984 miny = MIN2(ctx->pipe_framebuffer.height, miny);
985 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
986
987 /* Upload */
988
989 mvp->viewport0[0] = minx;
990 mvp->viewport1[0] = MALI_POSITIVE(maxx);
991
992 mvp->viewport0[1] = miny;
993 mvp->viewport1[1] = MALI_POSITIVE(maxy);
994
995 mvp->clip_minz = minz;
996 mvp->clip_maxz = maxz;
997 }
998
999 void
1000 panfrost_emit_viewport(struct panfrost_batch *batch,
1001 struct mali_vertex_tiler_postfix *tiler_postfix)
1002 {
1003 struct panfrost_context *ctx = batch->ctx;
1004 struct mali_viewport mvp;
1005
1006 panfrost_mali_viewport_init(batch->ctx, &mvp);
1007
1008 /* Update the job, unless we're doing wallpapering (whose lack of
1009 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1010 * just... be faster :) */
1011
1012 if (!ctx->wallpaper_batch)
1013 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1014 mvp.viewport0[1],
1015 mvp.viewport1[0] + 1,
1016 mvp.viewport1[1] + 1);
1017
1018 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1019 sizeof(mvp));
1020 }
1021
1022 static mali_ptr
1023 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1024 enum pipe_shader_type st,
1025 struct panfrost_constant_buffer *buf,
1026 unsigned index)
1027 {
1028 struct pipe_constant_buffer *cb = &buf->cb[index];
1029 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1030
1031 if (rsrc) {
1032 panfrost_batch_add_bo(batch, rsrc->bo,
1033 PAN_BO_ACCESS_SHARED |
1034 PAN_BO_ACCESS_READ |
1035 panfrost_bo_access_for_stage(st));
1036
1037 /* Alignment guaranteed by
1038 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1039 return rsrc->bo->gpu + cb->buffer_offset;
1040 } else if (cb->user_buffer) {
1041 return panfrost_upload_transient(batch,
1042 cb->user_buffer +
1043 cb->buffer_offset,
1044 cb->buffer_size);
1045 } else {
1046 unreachable("No constant buffer");
1047 }
1048 }
1049
1050 struct sysval_uniform {
1051 union {
1052 float f[4];
1053 int32_t i[4];
1054 uint32_t u[4];
1055 uint64_t du[2];
1056 };
1057 };
1058
1059 static void
1060 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1061 struct sysval_uniform *uniform)
1062 {
1063 struct panfrost_context *ctx = batch->ctx;
1064 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1065
1066 uniform->f[0] = vp->scale[0];
1067 uniform->f[1] = vp->scale[1];
1068 uniform->f[2] = vp->scale[2];
1069 }
1070
1071 static void
1072 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1073 struct sysval_uniform *uniform)
1074 {
1075 struct panfrost_context *ctx = batch->ctx;
1076 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1077
1078 uniform->f[0] = vp->translate[0];
1079 uniform->f[1] = vp->translate[1];
1080 uniform->f[2] = vp->translate[2];
1081 }
1082
1083 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1084 enum pipe_shader_type st,
1085 unsigned int sysvalid,
1086 struct sysval_uniform *uniform)
1087 {
1088 struct panfrost_context *ctx = batch->ctx;
1089 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1090 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1091 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1092 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1093
1094 assert(dim);
1095 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1096
1097 if (dim > 1)
1098 uniform->i[1] = u_minify(tex->texture->height0,
1099 tex->u.tex.first_level);
1100
1101 if (dim > 2)
1102 uniform->i[2] = u_minify(tex->texture->depth0,
1103 tex->u.tex.first_level);
1104
1105 if (is_array)
1106 uniform->i[dim] = tex->texture->array_size;
1107 }
1108
1109 static void
1110 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1111 enum pipe_shader_type st,
1112 unsigned ssbo_id,
1113 struct sysval_uniform *uniform)
1114 {
1115 struct panfrost_context *ctx = batch->ctx;
1116
1117 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1118 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1119
1120 /* Compute address */
1121 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1122
1123 panfrost_batch_add_bo(batch, bo,
1124 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1125 panfrost_bo_access_for_stage(st));
1126
1127 /* Upload address and size as sysval */
1128 uniform->du[0] = bo->gpu + sb.buffer_offset;
1129 uniform->u[2] = sb.buffer_size;
1130 }
1131
1132 static void
1133 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1134 enum pipe_shader_type st,
1135 unsigned samp_idx,
1136 struct sysval_uniform *uniform)
1137 {
1138 struct panfrost_context *ctx = batch->ctx;
1139 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1140
1141 uniform->f[0] = sampl->min_lod;
1142 uniform->f[1] = sampl->max_lod;
1143 uniform->f[2] = sampl->lod_bias;
1144
1145 /* Even without any errata, Midgard represents "no mipmapping" as
1146 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1147 * panfrost_create_sampler_state which also explains our choice of
1148 * epsilon value (again to keep behaviour consistent) */
1149
1150 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1151 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1152 }
1153
1154 static void
1155 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1156 struct sysval_uniform *uniform)
1157 {
1158 struct panfrost_context *ctx = batch->ctx;
1159
1160 uniform->u[0] = ctx->compute_grid->grid[0];
1161 uniform->u[1] = ctx->compute_grid->grid[1];
1162 uniform->u[2] = ctx->compute_grid->grid[2];
1163 }
1164
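/* Walk the shader's sysval table and fill one 16-byte uniform slot per
 * sysval (viewport scale/offset, texture sizes, SSBO address/size, workgroup
 * counts, sampler LOD parameters). These slots precede the user uniforms in
 * the buffer built by panfrost_emit_const_buf. */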
1165 static void
1166 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1167 struct panfrost_shader_state *ss,
1168 enum pipe_shader_type st)
1169 {
1170 struct sysval_uniform *uniforms = (void *)buf;
1171
1172 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1173 int sysval = ss->sysval[i];
1174
1175 switch (PAN_SYSVAL_TYPE(sysval)) {
1176 case PAN_SYSVAL_VIEWPORT_SCALE:
1177 panfrost_upload_viewport_scale_sysval(batch,
1178 &uniforms[i]);
1179 break;
1180 case PAN_SYSVAL_VIEWPORT_OFFSET:
1181 panfrost_upload_viewport_offset_sysval(batch,
1182 &uniforms[i]);
1183 break;
1184 case PAN_SYSVAL_TEXTURE_SIZE:
1185 panfrost_upload_txs_sysval(batch, st,
1186 PAN_SYSVAL_ID(sysval),
1187 &uniforms[i]);
1188 break;
1189 case PAN_SYSVAL_SSBO:
1190 panfrost_upload_ssbo_sysval(batch, st,
1191 PAN_SYSVAL_ID(sysval),
1192 &uniforms[i]);
1193 break;
1194 case PAN_SYSVAL_NUM_WORK_GROUPS:
1195 panfrost_upload_num_work_groups_sysval(batch,
1196 &uniforms[i]);
1197 break;
1198 case PAN_SYSVAL_SAMPLER:
1199 panfrost_upload_sampler_sysval(batch, st,
1200 PAN_SYSVAL_ID(sysval),
1201 &uniforms[i]);
1202 break;
1203 default:
1204 assert(0);
1205 }
1206 }
1207 }
1208
1209 static const void *
1210 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1211 unsigned index)
1212 {
1213 struct pipe_constant_buffer *cb = &buf->cb[index];
1214 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1215
1216 if (rsrc)
1217 return rsrc->bo->cpu;
1218 else if (cb->user_buffer)
1219 return cb->user_buffer;
1220 else
1221 unreachable("No constant buffer");
1222 }
1223
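/* Emit uniforms and uniform buffers for a stage. A transient buffer holding
 * sysvals followed by the contents of constant buffer 0 is exposed to the
 * shader as UBO #0; the remaining UBOs point directly at their backing
 * storage, with disabled or empty slots stubbed out to a poison address. */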
1224 void
1225 panfrost_emit_const_buf(struct panfrost_batch *batch,
1226 enum pipe_shader_type stage,
1227 struct mali_vertex_tiler_postfix *postfix)
1228 {
1229 struct panfrost_context *ctx = batch->ctx;
1230 struct panfrost_shader_variants *all = ctx->shader[stage];
1231
1232 if (!all)
1233 return;
1234
1235 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1236
1237 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1238
1239 /* Uniforms are implicitly UBO #0 */
1240 bool has_uniforms = buf->enabled_mask & (1 << 0);
1241
1242 /* Allocate room for the sysvals and the uniforms */
1243 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1244 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1245 size_t size = sys_size + uniform_size;
1246 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1247 size);
1248
1249 /* Upload sysvals requested by the shader */
1250 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1251
1252 /* Upload uniforms */
1253 if (has_uniforms && uniform_size) {
1254 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1255 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1256 }
1257
1258 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1259 * uploaded */
1260
1261 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1262 assert(ubo_count >= 1);
1263
1264 size_t sz = sizeof(uint64_t) * ubo_count;
1265 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1266 int uniform_count = ss->uniform_count;
1267
1268 /* Upload uniforms as a UBO */
1269 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1270
1271 /* The rest are honest-to-goodness UBOs */
1272
1273 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1274 size_t usz = buf->cb[ubo].buffer_size;
1275 bool enabled = buf->enabled_mask & (1 << ubo);
1276 bool empty = usz == 0;
1277
1278 if (!enabled || empty) {
1279 /* Stub out disabled UBOs to catch accesses */
1280 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1281 continue;
1282 }
1283
1284 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1285 buf, ubo);
1286
1287 unsigned bytes_per_field = 16;
1288 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1289 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1290 }
1291
1292 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1293 postfix->uniforms = transfer.gpu;
1294 postfix->uniform_buffers = ubufs;
1295
1296 buf->dirty_mask = 0;
1297 }
1298
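/* Emit the shared memory descriptor for a compute dispatch: the per-workgroup
 * size is rounded up to a power of two (at least 128 bytes) and encoded as a
 * shift, backed by a batch-shared BO sized for the whole grid. */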
1299 void
1300 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1301 const struct pipe_grid_info *info,
1302 struct midgard_payload_vertex_tiler *vtp)
1303 {
1304 struct panfrost_context *ctx = batch->ctx;
1305 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1306 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1307 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1308 128));
1309 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1310 info->grid[2] * 4;
1311 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1312 shared_size,
1313 1);
1314
1315 struct mali_shared_memory shared = {
1316 .shared_memory = bo->gpu,
1317 .shared_workgroup_count =
1318 util_logbase2_ceil(info->grid[0]) +
1319 util_logbase2_ceil(info->grid[1]) +
1320 util_logbase2_ceil(info->grid[2]),
1321 .shared_unk1 = 0x2,
1322 .shared_shift = util_logbase2(single_size) - 1
1323 };
1324
1325 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1326 sizeof(shared));
1327 }
1328
1329 static mali_ptr
1330 panfrost_get_tex_desc(struct panfrost_batch *batch,
1331 enum pipe_shader_type st,
1332 struct panfrost_sampler_view *view)
1333 {
1334 if (!view)
1335 return (mali_ptr) 0;
1336
1337 struct pipe_sampler_view *pview = &view->base;
1338 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1339
1340 /* Add the BO to the job so it's retained until the job is done. */
1341
1342 panfrost_batch_add_bo(batch, rsrc->bo,
1343 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1344 panfrost_bo_access_for_stage(st));
1345
1346 panfrost_batch_add_bo(batch, view->midgard_bo,
1347 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1348 panfrost_bo_access_for_stage(st));
1349
1350 return view->midgard_bo->gpu;
1351 }
1352
1353 void
1354 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1355 enum pipe_shader_type stage,
1356 struct mali_vertex_tiler_postfix *postfix)
1357 {
1358 struct panfrost_context *ctx = batch->ctx;
1359 struct panfrost_device *device = pan_device(ctx->base.screen);
1360
1361 if (!ctx->sampler_view_count[stage])
1362 return;
1363
1364 if (device->quirks & IS_BIFROST) {
1365 struct bifrost_texture_descriptor *descriptors;
1366
1367 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1368 ctx->sampler_view_count[stage]);
1369
1370 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1371 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1372 struct pipe_sampler_view *pview = &view->base;
1373 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1374
1375 /* Add the BOs to the job so they are retained until the job is done. */
1376
1377 panfrost_batch_add_bo(batch, rsrc->bo,
1378 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1379 panfrost_bo_access_for_stage(stage));
1380
1381 panfrost_batch_add_bo(batch, view->bifrost_bo,
1382 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1383 panfrost_bo_access_for_stage(stage));
1384
1385 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1386 }
1387
1388 postfix->textures = panfrost_upload_transient(batch,
1389 descriptors,
1390 sizeof(struct bifrost_texture_descriptor) *
1391 ctx->sampler_view_count[stage]);
1392
1393 free(descriptors);
1394 } else {
1395 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1396
1397 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1398 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1399 ctx->sampler_views[stage][i]);
1400
1401 postfix->textures = panfrost_upload_transient(batch,
1402 trampolines,
1403 sizeof(uint64_t) *
1404 ctx->sampler_view_count[stage]);
1405 }
1406 }
1407
1408 void
1409 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1410 enum pipe_shader_type stage,
1411 struct mali_vertex_tiler_postfix *postfix)
1412 {
1413 struct panfrost_context *ctx = batch->ctx;
1414 struct panfrost_device *device = pan_device(ctx->base.screen);
1415
1416 if (!ctx->sampler_count[stage])
1417 return;
1418
1419 if (device->quirks & IS_BIFROST) {
1420 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1421 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1422 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1423 transfer_size);
1424 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1425
1426 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1427 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1428
1429 postfix->sampler_descriptor = transfer.gpu;
1430 } else {
1431 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1432 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1433 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1434 transfer_size);
1435 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1436
1437 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1438 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1439
1440 postfix->sampler_descriptor = transfer.gpu;
1441 }
1442 }
1443
1444 void
1445 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1446 struct mali_vertex_tiler_postfix *vertex_postfix)
1447 {
1448 struct panfrost_context *ctx = batch->ctx;
1449
1450 if (!ctx->vertex)
1451 return;
1452
1453 struct panfrost_vertex_state *so = ctx->vertex;
1454
1455 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1456 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1457 sizeof(*so->hw) *
1458 PAN_MAX_ATTRIBUTE);
1459 }
1460
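/* Emit the vertex buffer (mali_attr) records themselves. Buffer addresses are
 * aligned down to 64 bytes (the chopped-off offset is added back through the
 * attribute meta src_offset fixup), instanced attributes get the divisor
 * shift/odd encoding, and synthetic gl_VertexID / gl_InstanceID buffers are
 * appended at the end. */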
1461 void
1462 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1463 struct mali_vertex_tiler_postfix *vertex_postfix)
1464 {
1465 struct panfrost_context *ctx = batch->ctx;
1466 struct panfrost_vertex_state *so = ctx->vertex;
1467
1468 /* Staged mali_attr, and index into them. i =/= k, depending on the
1469 * vertex buffer mask and instancing. Twice as much room is allocated,
1470 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1471 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1472 unsigned k = 0;
1473
1474 for (unsigned i = 0; i < so->num_elements; ++i) {
1475 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1476 * means duplicating some vertex buffers (who cares? aside from
1477 * maybe some caching implications but I somehow doubt that
1478 * matters) */
1479
1480 struct pipe_vertex_element *elem = &so->pipe[i];
1481 unsigned vbi = elem->vertex_buffer_index;
1482
1483 /* The exception to 1:1 mapping is that we can have multiple
1484 * entries (NPOT divisors), so we fix up anyway */
1485
1486 so->hw[i].index = k;
1487
1488 if (!(ctx->vb_mask & (1 << vbi)))
1489 continue;
1490
1491 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1492 struct panfrost_resource *rsrc;
1493
1494 rsrc = pan_resource(buf->buffer.resource);
1495 if (!rsrc)
1496 continue;
1497
1498 /* Align to 64 bytes by masking off the lower bits. This
1499 * will be adjusted back when we fixup the src_offset in
1500 * mali_attr_meta */
1501
1502 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1503 mali_ptr addr = raw_addr & ~63;
1504 unsigned chopped_addr = raw_addr - addr;
1505
1506 /* Add a dependency of the batch on the vertex buffer */
1507 panfrost_batch_add_bo(batch, rsrc->bo,
1508 PAN_BO_ACCESS_SHARED |
1509 PAN_BO_ACCESS_READ |
1510 PAN_BO_ACCESS_VERTEX_TILER);
1511
1512 /* Set common fields */
1513 attrs[k].elements = addr;
1514 attrs[k].stride = buf->stride;
1515
1516 /* Since we advanced the base pointer, we shrink the buffer
1517 * size */
1518 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1519
1520 /* We need to add the extra size we masked off (for
1521 * correctness) so the data doesn't get clamped away */
1522 attrs[k].size += chopped_addr;
1523
1524 /* For non-instancing make sure we initialize */
1525 attrs[k].shift = attrs[k].extra_flags = 0;
1526
1527 /* Instancing uses a dramatically different code path than
1528 * linear, so dispatch for the actual emission now that the
1529 * common code is finished */
1530
1531 unsigned divisor = elem->instance_divisor;
1532
1533 if (divisor && ctx->instance_count == 1) {
1534 /* Silly corner case where there's a divisor(=1) but
1535 * there's no legitimate instancing. So we want *every*
1536 * attribute to be the same. So set stride to zero so
1537 * we don't go anywhere. */
1538
1539 attrs[k].size = attrs[k].stride + chopped_addr;
1540 attrs[k].stride = 0;
1541 attrs[k++].elements |= MALI_ATTR_LINEAR;
1542 } else if (ctx->instance_count <= 1) {
1543 /* Normal, non-instanced attributes */
1544 attrs[k++].elements |= MALI_ATTR_LINEAR;
1545 } else {
1546 unsigned instance_shift = vertex_postfix->instance_shift;
1547 unsigned instance_odd = vertex_postfix->instance_odd;
1548
1549 k += panfrost_vertex_instanced(ctx->padded_count,
1550 instance_shift,
1551 instance_odd,
1552 divisor, &attrs[k]);
1553 }
1554 }
1555
1556 /* Add special gl_VertexID/gl_InstanceID buffers */
1557
1558 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1559 so->hw[PAN_VERTEX_ID].index = k++;
1560 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1561 so->hw[PAN_INSTANCE_ID].index = k++;
1562
1563 /* Upload whatever we emitted and go */
1564
1565 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1566 k * sizeof(*attrs));
1567 }
1568
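/* Allocate transient storage for one varying stream (stride * count bytes),
 * fill in its mali_attr record and return the GPU address of the storage. */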
1569 static mali_ptr
1570 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1571 unsigned stride, unsigned count)
1572 {
1573 /* Fill out the descriptor */
1574 slot->stride = stride;
1575 slot->size = stride * count;
1576 slot->shift = slot->extra_flags = 0;
1577
1578 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1579 slot->size);
1580
1581 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1582
1583 return transfer.gpu;
1584 }
1585
1586 static void
1587 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1588 unsigned stride, unsigned offset, unsigned count,
1589 struct pipe_stream_output_target *target)
1590 {
1591 /* Fill out the descriptor */
1592 slot->stride = stride * 4;
1593 slot->shift = slot->extra_flags = 0;
1594
1595 unsigned max_size = target->buffer_size;
1596 unsigned expected_size = slot->stride * count;
1597
1598 slot->size = MIN2(max_size, expected_size);
1599
1600 /* Grab the BO and bind it to the batch */
1601 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1602
1603 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1604 * the perspective of the TILER and FRAGMENT.
1605 */
1606 panfrost_batch_add_bo(batch, bo,
1607 PAN_BO_ACCESS_SHARED |
1608 PAN_BO_ACCESS_RW |
1609 PAN_BO_ACCESS_VERTEX_TILER |
1610 PAN_BO_ACCESS_FRAGMENT);
1611
1612 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1613 slot->elements = addr;
1614 }
1615
1616 /* Given a shader and buffer indices, link varying metadata together */
1617
1618 static bool
1619 is_special_varying(gl_varying_slot loc)
1620 {
1621 switch (loc) {
1622 case VARYING_SLOT_POS:
1623 case VARYING_SLOT_PSIZ:
1624 case VARYING_SLOT_PNTC:
1625 case VARYING_SLOT_FACE:
1626 return true;
1627 default:
1628 return false;
1629 }
1630 }
1631
1632 static void
1633 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1634 signed general, signed gl_Position,
1635 signed gl_PointSize, signed gl_PointCoord,
1636 signed gl_FrontFacing)
1637 {
1638 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1639
1640 for (unsigned i = 0; i < ss->varying_count; ++i) {
1641 gl_varying_slot location = ss->varyings_loc[i];
1642 int index = -1;
1643
1644 switch (location) {
1645 case VARYING_SLOT_POS:
1646 index = gl_Position;
1647 break;
1648 case VARYING_SLOT_PSIZ:
1649 index = gl_PointSize;
1650 break;
1651 case VARYING_SLOT_PNTC:
1652 index = gl_PointCoord;
1653 break;
1654 case VARYING_SLOT_FACE:
1655 index = gl_FrontFacing;
1656 break;
1657 default:
1658 index = general;
1659 break;
1660 }
1661
1662 assert(index >= 0);
1663 out[i].index = index;
1664 }
1665 }
1666
1667 static bool
1668 has_point_coord(unsigned mask, gl_varying_slot loc)
1669 {
1670 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1671 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1672 else if (loc == VARYING_SLOT_PNTC)
1673 return (mask & (1 << 8));
1674 else
1675 return false;
1676 }
1677
1678 /* Helpers for manipulating stream out information so we can pack varyings
1679 * accordingly. Compute the src_offset for a given captured varying */
1680
1681 static struct pipe_stream_output *
1682 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1683 {
1684 for (unsigned i = 0; i < info->num_outputs; ++i) {
1685 if (info->output[i].register_index == loc)
1686 return &info->output[i];
1687 }
1688
1689 unreachable("Varying not captured");
1690 }
1691
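/* Emit the varying buffers and the per-shader varying metadata. Buffer
 * indices are assigned in order: one slot per streamout target, then the
 * general varying buffer, gl_Position, and optional gl_PointSize,
 * gl_PointCoord, gl_FrontFacing and gl_FragCoord slots. Vertex and fragment
 * metadata are linked by matching varying locations so both agree on each
 * varying's src_offset. */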
1692 void
1693 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1694 unsigned vertex_count,
1695 struct mali_vertex_tiler_postfix *vertex_postfix,
1696 struct mali_vertex_tiler_postfix *tiler_postfix,
1697 union midgard_primitive_size *primitive_size)
1698 {
1699 /* Load the shaders */
1700 struct panfrost_context *ctx = batch->ctx;
1701 struct panfrost_shader_state *vs, *fs;
1702 unsigned int num_gen_varyings = 0;
1703 size_t vs_size, fs_size;
1704
1705 /* Allocate the varying descriptor */
1706
1707 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1708 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1709 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1710 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1711
1712 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1713 vs_size +
1714 fs_size);
1715
1716 struct pipe_stream_output_info *so = &vs->stream_output;
1717
1718 /* Check if this varying is linked by us. This is the case for
1719 * general-purpose, non-captured varyings. If it is, link it. If it's
1720 * not, use the provided stream out information to determine the
1721 * offset, since it was already linked for us. */
1722
1723 for (unsigned i = 0; i < vs->varying_count; i++) {
1724 gl_varying_slot loc = vs->varyings_loc[i];
1725
1726 bool special = is_special_varying(loc);
1727 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1728
1729 if (captured) {
1730 struct pipe_stream_output *o = pan_get_so(so, loc);
1731
1732 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1733 vs->varyings[i].src_offset = dst_offset;
1734 } else if (!special) {
1735 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1736 }
1737 }
1738
1739 /* Conversely, we need to set src_offset for the captured varyings.
1740 * Here, the layout is defined by the stream out info, not us */
1741
1742 /* Link up with fragment varyings */
1743 bool reads_point_coord = fs->reads_point_coord;
1744
1745 for (unsigned i = 0; i < fs->varying_count; i++) {
1746 gl_varying_slot loc = fs->varyings_loc[i];
1747 unsigned src_offset;
1748 signed vs_idx = -1;
1749
1750 /* Link up */
1751 for (unsigned j = 0; j < vs->varying_count; ++j) {
1752 if (vs->varyings_loc[j] == loc) {
1753 vs_idx = j;
1754 break;
1755 }
1756 }
1757
1758 /* Either assign or reuse */
1759 if (vs_idx >= 0)
1760 src_offset = vs->varyings[vs_idx].src_offset;
1761 else
1762 src_offset = 16 * (num_gen_varyings++);
1763
1764 fs->varyings[i].src_offset = src_offset;
1765
1766 if (has_point_coord(fs->point_sprite_mask, loc))
1767 reads_point_coord = true;
1768 }
1769
1770 memcpy(trans.cpu, vs->varyings, vs_size);
1771 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1772
1773 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1774
1775 /* Figure out how many streamout buffers could be bound */
1776 unsigned so_count = ctx->streamout.num_targets;
1777 for (unsigned i = 0; i < vs->varying_count; i++) {
1778 gl_varying_slot loc = vs->varyings_loc[i];
1779
1780 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1781 if (!captured) continue;
1782
1783 struct pipe_stream_output *o = pan_get_so(so, loc);
1784 so_count = MAX2(so_count, o->output_buffer + 1);
1785 }
1786
1787 signed idx = so_count;
1788 signed general = idx++;
1789 signed gl_Position = idx++;
1790 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1791 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1792 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1793 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
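/* Illustrative layout of the buffer indices assigned above, assuming two
 * bound streamout targets, a vertex shader that writes gl_PointSize and a
 * fragment shader that reads gl_PointCoord and gl_FrontFacing but not
 * gl_FragCoord:
 *
 *     0..1 : streamout buffers
 *     2    : general varyings
 *     3    : gl_Position
 *     4    : gl_PointSize
 *     5    : gl_PointCoord
 *     6    : gl_FrontFacing
 *            (gl_FragCoord = -1, i.e. no buffer)
 *
 * idx would end up at 7, which is also the number of mali_attr records
 * uploaded at the end of this function. */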
1794
1795 /* Emit the stream out buffers */
1796
1797 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1798 ctx->vertex_count);
1799
1800 for (unsigned i = 0; i < so_count; ++i) {
1801 if (i < ctx->streamout.num_targets) {
1802 panfrost_emit_streamout(batch, &varyings[i],
1803 so->stride[i],
1804 ctx->streamout.offsets[i],
1805 out_count,
1806 ctx->streamout.targets[i]);
1807 } else {
1808 /* Emit a dummy buffer */
1809 panfrost_emit_varyings(batch, &varyings[i],
1810 so->stride[i] * 4,
1811 out_count);
1812
1813 /* Clear the attribute type */
1814 varyings[i].elements &= ~0xF;
1815 }
1816 }
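/* Note on the dummy path above: when a captured output references a buffer
 * index with no bound streamout target, a transient buffer sized for
 * out_count vertices at the captured stride in bytes (the stride is stored
 * in dwords, hence the * 4) is still allocated so the writes land in
 * scratch memory, and masking off the low four bits clears the attribute
 * type, turning the record back into a plain buffer. */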
1817
1818 panfrost_emit_varyings(batch, &varyings[general],
1819 num_gen_varyings * 16,
1820 vertex_count);
1821
1822 mali_ptr varyings_p;
1823
1824 /* fp32 vec4 gl_Position */
1825 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1826 sizeof(float) * 4, vertex_count);
1827 tiler_postfix->position_varying = varyings_p;
1828
1830 if (panfrost_writes_point_size(ctx)) {
1831 varyings_p = panfrost_emit_varyings(batch,
1832 &varyings[gl_PointSize],
1833 2, vertex_count);
1834 primitive_size->pointer = varyings_p;
1835 }
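/* The 2 bytes per vertex allocated above presumably hold a half-float point
 * size; primitive_size->pointer hands that per-vertex buffer to the tiler in
 * place of a constant size. */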
1836
1837 if (reads_point_coord)
1838 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1839
1840 if (fs->reads_face)
1841 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1842
1843 if (fs->reads_frag_coord)
1844 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1845
1846 struct panfrost_device *device = pan_device(ctx->base.screen);
1847 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1848
1849 /* Let's go ahead and link varying meta to the buffer in question, now
1850 * that that information is available. VARYING_SLOT_POS is mapped to
1851 * gl_FragCoord for fragment shaders but gl_Position for vertex shaders
1852 * */
1853
1854 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1855 gl_PointSize, gl_PointCoord,
1856 gl_FrontFacing);
1857
1858 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1859 gl_FragCoord, gl_PointSize,
1860 gl_PointCoord, gl_FrontFacing);
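/* Note the argument order in the second call: the fragment side passes
 * gl_FragCoord where the vertex side passed gl_Position, which is what maps
 * VARYING_SLOT_POS to the gl_FragCoord buffer for fragment shaders as
 * described above. */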
1861
1862 /* Replace streamout */
1863
1864 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1865 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1866
1867 for (unsigned i = 0; i < vs->varying_count; i++) {
1868 gl_varying_slot loc = vs->varyings_loc[i];
1869
1870 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1871 if (!captured)
1872 continue;
1873
1874 struct pipe_stream_output *o = pan_get_so(so, loc);
1875 ovs[i].index = o->output_buffer;
1876
1877 assert(o->stream == 0);
1878 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1879 | MALI_NR_CHANNELS(o->num_components);
1880
1881 if (device->quirks & HAS_SWIZZLES)
1882 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1883 else
1884 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1885
1886 /* Link to the fragment */
1887 signed fs_idx = -1;
1888
1889 /* Link up */
1890 for (unsigned j = 0; j < fs->varying_count; ++j) {
1891 if (fs->varyings_loc[j] == loc) {
1892 fs_idx = j;
1893 break;
1894 }
1895 }
1896
1897 if (fs_idx >= 0) {
1898 ofs[fs_idx].index = ovs[i].index;
1899 ofs[fs_idx].format = ovs[i].format;
1900 ofs[fs_idx].swizzle = ovs[i].swizzle;
1901 }
1902 }
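/* Illustrative example: if a captured output has num_components == 3, the
 * rewrite above clears the channel-count field of the vertex record
 * (MALI_NR_CHANNELS(4) presumably being that field's all-ones value),
 * re-inserts MALI_NR_CHANNELS(3), picks a 3-component identity swizzle for
 * the GPU family, and copies index/format/swizzle to the matching fragment
 * record so both stages agree on the packed streamout layout. */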
1903
1904 /* Replace point sprite */
1905 for (unsigned i = 0; i < fs->varying_count; i++) {
1906 /* If we have a point sprite replacement, handle that here. We
1907 * have to translate the location first. TODO: flip y in the
1908 * shader; we already key shader variants, it's just a time crunch */
1909
1910 if (has_point_coord(fs->point_sprite_mask,
1911 fs->varyings_loc[i])) {
1912 ofs[i].index = gl_PointCoord;
1913
1914 /* Swizzle out the z/w to 0/1 */
1915 ofs[i].format = MALI_RG16F;
1916 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1917 }
1918 }
1919
1920 /* Fix up unaligned addresses */
1921 for (unsigned i = 0; i < so_count; ++i) {
1922 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1923 continue;
1924
1925 unsigned align = (varyings[i].elements & 63);
1926
1927 /* While we're at it, the SO buffers are linear */
1928
1929 if (!align) {
1930 varyings[i].elements |= MALI_ATTR_LINEAR;
1931 continue;
1932 }
1933
1934 /* We need to adjust alignment */
1935 varyings[i].elements &= ~63;
1936 varyings[i].elements |= MALI_ATTR_LINEAR;
1937 varyings[i].size += align;
1938
1939 for (unsigned v = 0; v < vs->varying_count; ++v) {
1940 if (ovs[v].index != i)
1941 continue;
1942
1943 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1944 }
1945
1946 for (unsigned f = 0; f < fs->varying_count; ++f) {
1947 if (ofs[f].index != i)
1948 continue;
1949
1950 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1951 }
1952 }
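/* Worked example (illustrative): suppose a streamout record ends up with an
 * address whose low six bits are 36. Then align = 36, the record is rebased
 * down to the 64-byte boundary and marked MALI_ATTR_LINEAR, its size grows
 * by 36 bytes so the original range stays covered, and every vertex/fragment
 * attr_meta pointing at that buffer gets src_offset += 36 so reads still hit
 * the intended byte. */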
1953
1954 varyings_p = panfrost_upload_transient(batch, varyings,
1955 idx * sizeof(*varyings));
1956 vertex_postfix->varyings = varyings_p;
1957 tiler_postfix->varyings = varyings_p;
1958
1959 vertex_postfix->varying_meta = trans.gpu;
1960 tiler_postfix->varying_meta = trans.gpu + vs_size;
1961 }
1962
1963 void
1964 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1965 struct mali_vertex_tiler_prefix *vertex_prefix,
1966 struct mali_vertex_tiler_postfix *vertex_postfix,
1967 struct mali_vertex_tiler_prefix *tiler_prefix,
1968 struct mali_vertex_tiler_postfix *tiler_postfix,
1969 union midgard_primitive_size *primitive_size)
1970 {
1971 struct panfrost_context *ctx = batch->ctx;
1972 struct panfrost_device *device = pan_device(ctx->base.screen);
1973 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1974 struct bifrost_payload_vertex bifrost_vertex = {0,};
1975 struct bifrost_payload_tiler bifrost_tiler = {0,};
1976 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1977 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1978 void *vp, *tp;
1979 size_t vp_size, tp_size;
1980
1981 if (device->quirks & IS_BIFROST) {
1982 bifrost_vertex.prefix = *vertex_prefix;
1983 bifrost_vertex.postfix = *vertex_postfix;
1984 vp = &bifrost_vertex;
1985 vp_size = sizeof(bifrost_vertex);
1986
1987 bifrost_tiler.prefix = *tiler_prefix;
1988 bifrost_tiler.tiler.primitive_size = *primitive_size;
1989 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1990 bifrost_tiler.postfix = *tiler_postfix;
1991 tp = &bifrost_tiler;
1992 tp_size = sizeof(bifrost_tiler);
1993 } else {
1994 midgard_vertex.prefix = *vertex_prefix;
1995 midgard_vertex.postfix = *vertex_postfix;
1996 vp = &midgard_vertex;
1997 vp_size = sizeof(midgard_vertex);
1998
1999 midgard_tiler.prefix = *tiler_prefix;
2000 midgard_tiler.postfix = *tiler_postfix;
2001 midgard_tiler.primitive_size = *primitive_size;
2002 tp = &midgard_tiler;
2003 tp_size = sizeof(midgard_tiler);
2004 }
2005
2006 if (wallpapering) {
2007 /* Inject in reverse order, with "predicted" job indices.
2008 * THIS IS A HACK XXX */
2009 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2010 batch->job_index + 2, tp, tp_size, true);
2011 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2012 vp, vp_size, true);
2013 return;
2014 }
2015
2016 /* If rasterizer discard is enabled, only submit the vertex job */
2017
2018 bool rasterizer_discard = ctx->rasterizer &&
2019 ctx->rasterizer->base.rasterizer_discard;
2020
2021 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2022 vp, vp_size, false);
2023
2024 if (rasterizer_discard)
2025 return;
2026
2027 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2028 false);
2029 }
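/* In the normal (non-wallpaper) path above, the vertex job's index is passed
 * in the tiler job's dependency slot, so tiling for the draw only starts once
 * vertex shading has finished; with rasterizer discard only the vertex job is
 * submitted since nothing will be rasterized. */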
2030
2031 /* TODO: stop hardcoding this */
2032 mali_ptr
2033 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2034 {
2035 uint16_t locations[] = {
2036 128, 128,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 128, 128,
2069 0, 0,
2070 0, 0,
2071 0, 0,
2072 0, 0,
2073 0, 0,
2074 0, 0,
2075 0, 0,
2076 0, 0,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 };
2085
2086 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2087 }