panfrost: Fix gl_PointSize out of GL_POINTS
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
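/* (MALI_MFBD lands in the low bits of the GPU address, presumably marking
 * the descriptor as the multi-target MFBD layout rather than SFBD) */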
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
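/* When the vertex shader does not write gl_PointSize, the primitive size is
 * a constant taken from the rasterizer state: point_size for point primitives
 * and line_width otherwise. When it does write gl_PointSize, the size is
 * instead sourced per-vertex from a varying (see MALI_DRAW_VARYING_SIZE and
 * panfrost_emit_varying_descriptor), so the constant is left untouched. */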
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
 189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
 190 * good for the duration of the draw (transient), though it may last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
 267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
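 /* Only [min_index, max_index] is referenced, so that is how many vertex
 * shader invocations we launch; offset_start positions the attribute reads
 * and offset_bias_correction (-min_index) presumably rebases fetched indices
 * so that min_index maps back to invocation zero. */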
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
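 /* instance_shift/instance_odd express the padded count as an odd number
 * times a power of two: padded = (2 * odd + 1) << shift. For example,
 * padded = 12 gives shift = ctz(12) = 2, odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 = 12. */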
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 return;
554 }
555
556 bool msaa = ctx->rasterizer->base.multisample;
557
558 /* TODO: Sample size */
559 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
560 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
561 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
562 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
563
 564 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
565
566 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
567 ctx->rasterizer->base.offset_tri);
568 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
569 ctx->rasterizer->base.offset_tri);
570 }
571
572 static void
573 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
574 struct mali_shader_meta *fragmeta)
575 {
576 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
577 int zfunc = PIPE_FUNC_ALWAYS;
578
579 if (!zsa) {
580 struct pipe_stencil_state default_stencil = {
581 .enabled = 0,
582 .func = PIPE_FUNC_ALWAYS,
583 .fail_op = MALI_STENCIL_KEEP,
584 .zfail_op = MALI_STENCIL_KEEP,
585 .zpass_op = MALI_STENCIL_KEEP,
586 .writemask = 0xFF,
587 .valuemask = 0xFF
588 };
589
590 panfrost_make_stencil_state(&default_stencil,
591 &fragmeta->stencil_front);
592 fragmeta->stencil_mask_front = default_stencil.writemask;
593 fragmeta->stencil_back = fragmeta->stencil_front;
594 fragmeta->stencil_mask_back = default_stencil.writemask;
595 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
596 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
597 } else {
598 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
599 zsa->stencil[0].enabled);
600 panfrost_make_stencil_state(&zsa->stencil[0],
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
603 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
604
605 /* If back-stencil is not enabled, use the front values */
606
607 if (zsa->stencil[1].enabled) {
608 panfrost_make_stencil_state(&zsa->stencil[1],
609 &fragmeta->stencil_back);
610 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
611 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
612 } else {
613 fragmeta->stencil_back = fragmeta->stencil_front;
614 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
615 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
616 }
617
618 if (zsa->depth.enabled)
619 zfunc = zsa->depth.func;
620
621 /* Depth state (TODO: Refactor) */
622
623 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
624 zsa->depth.writemask);
625 }
626
627 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
628 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
629 }
630
631 static bool
632 panfrost_fs_required(
633 struct panfrost_shader_state *fs,
634 struct panfrost_blend_final *blend,
635 unsigned rt_count)
636 {
637 /* If we generally have side effects */
638 if (fs->fs_sidefx)
639 return true;
640
641 /* If colour is written we need to execute */
642 for (unsigned i = 0; i < rt_count; ++i) {
643 if (!blend[i].no_colour)
644 return true;
645 }
646
647 /* If depth is written and not implied we need to execute.
648 * TODO: Predicate on Z/S writes being enabled */
649 return (fs->writes_depth || fs->writes_stencil);
650 }
651
652 static void
653 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
654 struct mali_shader_meta *fragmeta,
655 void *rts)
656 {
657 const struct panfrost_device *dev = pan_device(ctx->base.screen);
658 struct panfrost_shader_state *fs;
659 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
660
661 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
662 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
663 !ctx->blend->base.dither);
664
665 /* Get blending setup */
666 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
667
668 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
669 unsigned shader_offset = 0;
670 struct panfrost_bo *shader_bo = NULL;
671
672 for (unsigned c = 0; c < rt_count; ++c)
673 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
674 &shader_offset);
675
676 /* Disable shader execution if we can */
677 if (dev->quirks & MIDGARD_SHADERLESS
678 && !panfrost_fs_required(fs, blend, rt_count)) {
679 fragmeta->shader = 0;
680 fragmeta->attribute_count = 0;
681 fragmeta->varying_count = 0;
682 fragmeta->texture_count = 0;
683 fragmeta->sampler_count = 0;
684
685 /* This feature is not known to work on Bifrost */
686 fragmeta->midgard1.work_count = 1;
687 fragmeta->midgard1.uniform_count = 0;
688 fragmeta->midgard1.uniform_buffer_count = 0;
689 }
690
691 /* If there is a blend shader, work registers are shared. We impose 8
692 * work registers as a limit for blend shaders. Should be lower XXX */
693
694 if (!(dev->quirks & IS_BIFROST)) {
695 for (unsigned c = 0; c < rt_count; ++c) {
696 if (blend[c].is_shader) {
697 fragmeta->midgard1.work_count =
698 MAX2(fragmeta->midgard1.work_count, 8);
699 }
700 }
701 }
702
703 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
704 * copied to the blend_meta appended (by convention), but this is the
705 * field actually read by the hardware. (Or maybe both are read...?).
706 * Specify the last RTi with a blend shader. */
707
708 fragmeta->blend.shader = 0;
709
710 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
711 if (!blend[rt].is_shader)
712 continue;
713
714 fragmeta->blend.shader = blend[rt].shader.gpu |
715 blend[rt].shader.first_tag;
716 break;
717 }
718
719 if (dev->quirks & MIDGARD_SFBD) {
 720 /* On platforms with only a single render target (SFBD), the blend
 721 * information is stored inside the shader meta itself. We additionally
722 * need to signal CAN_DISCARD for nontrivial blend modes (so
723 * we're able to read back the destination buffer) */
724
725 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
726 blend[0].is_shader);
727
728 if (!blend[0].is_shader) {
729 fragmeta->blend.equation = *blend[0].equation.equation;
730 fragmeta->blend.constant = blend[0].equation.constant;
731 }
732
733 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
734 !blend[0].no_blending || fs->can_discard);
735 return;
736 }
737
738 if (dev->quirks & IS_BIFROST) {
739 bool no_blend = true;
740
741 for (unsigned i = 0; i < rt_count; ++i)
742 no_blend &= (blend[i].no_blending | blend[i].no_colour);
743
744 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
745 !fs->can_discard && !fs->writes_depth && no_blend);
746 }
747
748 /* Additional blend descriptor tacked on for jobs using MFBD */
749
750 for (unsigned i = 0; i < rt_count; ++i) {
751 unsigned flags = 0;
752
753 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
754 flags = 0x200;
755
756 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
757 (ctx->pipe_framebuffer.cbufs[i]) &&
758 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
759
760 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
761 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
762 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
763 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
764 }
765
766 if (dev->quirks & IS_BIFROST) {
767 struct bifrost_blend_rt *brts = rts;
768
769 brts[i].flags = flags;
770
771 if (blend[i].is_shader) {
772 /* The blend shader's address needs to be at
773 * the same top 32 bit as the fragment shader.
774 * TODO: Ensure that's always the case.
775 */
776 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
777 (fs->bo->gpu & (0xffffffffull << 32)));
778 brts[i].shader = blend[i].shader.gpu;
779 brts[i].unk2 = 0x0;
780 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
781 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
782 const struct util_format_description *format_desc;
783 format_desc = util_format_description(format);
784
785 brts[i].equation = *blend[i].equation.equation;
786
787 /* TODO: this is a bit more complicated */
788 brts[i].constant = blend[i].equation.constant;
789
790 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
791
792 /* 0x19 disables blending and forces REPLACE
793 * mode (equivalent to rgb_mode = alpha_mode =
794 * x122, colour mask = 0xF). 0x1a allows
795 * blending. */
796 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
797
798 brts[i].shader_type = fs->blend_types[i];
799 } else {
800 /* Dummy attachment for depth-only */
801 brts[i].unk2 = 0x3;
802 brts[i].shader_type = fs->blend_types[i];
803 }
804 } else {
805 struct midgard_blend_rt *mrts = rts;
806 mrts[i].flags = flags;
807
808 if (blend[i].is_shader) {
809 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
810 } else {
811 mrts[i].blend.equation = *blend[i].equation.equation;
812 mrts[i].blend.constant = blend[i].equation.constant;
813 }
814 }
815 }
816 }
817
818 static void
819 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
820 struct mali_shader_meta *fragmeta,
821 void *rts)
822 {
823 const struct panfrost_device *dev = pan_device(ctx->base.screen);
824 struct panfrost_shader_state *fs;
825
826 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
827
828 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
829 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
830 fragmeta->unknown2_4 = 0x4e0;
831
832 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
833 * is required (independent of 32-bit/64-bit descriptors), or why it's
834 * not used on later GPU revisions. Otherwise, all shader jobs fault on
835 * these earlier chips (perhaps this is a chicken bit of some kind).
836 * More investigation is needed. */
837
838 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
839
840 if (dev->quirks & IS_BIFROST) {
841 /* TODO */
842 } else {
 843 /* Depending on whether it's legal to do so in the given shader, we try to
844 * enable early-z testing. TODO: respect e-z force */
845
846 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
847 !fs->can_discard && !fs->writes_depth && !fs->writes_global);
848
849 /* Add the writes Z/S flags if needed. */
850 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
851 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
852
853 /* Any time texturing is used, derivatives are implicitly calculated,
854 * so we need to enable helper invocations */
855
856 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
857 fs->helper_invocations);
858
859 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
860
861 bool depth_enabled = fs->writes_depth ||
862 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
863
864 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
865 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
866 }
867
868 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
869 panfrost_frag_meta_zsa_update(ctx, fragmeta);
870 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
871 }
872
873 void
874 panfrost_emit_shader_meta(struct panfrost_batch *batch,
875 enum pipe_shader_type st,
876 struct mali_vertex_tiler_postfix *postfix)
877 {
878 struct panfrost_context *ctx = batch->ctx;
879 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
880
881 if (!ss) {
882 postfix->shader = 0;
883 return;
884 }
885
886 struct mali_shader_meta meta;
887
888 panfrost_shader_meta_init(ctx, st, &meta);
889
890 /* Add the shader BO to the batch. */
891 panfrost_batch_add_bo(batch, ss->bo,
892 PAN_BO_ACCESS_PRIVATE |
893 PAN_BO_ACCESS_READ |
894 panfrost_bo_access_for_stage(st));
895
896 mali_ptr shader_ptr;
897
898 if (st == PIPE_SHADER_FRAGMENT) {
899 struct panfrost_device *dev = pan_device(ctx->base.screen);
900 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
901 size_t desc_size = sizeof(meta);
902 void *rts = NULL;
903 struct panfrost_transfer xfer;
904 unsigned rt_size;
905
906 if (dev->quirks & MIDGARD_SFBD)
907 rt_size = 0;
908 else if (dev->quirks & IS_BIFROST)
909 rt_size = sizeof(struct bifrost_blend_rt);
910 else
911 rt_size = sizeof(struct midgard_blend_rt);
912
913 desc_size += rt_size * rt_count;
914
915 if (rt_size)
916 rts = rzalloc_size(ctx, rt_size * rt_count);
917
918 panfrost_frag_shader_meta_init(ctx, &meta, rts);
919
920 xfer = panfrost_allocate_transient(batch, desc_size);
921
922 memcpy(xfer.cpu, &meta, sizeof(meta));
923 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
924
925 if (rt_size)
926 ralloc_free(rts);
927
928 shader_ptr = xfer.gpu;
929 } else {
930 shader_ptr = panfrost_upload_transient(batch, &meta,
931 sizeof(meta));
932 }
933
934 postfix->shader = shader_ptr;
935 }
936
937 static void
938 panfrost_mali_viewport_init(struct panfrost_context *ctx,
939 struct mali_viewport *mvp)
940 {
941 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
942
943 /* Clip bounds are encoded as floats. The viewport itself is encoded as
944 * (somewhat) asymmetric ints. */
945
946 const struct pipe_scissor_state *ss = &ctx->scissor;
947
948 memset(mvp, 0, sizeof(*mvp));
949
950 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
951 * each direction. Clipping to the viewport in theory should work, but
952 * in practice causes issues when we're not explicitly trying to
953 * scissor */
954
955 *mvp = (struct mali_viewport) {
956 .clip_minx = -INFINITY,
957 .clip_miny = -INFINITY,
958 .clip_maxx = INFINITY,
959 .clip_maxy = INFINITY,
960 };
961
962 /* Always scissor to the viewport by default. */
963 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
964 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
965
966 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
967 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
968
969 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
970 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
971
972 /* Apply the scissor test */
973
974 unsigned minx, miny, maxx, maxy;
975
976 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
977 minx = MAX2(ss->minx, vp_minx);
978 miny = MAX2(ss->miny, vp_miny);
979 maxx = MIN2(ss->maxx, vp_maxx);
980 maxy = MIN2(ss->maxy, vp_maxy);
981 } else {
982 minx = vp_minx;
983 miny = vp_miny;
984 maxx = vp_maxx;
985 maxy = vp_maxy;
986 }
987
988 /* Hardware needs the min/max to be strictly ordered, so flip if we
989 * need to. The viewport transformation in the vertex shader will
990 * handle the negatives if we don't */
991
992 if (miny > maxy) {
993 unsigned temp = miny;
994 miny = maxy;
995 maxy = temp;
996 }
997
998 if (minx > maxx) {
999 unsigned temp = minx;
1000 minx = maxx;
1001 maxx = temp;
1002 }
1003
1004 if (minz > maxz) {
1005 float temp = minz;
1006 minz = maxz;
1007 maxz = temp;
1008 }
1009
1010 /* Clamp to the framebuffer size as a last check */
1011
1012 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1013 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1014
1015 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1016 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1017
1018 /* Upload */
1019
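 /* viewport1 holds the inclusive maximum; MALI_POSITIVE biases it by -1,
 * which panfrost_emit_viewport undoes (+1) when unioning the batch scissor. */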
1020 mvp->viewport0[0] = minx;
1021 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1022
1023 mvp->viewport0[1] = miny;
1024 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1025
1026 mvp->clip_minz = minz;
1027 mvp->clip_maxz = maxz;
1028 }
1029
1030 void
1031 panfrost_emit_viewport(struct panfrost_batch *batch,
1032 struct mali_vertex_tiler_postfix *tiler_postfix)
1033 {
1034 struct panfrost_context *ctx = batch->ctx;
1035 struct mali_viewport mvp;
1036
1037 panfrost_mali_viewport_init(batch->ctx, &mvp);
1038
1039 /* Update the job, unless we're doing wallpapering (whose lack of
1040 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1041 * just... be faster :) */
1042
1043 if (!ctx->wallpaper_batch)
1044 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1045 mvp.viewport0[1],
1046 mvp.viewport1[0] + 1,
1047 mvp.viewport1[1] + 1);
1048
1049 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1050 sizeof(mvp));
1051 }
1052
1053 static mali_ptr
1054 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1055 enum pipe_shader_type st,
1056 struct panfrost_constant_buffer *buf,
1057 unsigned index)
1058 {
1059 struct pipe_constant_buffer *cb = &buf->cb[index];
1060 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1061
1062 if (rsrc) {
1063 panfrost_batch_add_bo(batch, rsrc->bo,
1064 PAN_BO_ACCESS_SHARED |
1065 PAN_BO_ACCESS_READ |
1066 panfrost_bo_access_for_stage(st));
1067
 1068 /* Alignment guaranteed by
1069 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1070 return rsrc->bo->gpu + cb->buffer_offset;
1071 } else if (cb->user_buffer) {
1072 return panfrost_upload_transient(batch,
1073 cb->user_buffer +
1074 cb->buffer_offset,
1075 cb->buffer_size);
1076 } else {
1077 unreachable("No constant buffer");
1078 }
1079 }
1080
1081 struct sysval_uniform {
1082 union {
1083 float f[4];
1084 int32_t i[4];
1085 uint32_t u[4];
1086 uint64_t du[2];
1087 };
1088 };
1089
1090 static void
1091 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1092 struct sysval_uniform *uniform)
1093 {
1094 struct panfrost_context *ctx = batch->ctx;
1095 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1096
1097 uniform->f[0] = vp->scale[0];
1098 uniform->f[1] = vp->scale[1];
1099 uniform->f[2] = vp->scale[2];
1100 }
1101
1102 static void
1103 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1104 struct sysval_uniform *uniform)
1105 {
1106 struct panfrost_context *ctx = batch->ctx;
1107 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1108
1109 uniform->f[0] = vp->translate[0];
1110 uniform->f[1] = vp->translate[1];
1111 uniform->f[2] = vp->translate[2];
1112 }
1113
1114 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1115 enum pipe_shader_type st,
1116 unsigned int sysvalid,
1117 struct sysval_uniform *uniform)
1118 {
1119 struct panfrost_context *ctx = batch->ctx;
1120 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1121 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1122 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1123 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1124
1125 assert(dim);
1126 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1127
1128 if (dim > 1)
1129 uniform->i[1] = u_minify(tex->texture->height0,
1130 tex->u.tex.first_level);
1131
1132 if (dim > 2)
1133 uniform->i[2] = u_minify(tex->texture->depth0,
1134 tex->u.tex.first_level);
1135
1136 if (is_array)
1137 uniform->i[dim] = tex->texture->array_size;
1138 }
1139
1140 static void
1141 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1142 enum pipe_shader_type st,
1143 unsigned ssbo_id,
1144 struct sysval_uniform *uniform)
1145 {
1146 struct panfrost_context *ctx = batch->ctx;
1147
1148 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1149 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1150
1151 /* Compute address */
1152 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1153
1154 panfrost_batch_add_bo(batch, bo,
1155 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1156 panfrost_bo_access_for_stage(st));
1157
1158 /* Upload address and size as sysval */
1159 uniform->du[0] = bo->gpu + sb.buffer_offset;
1160 uniform->u[2] = sb.buffer_size;
1161 }
1162
1163 static void
1164 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1165 enum pipe_shader_type st,
1166 unsigned samp_idx,
1167 struct sysval_uniform *uniform)
1168 {
1169 struct panfrost_context *ctx = batch->ctx;
1170 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1171
1172 uniform->f[0] = sampl->min_lod;
1173 uniform->f[1] = sampl->max_lod;
1174 uniform->f[2] = sampl->lod_bias;
1175
1176 /* Even without any errata, Midgard represents "no mipmapping" as
1177 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1178 * panfrost_create_sampler_state which also explains our choice of
1179 * epsilon value (again to keep behaviour consistent) */
1180
1181 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1182 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1183 }
1184
1185 static void
1186 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1187 struct sysval_uniform *uniform)
1188 {
1189 struct panfrost_context *ctx = batch->ctx;
1190
1191 uniform->u[0] = ctx->compute_grid->grid[0];
1192 uniform->u[1] = ctx->compute_grid->grid[1];
1193 uniform->u[2] = ctx->compute_grid->grid[2];
1194 }
1195
1196 static void
1197 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1198 struct panfrost_shader_state *ss,
1199 enum pipe_shader_type st)
1200 {
1201 struct sysval_uniform *uniforms = (void *)buf;
1202
1203 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1204 int sysval = ss->sysval[i];
1205
1206 switch (PAN_SYSVAL_TYPE(sysval)) {
1207 case PAN_SYSVAL_VIEWPORT_SCALE:
1208 panfrost_upload_viewport_scale_sysval(batch,
1209 &uniforms[i]);
1210 break;
1211 case PAN_SYSVAL_VIEWPORT_OFFSET:
1212 panfrost_upload_viewport_offset_sysval(batch,
1213 &uniforms[i]);
1214 break;
1215 case PAN_SYSVAL_TEXTURE_SIZE:
1216 panfrost_upload_txs_sysval(batch, st,
1217 PAN_SYSVAL_ID(sysval),
1218 &uniforms[i]);
1219 break;
1220 case PAN_SYSVAL_SSBO:
1221 panfrost_upload_ssbo_sysval(batch, st,
1222 PAN_SYSVAL_ID(sysval),
1223 &uniforms[i]);
1224 break;
1225 case PAN_SYSVAL_NUM_WORK_GROUPS:
1226 panfrost_upload_num_work_groups_sysval(batch,
1227 &uniforms[i]);
1228 break;
1229 case PAN_SYSVAL_SAMPLER:
1230 panfrost_upload_sampler_sysval(batch, st,
1231 PAN_SYSVAL_ID(sysval),
1232 &uniforms[i]);
1233 break;
1234 default:
1235 assert(0);
1236 }
1237 }
1238 }
1239
1240 static const void *
1241 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1242 unsigned index)
1243 {
1244 struct pipe_constant_buffer *cb = &buf->cb[index];
1245 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1246
1247 if (rsrc)
1248 return rsrc->bo->cpu;
1249 else if (cb->user_buffer)
1250 return cb->user_buffer;
1251 else
1252 unreachable("No constant buffer");
1253 }
1254
1255 void
1256 panfrost_emit_const_buf(struct panfrost_batch *batch,
1257 enum pipe_shader_type stage,
1258 struct mali_vertex_tiler_postfix *postfix)
1259 {
1260 struct panfrost_context *ctx = batch->ctx;
1261 struct panfrost_shader_variants *all = ctx->shader[stage];
1262
1263 if (!all)
1264 return;
1265
1266 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1267
1268 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1269
1270 /* Uniforms are implicitly UBO #0 */
1271 bool has_uniforms = buf->enabled_mask & (1 << 0);
1272
1273 /* Allocate room for the sysval and the uniforms */
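 /* Each sysval occupies one 16-byte vec4 slot (see struct sysval_uniform) */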
1274 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1275 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1276 size_t size = sys_size + uniform_size;
1277 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1278 size);
1279
1280 /* Upload sysvals requested by the shader */
1281 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1282
1283 /* Upload uniforms */
1284 if (has_uniforms && uniform_size) {
1285 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1286 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1287 }
1288
1289 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1290 * uploaded */
1291
1292 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1293 assert(ubo_count >= 1);
1294
1295 size_t sz = sizeof(uint64_t) * ubo_count;
1296 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1297 int uniform_count = ss->uniform_count;
1298
1299 /* Upload uniforms as a UBO */
1300 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1301
1302 /* The rest are honest-to-goodness UBOs */
1303
1304 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1305 size_t usz = buf->cb[ubo].buffer_size;
1306 bool enabled = buf->enabled_mask & (1 << ubo);
1307 bool empty = usz == 0;
1308
1309 if (!enabled || empty) {
1310 /* Stub out disabled UBOs to catch accesses */
1311 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1312 continue;
1313 }
1314
1315 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1316 buf, ubo);
1317
1318 unsigned bytes_per_field = 16;
1319 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1320 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1321 }
1322
1323 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1324 postfix->uniforms = transfer.gpu;
1325 postfix->uniform_buffers = ubufs;
1326
1327 buf->dirty_mask = 0;
1328 }
1329
1330 void
1331 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1332 const struct pipe_grid_info *info,
1333 struct midgard_payload_vertex_tiler *vtp)
1334 {
1335 struct panfrost_context *ctx = batch->ctx;
1336 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1337 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
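 /* The per-workgroup shared size is rounded up to a power of two (at least
 * 128 bytes) so it can be encoded as the shared_shift below. */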
1338 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1339 128));
1340 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1341 info->grid[2] * 4;
1342 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1343 shared_size,
1344 1);
1345
1346 struct mali_shared_memory shared = {
1347 .shared_memory = bo->gpu,
1348 .shared_workgroup_count =
1349 util_logbase2_ceil(info->grid[0]) +
1350 util_logbase2_ceil(info->grid[1]) +
1351 util_logbase2_ceil(info->grid[2]),
1352 .shared_unk1 = 0x2,
1353 .shared_shift = util_logbase2(single_size) - 1
1354 };
1355
1356 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1357 sizeof(shared));
1358 }
1359
1360 static mali_ptr
1361 panfrost_get_tex_desc(struct panfrost_batch *batch,
1362 enum pipe_shader_type st,
1363 struct panfrost_sampler_view *view)
1364 {
1365 if (!view)
1366 return (mali_ptr) 0;
1367
1368 struct pipe_sampler_view *pview = &view->base;
1369 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1370
1371 /* Add the BO to the job so it's retained until the job is done. */
1372
1373 panfrost_batch_add_bo(batch, rsrc->bo,
1374 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1375 panfrost_bo_access_for_stage(st));
1376
1377 panfrost_batch_add_bo(batch, view->midgard_bo,
1378 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1379 panfrost_bo_access_for_stage(st));
1380
1381 return view->midgard_bo->gpu;
1382 }
1383
1384 void
1385 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1386 enum pipe_shader_type stage,
1387 struct mali_vertex_tiler_postfix *postfix)
1388 {
1389 struct panfrost_context *ctx = batch->ctx;
1390 struct panfrost_device *device = pan_device(ctx->base.screen);
1391
1392 if (!ctx->sampler_view_count[stage])
1393 return;
1394
1395 if (device->quirks & IS_BIFROST) {
1396 struct bifrost_texture_descriptor *descriptors;
1397
1398 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1399 ctx->sampler_view_count[stage]);
1400
1401 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1402 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1403 struct pipe_sampler_view *pview = &view->base;
1404 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1405
1406 /* Add the BOs to the job so they are retained until the job is done. */
1407
1408 panfrost_batch_add_bo(batch, rsrc->bo,
1409 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1410 panfrost_bo_access_for_stage(stage));
1411
1412 panfrost_batch_add_bo(batch, view->bifrost_bo,
1413 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1414 panfrost_bo_access_for_stage(stage));
1415
1416 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1417 }
1418
1419 postfix->textures = panfrost_upload_transient(batch,
1420 descriptors,
1421 sizeof(struct bifrost_texture_descriptor) *
1422 ctx->sampler_view_count[stage]);
1423
1424 free(descriptors);
1425 } else {
1426 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1427
1428 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1429 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1430 ctx->sampler_views[stage][i]);
1431
1432 postfix->textures = panfrost_upload_transient(batch,
1433 trampolines,
1434 sizeof(uint64_t) *
1435 ctx->sampler_view_count[stage]);
1436 }
1437 }
1438
1439 void
1440 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1441 enum pipe_shader_type stage,
1442 struct mali_vertex_tiler_postfix *postfix)
1443 {
1444 struct panfrost_context *ctx = batch->ctx;
1445 struct panfrost_device *device = pan_device(ctx->base.screen);
1446
1447 if (!ctx->sampler_count[stage])
1448 return;
1449
1450 if (device->quirks & IS_BIFROST) {
1451 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1452 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1453 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1454 transfer_size);
1455 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1456
1457 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1458 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1459
1460 postfix->sampler_descriptor = transfer.gpu;
1461 } else {
1462 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1463 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1464 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1465 transfer_size);
1466 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1467
1468 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1469 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1470
1471 postfix->sampler_descriptor = transfer.gpu;
1472 }
1473 }
1474
1475 void
1476 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1477 struct mali_vertex_tiler_postfix *vertex_postfix)
1478 {
1479 struct panfrost_context *ctx = batch->ctx;
1480
1481 if (!ctx->vertex)
1482 return;
1483
1484 struct panfrost_vertex_state *so = ctx->vertex;
1485
1486 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1487 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1488 sizeof(*so->hw) *
1489 PAN_MAX_ATTRIBUTE);
1490 }
1491
1492 void
1493 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1494 struct mali_vertex_tiler_postfix *vertex_postfix)
1495 {
1496 struct panfrost_context *ctx = batch->ctx;
1497 struct panfrost_vertex_state *so = ctx->vertex;
1498
1499 /* Staged mali_attr, and index into them. i =/= k, depending on the
1500 * vertex buffer mask and instancing. Twice as much room is allocated,
 1501 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1502 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1503 unsigned k = 0;
1504
1505 for (unsigned i = 0; i < so->num_elements; ++i) {
1506 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1507 * means duplicating some vertex buffers (who cares? aside from
1508 * maybe some caching implications but I somehow doubt that
1509 * matters) */
1510
1511 struct pipe_vertex_element *elem = &so->pipe[i];
1512 unsigned vbi = elem->vertex_buffer_index;
1513
1514 /* The exception to 1:1 mapping is that we can have multiple
1515 * entries (NPOT divisors), so we fixup anyways */
1516
1517 so->hw[i].index = k;
1518
1519 if (!(ctx->vb_mask & (1 << vbi)))
1520 continue;
1521
1522 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1523 struct panfrost_resource *rsrc;
1524
1525 rsrc = pan_resource(buf->buffer.resource);
1526 if (!rsrc)
1527 continue;
1528
1529 /* Align to 64 bytes by masking off the lower bits. This
1530 * will be adjusted back when we fixup the src_offset in
1531 * mali_attr_meta */
1532
1533 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1534 mali_ptr addr = raw_addr & ~63;
1535 unsigned chopped_addr = raw_addr - addr;
1536
1537 /* Add a dependency of the batch on the vertex buffer */
1538 panfrost_batch_add_bo(batch, rsrc->bo,
1539 PAN_BO_ACCESS_SHARED |
1540 PAN_BO_ACCESS_READ |
1541 PAN_BO_ACCESS_VERTEX_TILER);
1542
1543 /* Set common fields */
1544 attrs[k].elements = addr;
1545 attrs[k].stride = buf->stride;
1546
1547 /* Since we advanced the base pointer, we shrink the buffer
1548 * size */
1549 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1550
1551 /* We need to add the extra size we masked off (for
1552 * correctness) so the data doesn't get clamped away */
1553 attrs[k].size += chopped_addr;
1554
1555 /* For non-instancing make sure we initialize */
1556 attrs[k].shift = attrs[k].extra_flags = 0;
1557
1558 /* Instancing uses a dramatically different code path than
1559 * linear, so dispatch for the actual emission now that the
1560 * common code is finished */
1561
1562 unsigned divisor = elem->instance_divisor;
1563
1564 if (divisor && ctx->instance_count == 1) {
1565 /* Silly corner case where there's a divisor(=1) but
1566 * there's no legitimate instancing. So we want *every*
1567 * attribute to be the same. So set stride to zero so
1568 * we don't go anywhere. */
1569
1570 attrs[k].size = attrs[k].stride + chopped_addr;
1571 attrs[k].stride = 0;
1572 attrs[k++].elements |= MALI_ATTR_LINEAR;
1573 } else if (ctx->instance_count <= 1) {
1574 /* Normal, non-instanced attributes */
1575 attrs[k++].elements |= MALI_ATTR_LINEAR;
1576 } else {
1577 unsigned instance_shift = vertex_postfix->instance_shift;
1578 unsigned instance_odd = vertex_postfix->instance_odd;
1579
1580 k += panfrost_vertex_instanced(ctx->padded_count,
1581 instance_shift,
1582 instance_odd,
1583 divisor, &attrs[k]);
1584 }
1585 }
1586
1587 /* Add special gl_VertexID/gl_InstanceID buffers */
1588
1589 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1590 so->hw[PAN_VERTEX_ID].index = k++;
1591 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1592 so->hw[PAN_INSTANCE_ID].index = k++;
1593
1594 /* Upload whatever we emitted and go */
1595
1596 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1597 k * sizeof(*attrs));
1598 }
1599
1600 static mali_ptr
1601 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1602 unsigned stride, unsigned count)
1603 {
1604 /* Fill out the descriptor */
1605 slot->stride = stride;
1606 slot->size = stride * count;
1607 slot->shift = slot->extra_flags = 0;
1608
1609 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1610 slot->size);
1611
1612 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1613
1614 return transfer.gpu;
1615 }
1616
1617 static void
1618 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1619 unsigned stride, unsigned offset, unsigned count,
1620 struct pipe_stream_output_target *target)
1621 {
1622 /* Fill out the descriptor */
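 /* pipe_stream_output_info strides are counted in dwords; convert to bytes */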
1623 slot->stride = stride * 4;
1624 slot->shift = slot->extra_flags = 0;
1625
1626 unsigned max_size = target->buffer_size;
1627 unsigned expected_size = slot->stride * count;
1628
1629 slot->size = MIN2(max_size, expected_size);
1630
1631 /* Grab the BO and bind it to the batch */
1632 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1633
1634 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1635 * the perspective of the TILER and FRAGMENT.
1636 */
1637 panfrost_batch_add_bo(batch, bo,
1638 PAN_BO_ACCESS_SHARED |
1639 PAN_BO_ACCESS_RW |
1640 PAN_BO_ACCESS_VERTEX_TILER |
1641 PAN_BO_ACCESS_FRAGMENT);
1642
1643 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1644 slot->elements = addr;
1645 }
1646
1647 /* Given a shader and buffer indices, link varying metadata together */
1648
1649 static bool
1650 is_special_varying(gl_varying_slot loc)
1651 {
1652 switch (loc) {
1653 case VARYING_SLOT_POS:
1654 case VARYING_SLOT_PSIZ:
1655 case VARYING_SLOT_PNTC:
1656 case VARYING_SLOT_FACE:
1657 return true;
1658 default:
1659 return false;
1660 }
1661 }
1662
1663 static void
1664 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1665 signed general, signed gl_Position,
1666 signed gl_PointSize, signed gl_PointCoord,
1667 signed gl_FrontFacing)
1668 {
1669 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1670
1671 for (unsigned i = 0; i < ss->varying_count; ++i) {
1672 gl_varying_slot location = ss->varyings_loc[i];
1673 int index = -1;
1674
1675 switch (location) {
1676 case VARYING_SLOT_POS:
1677 index = gl_Position;
1678 break;
1679 case VARYING_SLOT_PSIZ:
1680 index = gl_PointSize;
1681 break;
1682 case VARYING_SLOT_PNTC:
1683 index = gl_PointCoord;
1684 break;
1685 case VARYING_SLOT_FACE:
1686 index = gl_FrontFacing;
1687 break;
1688 default:
1689 index = general;
1690 break;
1691 }
1692
1693 assert(index >= 0);
1694 out[i].index = index;
1695 }
1696 }
1697
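/* point_sprite_mask: bits 0-7 flag the generic texcoords TEX0..TEX7 that are
 * replaced with point sprite coordinates, bit 8 flags gl_PointCoord itself */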
1698 static bool
1699 has_point_coord(unsigned mask, gl_varying_slot loc)
1700 {
1701 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1702 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1703 else if (loc == VARYING_SLOT_PNTC)
1704 return (mask & (1 << 8));
1705 else
1706 return false;
1707 }
1708
1709 /* Helpers for manipulating stream out information so we can pack varyings
1710 * accordingly. Compute the src_offset for a given captured varying */
1711
1712 static struct pipe_stream_output *
1713 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1714 {
1715 for (unsigned i = 0; i < info->num_outputs; ++i) {
1716 if (info->output[i].register_index == loc)
1717 return &info->output[i];
1718 }
1719
1720 unreachable("Varying not captured");
1721 }
1722
1723 void
1724 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1725 unsigned vertex_count,
1726 struct mali_vertex_tiler_postfix *vertex_postfix,
1727 struct mali_vertex_tiler_postfix *tiler_postfix,
1728 union midgard_primitive_size *primitive_size)
1729 {
1730 /* Load the shaders */
1731 struct panfrost_context *ctx = batch->ctx;
1732 struct panfrost_device *device = pan_device(ctx->base.screen);
1733 struct panfrost_shader_state *vs, *fs;
1734 unsigned int num_gen_varyings = 0;
1735 size_t vs_size, fs_size;
1736
1737 /* Allocate the varying descriptor */
1738
1739 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1740 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1741 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1742 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1743
1744 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1745 vs_size +
1746 fs_size);
1747
1748 struct pipe_stream_output_info *so = &vs->stream_output;
1749
1750 /* Check if this varying is linked by us. This is the case for
1751 * general-purpose, non-captured varyings. If it is, link it. If it's
1752 * not, use the provided stream out information to determine the
1753 * offset, since it was already linked for us. */
1754
1755 for (unsigned i = 0; i < vs->varying_count; i++) {
1756 gl_varying_slot loc = vs->varyings_loc[i];
1757
1758 bool special = is_special_varying(loc);
1759 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1760
1761 if (captured) {
1762 struct pipe_stream_output *o = pan_get_so(so, loc);
1763
1764 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1765 vs->varyings[i].src_offset = dst_offset;
1766 } else if (!special) {
1767 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1768 }
1769 }
1770
1771         /* For the captured varyings, src_offset was already set above from the
1772          * stream out info: that layout is defined by the stream out info, not us */
1773
1774 /* Link up with fragment varyings */
1775 bool reads_point_coord = fs->reads_point_coord;
1776
1777 for (unsigned i = 0; i < fs->varying_count; i++) {
1778 gl_varying_slot loc = fs->varyings_loc[i];
1779 unsigned src_offset;
1780 signed vs_idx = -1;
1781
1782 /* Link up */
1783 for (unsigned j = 0; j < vs->varying_count; ++j) {
1784 if (vs->varyings_loc[j] == loc) {
1785 vs_idx = j;
1786 break;
1787 }
1788 }
1789
1790 /* Either assign or reuse */
1791 if (vs_idx >= 0)
1792 src_offset = vs->varyings[vs_idx].src_offset;
1793 else
1794 src_offset = 16 * (num_gen_varyings++);
1795
1796 fs->varyings[i].src_offset = src_offset;
1797
1798 if (has_point_coord(fs->point_sprite_mask, loc))
1799 reads_point_coord = true;
1800 }
1801
1802 memcpy(trans.cpu, vs->varyings, vs_size);
1803 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1804
1805 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1806
1807 /* Figure out how many streamout buffers could be bound */
1808 unsigned so_count = ctx->streamout.num_targets;
1809 for (unsigned i = 0; i < vs->varying_count; i++) {
1810 gl_varying_slot loc = vs->varyings_loc[i];
1811
1812 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1813 if (!captured) continue;
1814
1815 struct pipe_stream_output *o = pan_get_so(so, loc);
1816 so_count = MAX2(so_count, o->output_buffer + 1);
1817 }
1818
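        /* Varying buffer layout: the stream out buffers come first (one per
         * output buffer), then the general varying buffer, then one buffer
         * per special varying actually in use */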
1819 signed idx = so_count;
1820 signed general = idx++;
1821 signed gl_Position = idx++;
1822 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1823 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1824 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1825 signed gl_FragCoord = (fs->reads_frag_coord &&
1826 !(device->quirks & IS_BIFROST))
1827 ? (idx++) : -1;
1828
1829 /* Emit the stream out buffers */
1830
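        /* Stream out captures whole (decomposed) primitives, so the number of
         * captured vertices may differ from the draw's vertex count */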
1831 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1832 ctx->vertex_count);
1833
1834 for (unsigned i = 0; i < so_count; ++i) {
1835 if (i < ctx->streamout.num_targets) {
1836 panfrost_emit_streamout(batch, &varyings[i],
1837 so->stride[i],
1838 ctx->streamout.offsets[i],
1839 out_count,
1840 ctx->streamout.targets[i]);
1841 } else {
1842 /* Emit a dummy buffer */
1843 panfrost_emit_varyings(batch, &varyings[i],
1844 so->stride[i] * 4,
1845 out_count);
1846
1847 /* Clear the attribute type */
1848 varyings[i].elements &= ~0xF;
1849 }
1850 }
1851
1852 panfrost_emit_varyings(batch, &varyings[general],
1853 num_gen_varyings * 16,
1854 vertex_count);
1855
1856 mali_ptr varyings_p;
1857
1858 /* fp32 vec4 gl_Position */
1859 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1860 sizeof(float) * 4, vertex_count);
1861 tiler_postfix->position_varying = varyings_p;
1862 
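        /* One 16-bit gl_PointSize per vertex, allocated only when we are
         * actually drawing points */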
1864 if (panfrost_writes_point_size(ctx)) {
1865 varyings_p = panfrost_emit_varyings(batch,
1866 &varyings[gl_PointSize],
1867 2, vertex_count);
1868 primitive_size->pointer = varyings_p;
1869 }
1870
1871 if (gl_PointCoord >= 0)
1872 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1873
1874 if (gl_FrontFacing >= 0)
1875 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1876
1877 if (gl_FragCoord >= 0)
1878 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1879
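        /* Reading gl_PointCoord is not wired up for Bifrost here, hence the
         * assert */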
1880 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1881
1882 /* Let's go ahead and link varying meta to the buffer in question, now
1883 * that that information is available. VARYING_SLOT_POS is mapped to
1884          * gl_FragCoord for fragment shaders but gl_Position for vertex
1885          * shaders */
1886
1887 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1888 gl_PointSize, gl_PointCoord,
1889 gl_FrontFacing);
1890
1891 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1892 gl_FragCoord, gl_PointSize,
1893 gl_PointCoord, gl_FrontFacing);
1894
1895 /* Replace streamout */
1896
1897 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1898 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1899
1900 for (unsigned i = 0; i < vs->varying_count; i++) {
1901 gl_varying_slot loc = vs->varyings_loc[i];
1902
1903 /* If we write gl_PointSize from the vertex shader but don't
1904 * consume it, no memory will be allocated for it, so if we
1905 * attempted to write anyway we would dereference a NULL
1906 * pointer on the GPU. Midgard seems fine with this; Bifrost
1907 * faults. */
1908
1909 if (loc == VARYING_SLOT_PSIZ && !panfrost_writes_point_size(ctx))
1910 ovs[i].format = MALI_VARYING_DISCARD;
1911
1912 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1913 if (!captured)
1914 continue;
1915
1916 struct pipe_stream_output *o = pan_get_so(so, loc);
1917 ovs[i].index = o->output_buffer;
1918
1919 assert(o->stream == 0);
1920 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1921 | MALI_NR_CHANNELS(o->num_components);
1922
1923 if (device->quirks & HAS_SWIZZLES)
1924 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1925 else
1926 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1927
1928 /* Link to the fragment */
1929 signed fs_idx = -1;
1930
1931 /* Link up */
1932 for (unsigned j = 0; j < fs->varying_count; ++j) {
1933 if (fs->varyings_loc[j] == loc) {
1934 fs_idx = j;
1935 break;
1936 }
1937 }
1938
1939 if (fs_idx >= 0) {
1940 ofs[fs_idx].index = ovs[i].index;
1941 ofs[fs_idx].format = ovs[i].format;
1942 ofs[fs_idx].swizzle = ovs[i].swizzle;
1943 }
1944 }
1945
1946 /* Replace point sprite */
1947 for (unsigned i = 0; i < fs->varying_count; i++) {
1948 /* If we have a point sprite replacement, handle that here. We
1949                  * have to translate location first. TODO: Flip y in the shader
1950                  * instead; we're already keying, it's just a time crunch. */
1951
1952 if (has_point_coord(fs->point_sprite_mask,
1953 fs->varyings_loc[i])) {
1954 ofs[i].index = gl_PointCoord;
1955
1956 /* Swizzle out the z/w to 0/1 */
1957 ofs[i].format = MALI_RG16F;
1958 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1959 }
1960 }
1961
1962 /* Fix up unaligned addresses */
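        /* The low six bits of a stream out record's address carry the
         * addressing mode (MALI_ATTR_LINEAR here), so any misalignment has to
         * be stripped off and folded into the referencing records'
         * src_offsets instead */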
1963 for (unsigned i = 0; i < so_count; ++i) {
1964 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1965 continue;
1966
1967 unsigned align = (varyings[i].elements & 63);
1968
1969 /* While we're at it, the SO buffers are linear */
1970
1971 if (!align) {
1972 varyings[i].elements |= MALI_ATTR_LINEAR;
1973 continue;
1974 }
1975
1976 /* We need to adjust alignment */
1977 varyings[i].elements &= ~63;
1978 varyings[i].elements |= MALI_ATTR_LINEAR;
1979 varyings[i].size += align;
1980
1981 for (unsigned v = 0; v < vs->varying_count; ++v) {
1982 if (ovs[v].index != i)
1983 continue;
1984
1985 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1986 }
1987
1988 for (unsigned f = 0; f < fs->varying_count; ++f) {
1989 if (ofs[f].index != i)
1990 continue;
1991
1992 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1993 }
1994 }
1995
1996 varyings_p = panfrost_upload_transient(batch, varyings,
1997 idx * sizeof(*varyings));
1998 vertex_postfix->varyings = varyings_p;
1999 tiler_postfix->varyings = varyings_p;
2000
2001 vertex_postfix->varying_meta = trans.gpu;
2002 tiler_postfix->varying_meta = trans.gpu + vs_size;
2003 }
2004
2005 void
2006 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2007 struct mali_vertex_tiler_prefix *vertex_prefix,
2008 struct mali_vertex_tiler_postfix *vertex_postfix,
2009 struct mali_vertex_tiler_prefix *tiler_prefix,
2010 struct mali_vertex_tiler_postfix *tiler_postfix,
2011 union midgard_primitive_size *primitive_size)
2012 {
2013 struct panfrost_context *ctx = batch->ctx;
2014 struct panfrost_device *device = pan_device(ctx->base.screen);
2015 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2016 struct bifrost_payload_vertex bifrost_vertex = {0,};
2017 struct bifrost_payload_tiler bifrost_tiler = {0,};
2018 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2019 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2020 void *vp, *tp;
2021 size_t vp_size, tp_size;
2022
2023 if (device->quirks & IS_BIFROST) {
2024 bifrost_vertex.prefix = *vertex_prefix;
2025 bifrost_vertex.postfix = *vertex_postfix;
2026 vp = &bifrost_vertex;
2027 vp_size = sizeof(bifrost_vertex);
2028
2029 bifrost_tiler.prefix = *tiler_prefix;
2030 bifrost_tiler.tiler.primitive_size = *primitive_size;
2031 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2032 bifrost_tiler.postfix = *tiler_postfix;
2033 tp = &bifrost_tiler;
2034 tp_size = sizeof(bifrost_tiler);
2035 } else {
2036 midgard_vertex.prefix = *vertex_prefix;
2037 midgard_vertex.postfix = *vertex_postfix;
2038 vp = &midgard_vertex;
2039 vp_size = sizeof(midgard_vertex);
2040
2041 midgard_tiler.prefix = *tiler_prefix;
2042 midgard_tiler.postfix = *tiler_postfix;
2043 midgard_tiler.primitive_size = *primitive_size;
2044 tp = &midgard_tiler;
2045 tp_size = sizeof(midgard_tiler);
2046 }
2047
2048 if (wallpapering) {
2049 /* Inject in reverse order, with "predicted" job indices.
2050 * THIS IS A HACK XXX */
2051 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2052 batch->job_index + 2, tp, tp_size, true);
2053 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2054 vp, vp_size, true);
2055 return;
2056 }
2057
2058         /* If rasterizer discard is enabled, only submit the vertex job */
2059
2060 bool rasterizer_discard = ctx->rasterizer &&
2061 ctx->rasterizer->base.rasterizer_discard;
2062
2063 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2064 vp, vp_size, false);
2065
2066 if (rasterizer_discard)
2067 return;
2068
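        /* Otherwise submit the tiler job too; it depends on (runs after) the
         * vertex job */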
2069 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2070 false);
2071 }
2072
2073 /* TODO: stop hardcoding this */
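/* Hard-coded sample location table, uploaded verbatim as 48 (x, y) uint16_t
 * pairs; assuming 8-bit subpixel units, (128, 128) is the pixel centre */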
2074 mali_ptr
2075 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2076 {
2077 uint16_t locations[] = {
2078 128, 128,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 128, 128,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 0, 0,
2121 0, 0,
2122 0, 0,
2123 0, 0,
2124 0, 0,
2125 0, 0,
2126 };
2127
2128 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2129 }