617efa46b7531ccdc0acd11e3b40fea75a146dc9
[mesa.git] / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static mali_ptr
55 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
56 {
57 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
58
59 struct mali_shared_memory shared = {
60 .shared_workgroup_count = ~0,
61 };
62
63 if (batch->stack_size) {
64 struct panfrost_bo *stack =
65 panfrost_batch_get_scratchpad(batch, batch->stack_size,
66 dev->thread_tls_alloc,
67 dev->core_count);
68
69 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
70 shared.scratchpad = stack->gpu;
71 }
72
73 return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
74 }
75
76 void
77 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
78 struct mali_vertex_tiler_prefix *prefix,
79 union midgard_primitive_size *primitive_size)
80 {
81 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
82
83 if (!panfrost_writes_point_size(ctx)) {
84 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
85 rasterizer->base.point_size :
86 rasterizer->base.line_width;
87
88 primitive_size->constant = val;
89 }
90 }
91
92 void
93 panfrost_vt_init(struct panfrost_context *ctx,
94 enum pipe_shader_type stage,
95 struct mali_vertex_tiler_prefix *prefix,
96 struct mali_vertex_tiler_postfix *postfix)
97 {
98 struct panfrost_device *device = pan_device(ctx->base.screen);
99 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
100
101 if (!ctx->shader[stage])
102 return;
103
104 memset(prefix, 0, sizeof(*prefix));
105 memset(postfix, 0, sizeof(*postfix));
106
107 if (device->quirks & IS_BIFROST) {
108 postfix->gl_enables = 0x2;
109 postfix->shared_memory = panfrost_vt_emit_shared_memory(batch);
110 } else {
111 postfix->gl_enables = 0x6;
112 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
113 }
114
115 if (stage == PIPE_SHADER_FRAGMENT) {
116 if (ctx->occlusion_query) {
117 postfix->gl_enables |= MALI_OCCLUSION_QUERY;
118 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
119 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
120 PAN_BO_ACCESS_SHARED |
121 PAN_BO_ACCESS_RW |
122 PAN_BO_ACCESS_FRAGMENT);
123 }
124
125 postfix->gl_enables |= 0x7;
126 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
127 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
128 rast->front_ccw);
129 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
130 (rast->cull_face & PIPE_FACE_FRONT));
131 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
132 (rast->cull_face & PIPE_FACE_BACK));
133 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
134 rast->flatshade_first);
135 }
136 }
137
138 static unsigned
139 panfrost_translate_index_size(unsigned size)
140 {
141 switch (size) {
142 case 1:
143 return MALI_DRAW_INDEXED_UINT8;
144
145 case 2:
146 return MALI_DRAW_INDEXED_UINT16;
147
148 case 4:
149 return MALI_DRAW_INDEXED_UINT32;
150
151 default:
152 unreachable("Invalid index size");
153 }
154 }
155
156 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
157  * good for the duration of the draw (transient), though it could last longer. Also get
158 * the bounds on the index buffer for the range accessed by the draw. We do
159 * these operations together because there are natural optimizations which
160 * require them to be together. */
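/* The min/max scan is O(count), so panfrost_minmax_cache_* memoizes results
 * per index buffer; repeated draws over the same (start, count) range can
 * then skip the u_vbuf_get_minmax_index() fallback below entirely. */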
161
162 static mali_ptr
163 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
164 const struct pipe_draw_info *info,
165 unsigned *min_index, unsigned *max_index)
166 {
167 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
168 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
169 off_t offset = info->start * info->index_size;
170 bool needs_indices = true;
171 mali_ptr out = 0;
172
173 if (info->max_index != ~0u) {
174 *min_index = info->min_index;
175 *max_index = info->max_index;
176 needs_indices = false;
177 }
178
179 if (!info->has_user_indices) {
180 /* Only resources can be directly mapped */
181 panfrost_batch_add_bo(batch, rsrc->bo,
182 PAN_BO_ACCESS_SHARED |
183 PAN_BO_ACCESS_READ |
184 PAN_BO_ACCESS_VERTEX_TILER);
185 out = rsrc->bo->gpu + offset;
186
187 /* Check the cache */
188 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
189 info->start,
190 info->count,
191 min_index,
192 max_index);
193 } else {
194 /* Otherwise, we need to upload to transient memory */
195 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
196 struct panfrost_transfer T =
197 panfrost_pool_alloc_aligned(&batch->pool,
198 info->count * info->index_size,
199 info->index_size);
200
201 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
202 out = T.gpu;
203 }
204
205 if (needs_indices) {
206 /* Fallback */
207 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
208
209 if (!info->has_user_indices)
210 panfrost_minmax_cache_add(rsrc->index_cache,
211 info->start, info->count,
212 *min_index, *max_index);
213 }
214
215 return out;
216 }
217
218 void
219 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
220 const struct pipe_draw_info *info,
221 enum mali_draw_mode draw_mode,
222 struct mali_vertex_tiler_postfix *vertex_postfix,
223 struct mali_vertex_tiler_prefix *tiler_prefix,
224 struct mali_vertex_tiler_postfix *tiler_postfix,
225 unsigned *vertex_count,
226 unsigned *padded_count)
227 {
228 tiler_prefix->draw_mode = draw_mode;
229
230 unsigned draw_flags = 0;
231
232 if (panfrost_writes_point_size(ctx))
233 draw_flags |= MALI_DRAW_VARYING_SIZE;
234
235 if (info->primitive_restart)
236 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
237
238         /* These don't make much sense */
239
240 draw_flags |= 0x3000;
241
242 if (info->index_size) {
243 unsigned min_index = 0, max_index = 0;
244
245 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
246 info,
247 &min_index,
248 &max_index);
249
250 /* Use the corresponding values */
251 *vertex_count = max_index - min_index + 1;
252 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
253 tiler_prefix->offset_bias_correction = -min_index;
254 tiler_prefix->index_count = MALI_POSITIVE(info->count);
255 draw_flags |= panfrost_translate_index_size(info->index_size);
256 } else {
257 tiler_prefix->indices = 0;
258 *vertex_count = ctx->vertex_count;
259 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
260 tiler_prefix->offset_bias_correction = 0;
261 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
262 }
263
264 tiler_prefix->unknown_draw = draw_flags;
265
266 /* Encode the padded vertex count */
267
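        /* The instancing state encodes the padded count as an odd factor
         * times a power of two:
         *
         *    padded_count = (2 * instance_odd + 1) << instance_shift
         *
         * e.g. padded_count = 12 = 3 << 2 yields instance_shift = 2 and
         * instance_odd = 1. */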
268 if (info->instance_count > 1) {
269 *padded_count = panfrost_padded_vertex_count(*vertex_count);
270
271 unsigned shift = __builtin_ctz(ctx->padded_count);
272 unsigned k = ctx->padded_count >> (shift + 1);
273
274 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
275 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
276 } else {
277 *padded_count = *vertex_count;
278
279 /* Reset instancing state */
280 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
281 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
282 }
283 }
284
285 static unsigned
286 translate_tex_wrap(enum pipe_tex_wrap w)
287 {
288 switch (w) {
289 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
290 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
291 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
292 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
293 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
294 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
295 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
296 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
297 default: unreachable("Invalid wrap");
298 }
299 }
300
301 /* The hardware compares in the wrong order, so we have to flip before
302 * encoding. Yes, really. */
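/* Concretely, the flip should swap the ordering direction (LESS <-> GREATER,
 * LEQUAL <-> GEQUAL) while leaving direction-agnostic functions (NEVER,
 * EQUAL, NOTEQUAL, ALWAYS) untouched. */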
303
304 static enum mali_func
305 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
306 {
307 if (!cso->compare_mode)
308 return MALI_FUNC_NEVER;
309
310 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
311 return panfrost_flip_compare_func(f);
312 }
313
314 static enum mali_mipmap_mode
315 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
316 {
317 switch (f) {
318 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
319 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
320 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
321 default: unreachable("Invalid");
322 }
323 }
324
325 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
326 struct mali_midgard_sampler_packed *hw)
327 {
328 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
329 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
330 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
331 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
332 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
333 cfg.normalized_coordinates = cso->normalized_coords;
334
335 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
336
337 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
338
339         /* If necessary, we disable mipmapping in the sampler descriptor by
340          * clamping the LOD range as tight as possible (to a width of epsilon,
341          * essentially -- remember these are fixed-point numbers, so
342          * epsilon = 1/256) */
343
344 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
345 cfg.minimum_lod + 1 :
346 FIXED_16(cso->max_lod, false);
347
348 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
349 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
350 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
351
352 cfg.compare_function = panfrost_sampler_compare_func(cso);
353 cfg.seamless_cube_map = cso->seamless_cube_map;
354
355 cfg.border_color_r = cso->border_color.f[0];
356 cfg.border_color_g = cso->border_color.f[1];
357 cfg.border_color_b = cso->border_color.f[2];
358 cfg.border_color_a = cso->border_color.f[3];
359 }
360 }
361
362 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
363 struct mali_bifrost_sampler_packed *hw)
364 {
365 pan_pack(hw, BIFROST_SAMPLER, cfg) {
366 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
367 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
368 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
369 cfg.normalized_coordinates = cso->normalized_coords;
370
371 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
372 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
373 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
374
375 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
376 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
377 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
378
379 cfg.compare_function = panfrost_sampler_compare_func(cso);
380 cfg.seamless_cube_map = cso->seamless_cube_map;
381 }
382 }
383
384 static bool
385 panfrost_fs_required(
386 struct panfrost_shader_state *fs,
387 struct panfrost_blend_final *blend,
388 unsigned rt_count)
389 {
390 /* If we generally have side effects */
391 if (fs->fs_sidefx)
392 return true;
393
394 /* If colour is written we need to execute */
395 for (unsigned i = 0; i < rt_count; ++i) {
396 if (!blend[i].no_colour)
397 return true;
398 }
399
400 /* If depth is written and not implied we need to execute.
401 * TODO: Predicate on Z/S writes being enabled */
402 return (fs->writes_depth || fs->writes_stencil);
403 }
404
405 static void
406 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
407 struct panfrost_blend_final *blend)
408 {
409 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
410 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
411 unsigned rt_count = batch->key.nr_cbufs;
412
413 struct bifrost_blend_rt *brts = rts;
414
415 /* Disable blending for depth-only */
416
417 if (rt_count == 0) {
418 if (dev->quirks & IS_BIFROST) {
419 memset(brts, 0, sizeof(*brts));
420 brts[0].unk2 = 0x3;
421 } else {
422 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
423 cfg.equation = 0xf0122122; /* Replace */
424 }
425 }
426 }
427
428 for (unsigned i = 0; i < rt_count; ++i) {
429 struct mali_blend_flags_packed flags = {};
430
431 pan_pack(&flags, BLEND_FLAGS, cfg) {
432 if (blend[i].no_colour) {
433 cfg.enable = false;
434 break;
435 }
436
437 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
438
439 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
440 cfg.load_destination = blend[i].load_dest;
441 cfg.dither_disable = !batch->ctx->blend->base.dither;
442
443 if (!(dev->quirks & IS_BIFROST))
444 cfg.midgard_blend_shader = blend[i].is_shader;
445 }
446
447 if (dev->quirks & IS_BIFROST) {
448 memset(brts + i, 0, sizeof(brts[i]));
449 brts[i].flags = flags.opaque[0];
450
451 if (blend[i].is_shader) {
452 /* The blend shader's address needs to be at
453                                  * the same top 32 bits as the fragment shader's.
454 * TODO: Ensure that's always the case.
455 */
456 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
457 (fs->bo->gpu & (0xffffffffull << 32)));
458 brts[i].shader = blend[i].shader.gpu;
459 brts[i].unk2 = 0x0;
460 } else {
461 enum pipe_format format = batch->key.cbufs[i]->format;
462 const struct util_format_description *format_desc;
463 format_desc = util_format_description(format);
464
465 brts[i].equation = blend[i].equation.equation;
466
467 /* TODO: this is a bit more complicated */
468 brts[i].constant = blend[i].equation.constant;
469
470 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
471
472 /* 0x19 disables blending and forces REPLACE
473 * mode (equivalent to rgb_mode = alpha_mode =
474 * x122, colour mask = 0xF). 0x1a allows
475 * blending. */
476 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
477
478 brts[i].shader_type = fs->blend_types[i];
479 }
480 } else {
481 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
482 cfg.flags = flags;
483
484 if (blend[i].is_shader) {
485 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
486 } else {
487 cfg.equation = blend[i].equation.equation.opaque[0];
488 cfg.constant = blend[i].equation.constant;
489 }
490 }
491
492 rts += MALI_MIDGARD_BLEND_LENGTH;
493 }
494 }
495 }
496
497 static void
498 panfrost_emit_frag_shader(struct panfrost_context *ctx,
499 struct mali_state_packed *fragmeta,
500 struct panfrost_blend_final *blend)
501 {
502 const struct panfrost_device *dev = pan_device(ctx->base.screen);
503 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
504 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
505 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
506 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
507 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
508
509 /* Built up here */
510 struct mali_shader_packed shader = fs->shader;
511 struct mali_preload_packed preload = fs->preload;
512 uint32_t properties;
513 struct mali_multisample_misc_packed multisample_misc;
514 struct mali_stencil_mask_misc_packed stencil_mask_misc;
515 union midgard_blend sfbd_blend = { 0 };
516
517 if (!panfrost_fs_required(fs, blend, rt_count)) {
518 if (dev->quirks & IS_BIFROST) {
519 pan_pack(&shader, SHADER, cfg) {}
520
521 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
522 cfg.unknown = 0x950020; /* XXX */
523 cfg.early_z_enable = true;
524 }
525
526 preload.opaque[0] = 0;
527 } else {
528 pan_pack(&shader, SHADER, cfg) {
529 cfg.shader = 0x1;
530 }
531
532 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
533 cfg.work_register_count = 1;
534 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
535 cfg.early_z_enable = true;
536 }
537 }
538 } else if (dev->quirks & IS_BIFROST) {
539 bool no_blend = true;
540
541 for (unsigned i = 0; i < rt_count; ++i)
542 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
543
544 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
545 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
546 }
547
548 /* Combine with prepacked properties */
549 properties |= fs->properties.opaque[0];
550 } else {
551 /* Reasons to disable early-Z from a shader perspective */
552 bool late_z = fs->can_discard || fs->writes_global ||
553 fs->writes_depth || fs->writes_stencil;
554
555 /* If either depth or stencil is enabled, discard matters */
556 bool zs_enabled =
557 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
558 zsa->base.stencil[0].enabled;
559
560 bool has_blend_shader = false;
561
562 for (unsigned c = 0; c < rt_count; ++c)
563 has_blend_shader |= blend[c].is_shader;
564
565 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
566 /* TODO: Reduce this limit? */
567 if (has_blend_shader)
568 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
569 else
570 cfg.work_register_count = fs->work_reg_count;
571
572 cfg.early_z_enable = !(late_z || alpha_to_coverage);
573 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
574 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
575 }
576
577 properties |= fs->properties.opaque[0];
578 }
579
580 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
581 bool msaa = rast->multisample;
582 cfg.multisample_enable = msaa;
583 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
584
585 /* EXT_shader_framebuffer_fetch requires per-sample */
586 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
587 cfg.evaluate_per_sample = msaa && per_sample;
588
589 if (dev->quirks & MIDGARD_SFBD) {
590 cfg.sfbd_load_destination = blend[0].load_dest;
591 cfg.sfbd_blend_shader = blend[0].is_shader;
592 }
593
594 cfg.depth_function = zsa->base.depth.enabled ?
595 panfrost_translate_compare_func(zsa->base.depth.func) :
596 MALI_FUNC_ALWAYS;
597
598 cfg.depth_write_mask = zsa->base.depth.writemask;
599 cfg.near_discard = rast->depth_clip_near;
600 cfg.far_discard = rast->depth_clip_far;
601 cfg.unknown_2 = true;
602 }
603
604 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
605 cfg.stencil_mask_front = zsa->stencil_mask_front;
606 cfg.stencil_mask_back = zsa->stencil_mask_back;
607 cfg.stencil_enable = zsa->base.stencil[0].enabled;
608 cfg.alpha_to_coverage = alpha_to_coverage;
609
610 if (dev->quirks & MIDGARD_SFBD) {
611 cfg.sfbd_write_enable = !blend[0].no_colour;
612 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
613 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
614 }
615
616 cfg.unknown_1 = 0x7;
617 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
618 cfg.single_sampled_lines = !rast->multisample;
619 }
620
621 if (dev->quirks & MIDGARD_SFBD) {
622 if (blend[0].is_shader) {
623 sfbd_blend.shader = blend[0].shader.gpu |
624 blend[0].shader.first_tag;
625 } else {
626 sfbd_blend.equation = blend[0].equation.equation;
627 sfbd_blend.constant = blend[0].equation.constant;
628 }
629 } else if (!(dev->quirks & IS_BIFROST)) {
630 /* Bug where MRT-capable hw apparently reads the last blend
631 * shader from here instead of the usual location? */
632
633 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
634 if (!blend[rt].is_shader)
635 continue;
636
637 sfbd_blend.shader = blend[rt].shader.gpu |
638 blend[rt].shader.first_tag;
639 break;
640 }
641 }
642
643 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
644 cfg.shader = fs->shader;
645 cfg.properties = properties;
646 cfg.depth_units = rast->offset_units * 2.0f;
647 cfg.depth_factor = rast->offset_scale;
648 cfg.multisample_misc = multisample_misc;
649 cfg.stencil_mask_misc = stencil_mask_misc;
650
651 cfg.stencil_front = zsa->stencil_front;
652 cfg.stencil_back = zsa->stencil_back;
653
654 /* Bottom bits for stencil ref, exactly one word */
655 bool back_enab = zsa->base.stencil[1].enabled;
656 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
657 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
658
659 if (dev->quirks & IS_BIFROST)
660 cfg.preload = preload;
661 else
662 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
663 }
664 }
665
666 mali_ptr
667 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
668 {
669 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
670
671 panfrost_batch_add_bo(batch, ss->bo,
672 PAN_BO_ACCESS_PRIVATE |
673 PAN_BO_ACCESS_READ |
674 PAN_BO_ACCESS_VERTEX_TILER);
675
676 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
677 PAN_BO_ACCESS_PRIVATE |
678 PAN_BO_ACCESS_READ |
679 PAN_BO_ACCESS_VERTEX_TILER);
680
681 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
682 }
683
684 mali_ptr
685 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
686 {
687 struct panfrost_context *ctx = batch->ctx;
688 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
689
690 /* Add the shader BO to the batch. */
691 panfrost_batch_add_bo(batch, ss->bo,
692 PAN_BO_ACCESS_PRIVATE |
693 PAN_BO_ACCESS_READ |
694 PAN_BO_ACCESS_FRAGMENT);
695
696 struct panfrost_device *dev = pan_device(ctx->base.screen);
697 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
698 struct panfrost_transfer xfer;
699 unsigned rt_size;
700
701 if (dev->quirks & MIDGARD_SFBD)
702 rt_size = 0;
703 else if (dev->quirks & IS_BIFROST)
704 rt_size = sizeof(struct bifrost_blend_rt);
705 else
706 rt_size = sizeof(struct midgard_blend_rt);
707
708 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
709 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
710
711 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
712
713 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
714 blend[c] = panfrost_get_blend_for_context(ctx, c);
715
716 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
717
718 if (!(dev->quirks & MIDGARD_SFBD))
719 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
720 else
721 batch->draws |= PIPE_CLEAR_COLOR0;
722
723 return xfer.gpu;
724 }
725
726 mali_ptr
727 panfrost_emit_viewport(struct panfrost_batch *batch)
728 {
729 struct panfrost_context *ctx = batch->ctx;
730 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
731 const struct pipe_scissor_state *ss = &ctx->scissor;
732 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
733 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
734
735 /* Derive min/max from translate/scale. Note since |x| >= 0 by
736 * definition, we have that -|x| <= |x| hence translate - |scale| <=
737 * translate + |scale|, so the ordering is correct here. */
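        /* e.g. a vertically flipped viewport with translate[1] = 300 and
         * scale[1] = -300 still yields vp_miny = 0 and vp_maxy = 600. */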
738 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
739 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
740 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
741 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
742 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
743 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
744
745         /* Scissor to the intersection of the viewport and the scissor, clamped
746 * to the framebuffer */
747
748 unsigned minx = MIN2(fb->width, vp_minx);
749 unsigned maxx = MIN2(fb->width, vp_maxx);
750 unsigned miny = MIN2(fb->height, vp_miny);
751 unsigned maxy = MIN2(fb->height, vp_maxy);
752
753 if (ss && rast->scissor) {
754 minx = MAX2(ss->minx, minx);
755 miny = MAX2(ss->miny, miny);
756 maxx = MIN2(ss->maxx, maxx);
757 maxy = MIN2(ss->maxy, maxy);
758 }
759
760 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
761
762 pan_pack(T.cpu, VIEWPORT, cfg) {
763 cfg.scissor_minimum_x = minx;
764 cfg.scissor_minimum_y = miny;
765 cfg.scissor_maximum_x = maxx - 1;
766 cfg.scissor_maximum_y = maxy - 1;
767
768 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
769 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
770 }
771
772 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
773 return T.gpu;
774 }
775
776 static mali_ptr
777 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
778 enum pipe_shader_type st,
779 struct panfrost_constant_buffer *buf,
780 unsigned index)
781 {
782 struct pipe_constant_buffer *cb = &buf->cb[index];
783 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
784
785 if (rsrc) {
786 panfrost_batch_add_bo(batch, rsrc->bo,
787 PAN_BO_ACCESS_SHARED |
788 PAN_BO_ACCESS_READ |
789 panfrost_bo_access_for_stage(st));
790
791                 /* Alignment guaranteed by
792 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
793 return rsrc->bo->gpu + cb->buffer_offset;
794 } else if (cb->user_buffer) {
795 return panfrost_pool_upload_aligned(&batch->pool,
796 cb->user_buffer +
797 cb->buffer_offset,
798 cb->buffer_size, 16);
799 } else {
800 unreachable("No constant buffer");
801 }
802 }
803
804 struct sysval_uniform {
805 union {
806 float f[4];
807 int32_t i[4];
808 uint32_t u[4];
809 uint64_t du[2];
810 };
811 };
812
813 static void
814 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
815 struct sysval_uniform *uniform)
816 {
817 struct panfrost_context *ctx = batch->ctx;
818 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
819
820 uniform->f[0] = vp->scale[0];
821 uniform->f[1] = vp->scale[1];
822 uniform->f[2] = vp->scale[2];
823 }
824
825 static void
826 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
827 struct sysval_uniform *uniform)
828 {
829 struct panfrost_context *ctx = batch->ctx;
830 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
831
832 uniform->f[0] = vp->translate[0];
833 uniform->f[1] = vp->translate[1];
834 uniform->f[2] = vp->translate[2];
835 }
836
837 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
838 enum pipe_shader_type st,
839 unsigned int sysvalid,
840 struct sysval_uniform *uniform)
841 {
842 struct panfrost_context *ctx = batch->ctx;
843 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
844 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
845 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
846 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
847
848 assert(dim);
849 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
850
851 if (dim > 1)
852 uniform->i[1] = u_minify(tex->texture->height0,
853 tex->u.tex.first_level);
854
855 if (dim > 2)
856 uniform->i[2] = u_minify(tex->texture->depth0,
857 tex->u.tex.first_level);
858
859 if (is_array)
860 uniform->i[dim] = tex->texture->array_size;
861 }
862
863 static void
864 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
865 enum pipe_shader_type st,
866 unsigned ssbo_id,
867 struct sysval_uniform *uniform)
868 {
869 struct panfrost_context *ctx = batch->ctx;
870
871 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
872 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
873
874 /* Compute address */
875 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
876
877 panfrost_batch_add_bo(batch, bo,
878 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
879 panfrost_bo_access_for_stage(st));
880
881 /* Upload address and size as sysval */
882 uniform->du[0] = bo->gpu + sb.buffer_offset;
883 uniform->u[2] = sb.buffer_size;
884 }
885
886 static void
887 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 unsigned samp_idx,
890 struct sysval_uniform *uniform)
891 {
892 struct panfrost_context *ctx = batch->ctx;
893 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
894
895 uniform->f[0] = sampl->min_lod;
896 uniform->f[1] = sampl->max_lod;
897 uniform->f[2] = sampl->lod_bias;
898
899 /* Even without any errata, Midgard represents "no mipmapping" as
900 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
901 * panfrost_create_sampler_state which also explains our choice of
902 * epsilon value (again to keep behaviour consistent) */
903
904 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
905 uniform->f[1] = uniform->f[0] + (1.0/256.0);
906 }
907
908 static void
909 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
910 struct sysval_uniform *uniform)
911 {
912 struct panfrost_context *ctx = batch->ctx;
913
914 uniform->u[0] = ctx->compute_grid->grid[0];
915 uniform->u[1] = ctx->compute_grid->grid[1];
916 uniform->u[2] = ctx->compute_grid->grid[2];
917 }
918
919 static void
920 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
921 struct panfrost_shader_state *ss,
922 enum pipe_shader_type st)
923 {
924 struct sysval_uniform *uniforms = (void *)buf;
925
926 for (unsigned i = 0; i < ss->sysval_count; ++i) {
927 int sysval = ss->sysval[i];
928
929 switch (PAN_SYSVAL_TYPE(sysval)) {
930 case PAN_SYSVAL_VIEWPORT_SCALE:
931 panfrost_upload_viewport_scale_sysval(batch,
932 &uniforms[i]);
933 break;
934 case PAN_SYSVAL_VIEWPORT_OFFSET:
935 panfrost_upload_viewport_offset_sysval(batch,
936 &uniforms[i]);
937 break;
938 case PAN_SYSVAL_TEXTURE_SIZE:
939 panfrost_upload_txs_sysval(batch, st,
940 PAN_SYSVAL_ID(sysval),
941 &uniforms[i]);
942 break;
943 case PAN_SYSVAL_SSBO:
944 panfrost_upload_ssbo_sysval(batch, st,
945 PAN_SYSVAL_ID(sysval),
946 &uniforms[i]);
947 break;
948 case PAN_SYSVAL_NUM_WORK_GROUPS:
949 panfrost_upload_num_work_groups_sysval(batch,
950 &uniforms[i]);
951 break;
952 case PAN_SYSVAL_SAMPLER:
953 panfrost_upload_sampler_sysval(batch, st,
954 PAN_SYSVAL_ID(sysval),
955 &uniforms[i]);
956 break;
957 default:
958 assert(0);
959 }
960 }
961 }
962
963 static const void *
964 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
965 unsigned index)
966 {
967 struct pipe_constant_buffer *cb = &buf->cb[index];
968 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
969
970 if (rsrc)
971 return rsrc->bo->cpu;
972 else if (cb->user_buffer)
973 return cb->user_buffer;
974 else
975 unreachable("No constant buffer");
976 }
977
978 void
979 panfrost_emit_const_buf(struct panfrost_batch *batch,
980 enum pipe_shader_type stage,
981 struct mali_vertex_tiler_postfix *postfix)
982 {
983 struct panfrost_context *ctx = batch->ctx;
984 struct panfrost_shader_variants *all = ctx->shader[stage];
985
986 if (!all)
987 return;
988
989 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
990
991 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
992
993 /* Uniforms are implicitly UBO #0 */
994 bool has_uniforms = buf->enabled_mask & (1 << 0);
995
996 /* Allocate room for the sysval and the uniforms */
997 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
998 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
999 size_t size = sys_size + uniform_size;
1000 struct panfrost_transfer transfer =
1001 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1002
1003 /* Upload sysvals requested by the shader */
1004 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1005
1006 /* Upload uniforms */
1007 if (has_uniforms && uniform_size) {
1008 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1009 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1010 }
1011
1012 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1013 * uploaded, so it's always included. The count is the highest UBO
1014 * addressable -- gaps are included. */
1015
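        /* e.g. enabled_mask = 0x9 (UBOs 0 and 3 bound) gives ubo_count = 4;
         * the unbound slots 1 and 2 are emitted as null descriptors in the
         * loop below. */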
1016 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1017
1018 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1019 struct panfrost_transfer ubos =
1020 panfrost_pool_alloc_aligned(&batch->pool, sz,
1021 MALI_UNIFORM_BUFFER_LENGTH);
1022
1023 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1024
1025 /* Upload uniforms as a UBO */
1026
1027 if (size) {
1028 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1029 cfg.entries = DIV_ROUND_UP(size, 16);
1030 cfg.pointer = transfer.gpu;
1031 }
1032 } else {
1033 *ubo_ptr = 0;
1034 }
1035
1036 /* The rest are honest-to-goodness UBOs */
1037
1038 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1039 size_t usz = buf->cb[ubo].buffer_size;
1040 bool enabled = buf->enabled_mask & (1 << ubo);
1041 bool empty = usz == 0;
1042
1043 if (!enabled || empty) {
1044 ubo_ptr[ubo] = 0;
1045 continue;
1046 }
1047
1048 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1049 cfg.entries = DIV_ROUND_UP(usz, 16);
1050 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1051 stage, buf, ubo);
1052 }
1053 }
1054
1055 postfix->uniforms = transfer.gpu;
1056 postfix->uniform_buffers = ubos.gpu;
1057
1058 buf->dirty_mask = 0;
1059 }
1060
1061 mali_ptr
1062 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1063 const struct pipe_grid_info *info)
1064 {
1065 struct panfrost_context *ctx = batch->ctx;
1066 struct panfrost_device *dev = pan_device(ctx->base.screen);
1067 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1068 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1069 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1070 128));
1071
1072 unsigned log2_instances =
1073 util_logbase2_ceil(info->grid[0]) +
1074 util_logbase2_ceil(info->grid[1]) +
1075 util_logbase2_ceil(info->grid[2]);
1076
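        /* Each grid dimension is rounded up to a power of two, so e.g. a
         * 3x4x5 grid gives log2_instances = 2 + 2 + 3 = 7, i.e. room for
         * 128 workgroup instances per core. */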
1077 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1078 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1079 shared_size,
1080 1);
1081
1082 struct mali_shared_memory shared = {
1083 .shared_memory = bo->gpu,
1084 .shared_workgroup_count = log2_instances,
1085 .shared_shift = util_logbase2(single_size) + 1
1086 };
1087
1088 return panfrost_pool_upload_aligned(&batch->pool, &shared,
1089 sizeof(shared), 64);
1090 }
1091
1092 static mali_ptr
1093 panfrost_get_tex_desc(struct panfrost_batch *batch,
1094 enum pipe_shader_type st,
1095 struct panfrost_sampler_view *view)
1096 {
1097 if (!view)
1098 return (mali_ptr) 0;
1099
1100 struct pipe_sampler_view *pview = &view->base;
1101 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1102
1103 /* Add the BO to the job so it's retained until the job is done. */
1104
1105 panfrost_batch_add_bo(batch, rsrc->bo,
1106 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1107 panfrost_bo_access_for_stage(st));
1108
1109 panfrost_batch_add_bo(batch, view->bo,
1110 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1111 panfrost_bo_access_for_stage(st));
1112
1113 return view->bo->gpu;
1114 }
1115
1116 static void
1117 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1118 struct pipe_context *pctx)
1119 {
1120 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1121 if (view->texture_bo != rsrc->bo->gpu ||
1122 view->modifier != rsrc->modifier) {
1123 panfrost_bo_unreference(view->bo);
1124 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1125 }
1126 }
1127
1128 mali_ptr
1129 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1130 enum pipe_shader_type stage)
1131 {
1132 struct panfrost_context *ctx = batch->ctx;
1133 struct panfrost_device *device = pan_device(ctx->base.screen);
1134
1135 if (!ctx->sampler_view_count[stage])
1136 return 0;
1137
1138 if (device->quirks & IS_BIFROST) {
1139 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1140 MALI_BIFROST_TEXTURE_LENGTH *
1141 ctx->sampler_view_count[stage],
1142 MALI_BIFROST_TEXTURE_LENGTH);
1143
1144 struct mali_bifrost_texture_packed *out =
1145 (struct mali_bifrost_texture_packed *) T.cpu;
1146
1147 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1148 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1149 struct pipe_sampler_view *pview = &view->base;
1150 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1151
1152 panfrost_update_sampler_view(view, &ctx->base);
1153 out[i] = view->bifrost_descriptor;
1154
1155 /* Add the BOs to the job so they are retained until the job is done. */
1156
1157 panfrost_batch_add_bo(batch, rsrc->bo,
1158 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1159 panfrost_bo_access_for_stage(stage));
1160
1161 panfrost_batch_add_bo(batch, view->bo,
1162 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1163 panfrost_bo_access_for_stage(stage));
1164 }
1165
1166 return T.gpu;
1167 } else {
1168 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1169
1170 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1171 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1172
1173 panfrost_update_sampler_view(view, &ctx->base);
1174
1175 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1176 }
1177
1178 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1179 sizeof(uint64_t) *
1180 ctx->sampler_view_count[stage],
1181 sizeof(uint64_t));
1182 }
1183 }
1184
1185 mali_ptr
1186 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1187 enum pipe_shader_type stage)
1188 {
1189 struct panfrost_context *ctx = batch->ctx;
1190
1191 if (!ctx->sampler_count[stage])
1192 return 0;
1193
1194 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1195 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1196
1197 size_t sz = desc_size * ctx->sampler_count[stage];
1198 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1199 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1200
1201 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1202 out[i] = ctx->samplers[stage][i]->hw;
1203
1204 return T.gpu;
1205 }
1206
1207 void
1208 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1209 struct mali_vertex_tiler_postfix *vertex_postfix)
1210 {
1211 struct panfrost_context *ctx = batch->ctx;
1212 struct panfrost_vertex_state *so = ctx->vertex;
1213 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1214
1215 unsigned instance_shift = vertex_postfix->instance_shift;
1216 unsigned instance_odd = vertex_postfix->instance_odd;
1217
1218 /* Worst case: everything is NPOT, which is only possible if instancing
1219          * is enabled. Otherwise a single record is guaranteed */
1220 bool could_npot = instance_shift || instance_odd;
1221
1222 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1223 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1224 (could_npot ? 2 : 1),
1225 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1226
1227 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1228 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1229 MALI_ATTRIBUTE_LENGTH);
1230
1231 struct mali_attribute_buffer_packed *bufs =
1232 (struct mali_attribute_buffer_packed *) S.cpu;
1233
1234 struct mali_attribute_packed *out =
1235 (struct mali_attribute_packed *) T.cpu;
1236
1237 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1238 unsigned k = 0;
1239
1240 for (unsigned i = 0; i < so->num_elements; ++i) {
1241 /* We map buffers 1:1 with the attributes, which
1242 * means duplicating some vertex buffers (who cares? aside from
1243 * maybe some caching implications but I somehow doubt that
1244 * matters) */
1245
1246 struct pipe_vertex_element *elem = &so->pipe[i];
1247 unsigned vbi = elem->vertex_buffer_index;
1248 attrib_to_buffer[i] = k;
1249
1250 if (!(ctx->vb_mask & (1 << vbi)))
1251 continue;
1252
1253 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1254 struct panfrost_resource *rsrc;
1255
1256 rsrc = pan_resource(buf->buffer.resource);
1257 if (!rsrc)
1258 continue;
1259
1260 /* Add a dependency of the batch on the vertex buffer */
1261 panfrost_batch_add_bo(batch, rsrc->bo,
1262 PAN_BO_ACCESS_SHARED |
1263 PAN_BO_ACCESS_READ |
1264 PAN_BO_ACCESS_VERTEX_TILER);
1265
1266 /* Mask off lower bits, see offset fixup below */
1267 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1268 mali_ptr addr = raw_addr & ~63;
1269
1270 /* Since we advanced the base pointer, we shrink the buffer
1271 * size, but add the offset we subtracted */
1272 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1273 - buf->buffer_offset;
1274
1275 /* When there is a divisor, the hardware-level divisor is
1276 * the product of the instance divisor and the padded count */
1277 unsigned divisor = elem->instance_divisor;
1278 unsigned hw_divisor = ctx->padded_count * divisor;
1279 unsigned stride = buf->stride;
1280
1281 /* If there's a divisor(=1) but no instancing, we want every
1282 * attribute to be the same */
1283
1284 if (divisor && ctx->instance_count == 1)
1285 stride = 0;
1286
1287 if (!divisor || ctx->instance_count <= 1) {
1288 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1289 if (ctx->instance_count > 1)
1290 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1291
1292 cfg.pointer = addr;
1293 cfg.stride = stride;
1294 cfg.size = size;
1295 cfg.divisor_r = instance_shift;
1296 cfg.divisor_p = instance_odd;
1297 }
1298 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1299 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1300 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1301 cfg.pointer = addr;
1302 cfg.stride = stride;
1303 cfg.size = size;
1304 cfg.divisor_r = __builtin_ctz(hw_divisor);
1305 }
1306
1307 } else {
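                        /* Non-power-of-two divisor: rather than a true divide,
                         * the hardware applies what is effectively a
                         * multiply-and-shift ("magic number") reciprocal.
                         * panfrost_compute_magic_divisor derives the
                         * multiplier, shift and extra flag; the NPOT
                         * continuation record below carries the rest. */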
1308 unsigned shift = 0, extra_flags = 0;
1309
1310 unsigned magic_divisor =
1311 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1312
1313 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1314 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1315 cfg.pointer = addr;
1316 cfg.stride = stride;
1317 cfg.size = size;
1318
1319 cfg.divisor_r = shift;
1320 cfg.divisor_e = extra_flags;
1321 }
1322
1323 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1324 cfg.divisor_numerator = magic_divisor;
1325 cfg.divisor = divisor;
1326 }
1327
1328 ++k;
1329 }
1330
1331 ++k;
1332 }
1333
1334 /* Add special gl_VertexID/gl_InstanceID buffers */
1335
1336 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1337 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1338
1339 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1340 cfg.buffer_index = k++;
1341 cfg.format = so->formats[PAN_VERTEX_ID];
1342 }
1343
1344 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1345
1346 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1347 cfg.buffer_index = k++;
1348 cfg.format = so->formats[PAN_INSTANCE_ID];
1349 }
1350 }
1351
1352 /* Attribute addresses require 64-byte alignment, so let:
1353 *
1354 * base' = base & ~63 = base - (base & 63)
1355 * offset' = offset + (base & 63)
1356 *
1357 * Since base' + offset' = base + offset, these are equivalent
1358 * addressing modes and now base is 64 aligned.
1359 */
1360
1361 unsigned start = vertex_postfix->offset_start;
1362
1363 for (unsigned i = 0; i < so->num_elements; ++i) {
1364 unsigned vbi = so->pipe[i].vertex_buffer_index;
1365 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1366
1367 /* Adjust by the masked off bits of the offset. Make sure we
1368 * read src_offset from so->hw (which is not GPU visible)
1369 * rather than target (which is) due to caching effects */
1370
1371 unsigned src_offset = so->pipe[i].src_offset;
1372
1373 /* BOs aligned to 4k so guaranteed aligned to 64 */
1374 src_offset += (buf->buffer_offset & 63);
1375
1376                 /* Also, somewhat obscurely, per-instance data needs to be
1377 * offset in response to a delayed start in an indexed draw */
1378
1379 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1380 src_offset -= buf->stride * start;
1381
1382 pan_pack(out + i, ATTRIBUTE, cfg) {
1383 cfg.buffer_index = attrib_to_buffer[i];
1384 cfg.format = so->formats[i];
1385 cfg.offset = src_offset;
1386 }
1387 }
1388
1389 vertex_postfix->attributes = S.gpu;
1390 vertex_postfix->attribute_meta = T.gpu;
1391 }
1392
1393 static mali_ptr
1394 panfrost_emit_varyings(struct panfrost_batch *batch,
1395 struct mali_attribute_buffer_packed *slot,
1396 unsigned stride, unsigned count)
1397 {
1398 unsigned size = stride * count;
1399 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1400
1401 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1402 cfg.stride = stride;
1403 cfg.size = size;
1404 cfg.pointer = ptr;
1405 }
1406
1407 return ptr;
1408 }
1409
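/* Sub-64-byte misalignment of a streamout record: stride is in dwords (as in
 * pipe_stream_output_info), hence the * 4, and offset is the number of
 * records already written. Buffer pointers must be 64-byte aligned, so this
 * remainder is instead folded into the varying's attribute offset. */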
1410 static unsigned
1411 panfrost_streamout_offset(unsigned stride, unsigned offset,
1412 struct pipe_stream_output_target *target)
1413 {
1414 return (target->buffer_offset + (offset * stride * 4)) & 63;
1415 }
1416
1417 static void
1418 panfrost_emit_streamout(struct panfrost_batch *batch,
1419 struct mali_attribute_buffer_packed *slot,
1420 unsigned stride_words, unsigned offset, unsigned count,
1421 struct pipe_stream_output_target *target)
1422 {
1423 unsigned stride = stride_words * 4;
1424 unsigned max_size = target->buffer_size;
1425 unsigned expected_size = stride * count;
1426
1427 /* Grab the BO and bind it to the batch */
1428 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1429
1430 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1431 * the perspective of the TILER and FRAGMENT.
1432 */
1433 panfrost_batch_add_bo(batch, bo,
1434 PAN_BO_ACCESS_SHARED |
1435 PAN_BO_ACCESS_RW |
1436 PAN_BO_ACCESS_VERTEX_TILER |
1437 PAN_BO_ACCESS_FRAGMENT);
1438
1439 /* We will have an offset applied to get alignment */
1440 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1441
1442 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1443 cfg.pointer = (addr & ~63);
1444 cfg.stride = stride;
1445 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1446 }
1447 }
1448
1449 static bool
1450 has_point_coord(unsigned mask, gl_varying_slot loc)
1451 {
1452 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1453 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1454 else if (loc == VARYING_SLOT_PNTC)
1455 return (mask & (1 << 8));
1456 else
1457 return false;
1458 }
1459
1460 /* Helpers for manipulating stream out information so we can pack varyings
1461 * accordingly. Compute the src_offset for a given captured varying */
1462
1463 static struct pipe_stream_output *
1464 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1465 {
1466 for (unsigned i = 0; i < info->num_outputs; ++i) {
1467 if (info->output[i].register_index == loc)
1468 return &info->output[i];
1469 }
1470
1471 unreachable("Varying not captured");
1472 }
1473
1474 static unsigned
1475 pan_varying_size(enum mali_format fmt)
1476 {
1477 unsigned type = MALI_EXTRACT_TYPE(fmt);
1478 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1479 unsigned bits = MALI_EXTRACT_BITS(fmt);
1480 unsigned bpc = 0;
1481
1482 if (bits == MALI_CHANNEL_FLOAT) {
1483 /* No doubles */
1484 bool fp16 = (type == MALI_FORMAT_SINT);
1485 assert(fp16 || (type == MALI_FORMAT_UNORM));
1486
1487 bpc = fp16 ? 2 : 4;
1488 } else {
1489 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1490
1491 /* See the enums */
1492 bits = 1 << bits;
1493 assert(bits >= 8);
1494 bpc = bits / 8;
1495 }
1496
1497 return bpc * chan;
1498 }
1499
1500 /* Indices for named (non-XFB) varyings that are present. These are packed
1501 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1502  * PAN_VARY_*). This has the nice property that you can look up the buffer index
1503 * of a given special field given a shift S by:
1504 *
1505 * idx = popcount(P & ((1 << S) - 1))
1506 *
1507  * That is, look at all of the special varyings that come earlier and count them;
1508  * the count is this one's buffer index. Likewise, the total number of special
1509  * buffers required is simply popcount(P).
1510 */
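/* For example, with present = GENERAL | POSITION | PNTCOORD = 0b1011, the
 * point coordinate buffer (S = 3) sits at index popcount(0b1011 & 0b0111) =
 * popcount(0b0011) = 2, and pan_xfb_base(present) = 3 buffers precede any
 * XFB targets. */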
1511
1512 enum pan_special_varying {
1513 PAN_VARY_GENERAL = 0,
1514 PAN_VARY_POSITION = 1,
1515 PAN_VARY_PSIZ = 2,
1516 PAN_VARY_PNTCOORD = 3,
1517 PAN_VARY_FACE = 4,
1518 PAN_VARY_FRAGCOORD = 5,
1519
1520 /* Keep last */
1521 PAN_VARY_MAX,
1522 };
1523
1524 /* Given a varying, figure out which index it corresponds to */
1525
1526 static inline unsigned
1527 pan_varying_index(unsigned present, enum pan_special_varying v)
1528 {
1529 unsigned mask = (1 << v) - 1;
1530 return util_bitcount(present & mask);
1531 }
1532
1533 /* Get the base offset for XFB buffers, which by convention come after
1534 * everything else. Wrapper function for semantic reasons; by construction this
1535 * is just popcount. */
1536
1537 static inline unsigned
1538 pan_xfb_base(unsigned present)
1539 {
1540 return util_bitcount(present);
1541 }
1542
1543 /* Computes the present mask for varyings so we can start emitting varying records */
1544
1545 static inline unsigned
1546 pan_varying_present(
1547 struct panfrost_shader_state *vs,
1548 struct panfrost_shader_state *fs,
1549 unsigned quirks)
1550 {
1551 /* At the moment we always emit general and position buffers. Not
1552 * strictly necessary but usually harmless */
1553
1554 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1555
1556 /* Enable special buffers by the shader info */
1557
1558 if (vs->writes_point_size)
1559 present |= (1 << PAN_VARY_PSIZ);
1560
1561 if (fs->reads_point_coord)
1562 present |= (1 << PAN_VARY_PNTCOORD);
1563
1564 if (fs->reads_face)
1565 present |= (1 << PAN_VARY_FACE);
1566
1567 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1568 present |= (1 << PAN_VARY_FRAGCOORD);
1569
1570 /* Also, if we have a point sprite, we need a point coord buffer */
1571
1572 for (unsigned i = 0; i < fs->varying_count; i++) {
1573 gl_varying_slot loc = fs->varyings_loc[i];
1574
1575 if (has_point_coord(fs->point_sprite_mask, loc))
1576 present |= (1 << PAN_VARY_PNTCOORD);
1577 }
1578
1579 return present;
1580 }
1581
1582 /* Emitters for varying records */
1583
1584 static void
1585 pan_emit_vary(struct mali_attribute_packed *out,
1586 unsigned present, enum pan_special_varying buf,
1587 unsigned quirks, enum mali_format format,
1588 unsigned offset)
1589 {
1590 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1591 unsigned swizzle = quirks & HAS_SWIZZLES ?
1592 panfrost_get_default_swizzle(nr_channels) :
1593 panfrost_bifrost_swizzle(nr_channels);
1594
1595 pan_pack(out, ATTRIBUTE, cfg) {
1596 cfg.buffer_index = pan_varying_index(present, buf);
1597 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1598 cfg.format = (format << 12) | swizzle;
1599 cfg.offset = offset;
1600 }
1601 }
1602
1603 /* General varying that is unused */
1604
1605 static void
1606 pan_emit_vary_only(struct mali_attribute_packed *out,
1607 unsigned present, unsigned quirks)
1608 {
1609 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1610 }
1611
1612 /* Special records */
1613
1614 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1615 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1616 [PAN_VARY_PSIZ] = MALI_R16F,
1617 [PAN_VARY_PNTCOORD] = MALI_R16F,
1618 [PAN_VARY_FACE] = MALI_R32I,
1619 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1620 };
1621
1622 static void
1623 pan_emit_vary_special(struct mali_attribute_packed *out,
1624 unsigned present, enum pan_special_varying buf,
1625 unsigned quirks)
1626 {
1627 assert(buf < PAN_VARY_MAX);
1628 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1629 }
1630
1631 static enum mali_format
1632 pan_xfb_format(enum mali_format format, unsigned nr)
1633 {
1634 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1635 return MALI_R32F | MALI_NR_CHANNELS(nr);
1636 else
1637 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1638 }
1639
1640 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1641 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1642 * value. */
1643
1644 static void
1645 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1646 unsigned present,
1647 unsigned max_xfb,
1648 unsigned *streamout_offsets,
1649 unsigned quirks,
1650 enum mali_format format,
1651 struct pipe_stream_output o)
1652 {
1653 unsigned swizzle = quirks & HAS_SWIZZLES ?
1654 panfrost_get_default_swizzle(o.num_components) :
1655 panfrost_bifrost_swizzle(o.num_components);
1656
1657 pan_pack(out, ATTRIBUTE, cfg) {
1658 /* XFB buffers come after everything else */
1659 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1660 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1661
1662 /* Override number of channels and precision to highp */
1663 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1664
1665 /* Apply given offsets together */
1666 cfg.offset = (o.dst_offset * 4) /* dwords */
1667 + streamout_offsets[o.output_buffer];
1668 }
1669 }
1670
1671 /* Determine if we should capture a varying for XFB. This requires actually
1672  * having a buffer for it. If we don't capture it, we'll fall back to a general
1673 * varying path (linked or unlinked, possibly discarding the write) */
1674
1675 static bool
1676 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1677 unsigned loc, unsigned max_xfb)
1678 {
1679 if (!(xfb->so_mask & (1ll << loc)))
1680 return false;
1681
1682 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1683 return o->output_buffer < max_xfb;
1684 }
1685
1686 static void
1687 pan_emit_general_varying(struct mali_attribute_packed *out,
1688 struct panfrost_shader_state *other,
1689 struct panfrost_shader_state *xfb,
1690 gl_varying_slot loc,
1691 enum mali_format format,
1692 unsigned present,
1693 unsigned quirks,
1694 unsigned *gen_offsets,
1695 enum mali_format *gen_formats,
1696 unsigned *gen_stride,
1697 unsigned idx,
1698 bool should_alloc)
1699 {
1700 /* Check if we're linked */
1701 signed other_idx = -1;
1702
1703 for (unsigned j = 0; j < other->varying_count; ++j) {
1704 if (other->varyings_loc[j] == loc) {
1705 other_idx = j;
1706 break;
1707 }
1708 }
1709
1710 if (other_idx < 0) {
1711 pan_emit_vary_only(out, present, quirks);
1712 return;
1713 }
1714
1715 unsigned offset = gen_offsets[other_idx];
1716
1717 if (should_alloc) {
1718 /* We're linked, so allocate a space via a watermark allocation */
1719 enum mali_format alt = other->varyings[other_idx];
1720
1721 /* Do interpolation at minimum precision */
1722 unsigned size_main = pan_varying_size(format);
1723 unsigned size_alt = pan_varying_size(alt);
1724 unsigned size = MIN2(size_main, size_alt);
1725
1726 /* If a varying is marked for XFB but not actually captured, we
1727 * should match the format to the format that would otherwise
1728 * be used for XFB, since dEQP checks for invariance here. It's
1729 * unclear if this is required by the spec. */
1730
1731 if (xfb->so_mask & (1ull << loc)) {
1732 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1733 format = pan_xfb_format(format, o->num_components);
1734 size = pan_varying_size(format);
1735 } else if (size == size_alt) {
1736 format = alt;
1737 }
1738
1739 gen_offsets[idx] = *gen_stride;
1740 gen_formats[other_idx] = format;
1741 offset = *gen_stride;
1742 *gen_stride += size;
1743 }
1744
1745 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1746 }
1747
1748 /* Higher-level wrapper around the helpers above, classifying a varying into
1749 * one of the cases they handle */
1750
1751 static void
1752 panfrost_emit_varying(
1753 struct mali_attribute_packed *out,
1754 struct panfrost_shader_state *stage,
1755 struct panfrost_shader_state *other,
1756 struct panfrost_shader_state *xfb,
1757 unsigned present,
1758 unsigned max_xfb,
1759 unsigned *streamout_offsets,
1760 unsigned quirks,
1761 unsigned *gen_offsets,
1762 enum mali_format *gen_formats,
1763 unsigned *gen_stride,
1764 unsigned idx,
1765 bool should_alloc,
1766 bool is_fragment)
1767 {
1768 gl_varying_slot loc = stage->varyings_loc[idx];
1769 enum mali_format format = stage->varyings[idx];
1770
1771 /* Override format to match linkage */
1772 if (!should_alloc && gen_formats[idx])
1773 format = gen_formats[idx];
1774
1775 if (has_point_coord(stage->point_sprite_mask, loc)) {
1776 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1777 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1778 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1779 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1780 } else if (loc == VARYING_SLOT_POS) {
1781 if (is_fragment)
1782 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1783 else
1784 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1785 } else if (loc == VARYING_SLOT_PSIZ) {
1786 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1787 } else if (loc == VARYING_SLOT_PNTC) {
1788 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1789 } else if (loc == VARYING_SLOT_FACE) {
1790 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1791 } else {
1792 pan_emit_general_varying(out, other, xfb, loc, format, present,
1793 quirks, gen_offsets, gen_formats, gen_stride,
1794 idx, should_alloc);
1795 }
1796 }
1797
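/* Emit an attribute buffer descriptor in "special" mode for a
 * hardware-generated input (point coordinate, front-facing, fragment
 * coordinate), provided that slot is present in the varying set */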
1798 static void
1799 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1800 unsigned present,
1801 enum pan_special_varying v,
1802 unsigned special)
1803 {
1804 if (present & (1 << v)) {
1805 unsigned idx = pan_varying_index(present, v);
1806
1807 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1808 cfg.special = special;
1809 cfg.type = 0;
1810 }
1811 }
1812 }
1813
1814 void
1815 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1816 unsigned vertex_count,
1817 struct mali_vertex_tiler_postfix *vertex_postfix,
1818 struct mali_vertex_tiler_postfix *tiler_postfix,
1819 union midgard_primitive_size *primitive_size)
1820 {
1821 /* Load the shaders */
1822 struct panfrost_context *ctx = batch->ctx;
1823 struct panfrost_device *dev = pan_device(ctx->base.screen);
1824 struct panfrost_shader_state *vs, *fs;
1825 size_t vs_size, fs_size;
1826
1827 /* Allocate the varying descriptor */
1828
1829 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1830 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1831 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1832 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1833
1834 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1835 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1836
1837 struct pipe_stream_output_info *so = &vs->stream_output;
1838 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1839
1840 /* Check whether each varying is linked by us. That is the case for
1841 * general-purpose, non-captured varyings; if so, link it here. If it is
1842 * not, use the provided stream-out information to determine the
1843 * offset, since it was already linked for us. */
1844
1845 unsigned gen_offsets[32];
1846 enum mali_format gen_formats[32];
1847 memset(gen_offsets, 0, sizeof(gen_offsets));
1848 memset(gen_formats, 0, sizeof(gen_formats));
1849
1850 unsigned gen_stride = 0;
1851 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1852 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1853
1854 unsigned streamout_offsets[32];
1855
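/* Precompute the current write offset into each bound stream-output
 * target, so each captured varying only has to add its own destination
 * offset on top */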
1856 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1857 streamout_offsets[i] = panfrost_streamout_offset(
1858 so->stride[i],
1859 ctx->streamout.offsets[i],
1860 ctx->streamout.targets[i]);
1861 }
1862
1863 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1864 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1865
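/* Emit ATTRIBUTE records for the vertex shader outputs first, allocating
 * general varying space as we go (should_alloc = true), then for the
 * fragment shader inputs, which reuse the offsets and formats linked above */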
1866 for (unsigned i = 0; i < vs->varying_count; i++) {
1867 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1868 ctx->streamout.num_targets, streamout_offsets,
1869 dev->quirks,
1870 gen_offsets, gen_formats, &gen_stride, i, true, false);
1871 }
1872
1873 for (unsigned i = 0; i < fs->varying_count; i++) {
1874 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1875 ctx->streamout.num_targets, streamout_offsets,
1876 dev->quirks,
1877 gen_offsets, gen_formats, &gen_stride, i, false, true);
1878 }
1879
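/* Allocate the attribute buffer descriptors: the general and special
 * varying buffers come first, followed by one descriptor per bound
 * stream-output target starting at xfb_base */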
1880 unsigned xfb_base = pan_xfb_base(present);
1881 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1882 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1883 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1884 struct mali_attribute_buffer_packed *varyings =
1885 (struct mali_attribute_buffer_packed *) T.cpu;
1886
1887 /* Emit the stream out buffers */
1888
1889 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1890 ctx->vertex_count);
1891
1892 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1893 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1894 so->stride[i],
1895 ctx->streamout.offsets[i],
1896 out_count,
1897 ctx->streamout.targets[i]);
1898 }
1899
1900 panfrost_emit_varyings(batch,
1901 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1902 gen_stride, vertex_count);
1903
1904 /* fp32 vec4 gl_Position */
1905 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1906 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1907 sizeof(float) * 4, vertex_count);
1908
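/* gl_PointSize: 2 bytes (presumably fp16) per vertex, when present */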
1909 if (present & (1 << PAN_VARY_PSIZ)) {
1910 primitive_size->pointer = panfrost_emit_varyings(batch,
1911 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1912 2, vertex_count);
1913 }
1914
1915 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1916 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1917 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1918
1919 vertex_postfix->varyings = T.gpu;
1920 tiler_postfix->varyings = T.gpu;
1921
1922 vertex_postfix->varying_meta = trans.gpu;
1923 tiler_postfix->varying_meta = trans.gpu + vs_size;
1924 }
1925
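/* Build the vertex/tiler payloads (Bifrost or Midgard flavour) and queue
 * them: a vertex job, then a tiler job depending on it. Rasterizer discard
 * skips the tiler job; wallpaper blits are instead injected with predicted
 * job indices. */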
1926 void
1927 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1928 struct mali_vertex_tiler_prefix *vertex_prefix,
1929 struct mali_vertex_tiler_postfix *vertex_postfix,
1930 struct mali_vertex_tiler_prefix *tiler_prefix,
1931 struct mali_vertex_tiler_postfix *tiler_postfix,
1932 union midgard_primitive_size *primitive_size)
1933 {
1934 struct panfrost_context *ctx = batch->ctx;
1935 struct panfrost_device *device = pan_device(ctx->base.screen);
1936 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1937 struct bifrost_payload_vertex bifrost_vertex = {0,};
1938 struct bifrost_payload_tiler bifrost_tiler = {0,};
1939 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1940 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1941 void *vp, *tp;
1942 size_t vp_size, tp_size;
1943
1944 if (device->quirks & IS_BIFROST) {
1945 bifrost_vertex.prefix = *vertex_prefix;
1946 bifrost_vertex.postfix = *vertex_postfix;
1947 vp = &bifrost_vertex;
1948 vp_size = sizeof(bifrost_vertex);
1949
1950 bifrost_tiler.prefix = *tiler_prefix;
1951 bifrost_tiler.tiler.primitive_size = *primitive_size;
1952 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1953 bifrost_tiler.postfix = *tiler_postfix;
1954 tp = &bifrost_tiler;
1955 tp_size = sizeof(bifrost_tiler);
1956 } else {
1957 midgard_vertex.prefix = *vertex_prefix;
1958 midgard_vertex.postfix = *vertex_postfix;
1959 vp = &midgard_vertex;
1960 vp_size = sizeof(midgard_vertex);
1961
1962 midgard_tiler.prefix = *tiler_prefix;
1963 midgard_tiler.postfix = *tiler_postfix;
1964 midgard_tiler.primitive_size = *primitive_size;
1965 tp = &midgard_tiler;
1966 tp_size = sizeof(midgard_tiler);
1967 }
1968
1969 if (wallpapering) {
1970 /* Inject in reverse order, with "predicted" job indices.
1971 * THIS IS A HACK XXX */
1972 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1973 batch->scoreboard.job_index + 2, tp, tp_size, true);
1974 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1975 vp, vp_size, true);
1976 return;
1977 }
1978
1979 /* If rasterizer discard is enabled, only submit the vertex job */
1980
1981 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1982 vp, vp_size, false);
1983
1984 if (ctx->rasterizer->base.rasterizer_discard)
1985 return;
1986
1987 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1988 false);
1989 }
1990
1991 /* TODO: stop hardcoding this */
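/* 48 (x, y) pairs of 16-bit values, uploaded verbatim; presumably sample
 * positions in 1/256th-of-a-pixel units */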
1992 mali_ptr
1993 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1994 {
1995 uint16_t locations[] = {
1996 128, 128,
1997 0, 256,
1998 0, 256,
1999 0, 256,
2000 0, 256,
2001 0, 256,
2002 0, 256,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 128, 128,
2029 0, 0,
2030 0, 0,
2031 0, 0,
2032 0, 0,
2033 0, 0,
2034 0, 0,
2035 0, 0,
2036 0, 0,
2037 0, 0,
2038 0, 0,
2039 0, 0,
2040 0, 0,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 };
2045
2046 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2047 }