2 * Copyright © 2014-2017 Broadcom
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 #include "util/u_blitter.h"
25 #include "util/u_prim.h"
26 #include "util/format/u_format.h"
27 #include "util/u_pack_color.h"
28 #include "util/u_prim_restart.h"
29 #include "util/u_upload_mgr.h"
30 #include "indices/u_primconvert.h"
32 #include "v3d_context.h"
33 #include "v3d_resource.h"
35 #include "broadcom/compiler/v3d_compiler.h"
36 #include "broadcom/common/v3d_macros.h"
37 #include "broadcom/cle/v3dx_pack.h"
40 * Does the initial binning command list setup for drawing to a given FBO.
43 v3d_start_draw(struct v3d_context
*v3d
)
45 struct v3d_job
*job
= v3d
->job
;
50 /* Get space to emit our BCL state, using a branch to jump to a new BO
53 v3d_cl_ensure_space_with_branch(&job
->bcl
, 256 /* XXX */);
55 job
->submit
.bcl_start
= job
->bcl
.bo
->offset
;
56 v3d_job_add_bo(job
, job
->bcl
.bo
);
58 uint32_t fb_layers
= util_framebuffer_get_num_layers(&v3d
->framebuffer
);
60 /* The PTB will request the tile alloc initial size per tile at start
63 uint32_t tile_alloc_size
=
64 MAX2(fb_layers
, 1) * job
->draw_tiles_x
* job
->draw_tiles_y
* 64;
66 /* The PTB allocates in aligned 4k chunks after the initial setup. */
67 tile_alloc_size
= align(tile_alloc_size
, 4096);
69 /* Include the first two chunk allocations that the PTB does so that
70 * we definitely clear the OOM condition before triggering one (the HW
71 * won't trigger OOM during the first allocations).
73 tile_alloc_size
+= 8192;
75 /* For performance, allocate some extra initial memory after the PTB's
76 * minimal allocations, so that we hopefully don't have to block the
77 * GPU on the kernel handling an OOM signal.
79 tile_alloc_size
+= 512 * 1024;
81 job
->tile_alloc
= v3d_bo_alloc(v3d
->screen
, tile_alloc_size
,
83 uint32_t tsda_per_tile_size
= v3d
->screen
->devinfo
.ver
>= 40 ? 256 : 64;
84 job
->tile_state
= v3d_bo_alloc(v3d
->screen
,
91 /* This must go before the binning mode configuration. It is
92 * required for layered framebuffers to work.
95 cl_emit(&job
->bcl
, NUMBER_OF_LAYERS
, config
) {
96 config
.number_of_layers
= fb_layers
;
101 #if V3D_VERSION >= 40
102 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG
, config
) {
103 config
.width_in_pixels
= v3d
->framebuffer
.width
;
104 config
.height_in_pixels
= v3d
->framebuffer
.height
;
105 config
.number_of_render_targets
=
106 MAX2(v3d
->framebuffer
.nr_cbufs
, 1);
108 config
.multisample_mode_4x
= job
->msaa
;
110 config
.maximum_bpp_of_all_render_targets
= job
->internal_bpp
;
112 #else /* V3D_VERSION < 40 */
113 /* "Binning mode lists start with a Tile Binning Mode Configuration
116 * Part1 signals the end of binning config setup.
118 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG_PART2
, config
) {
119 config
.tile_allocation_memory_address
=
120 cl_address(job
->tile_alloc
, 0);
121 config
.tile_allocation_memory_size
= job
->tile_alloc
->size
;
124 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG_PART1
, config
) {
125 config
.tile_state_data_array_base_address
=
126 cl_address(job
->tile_state
, 0);
128 config
.width_in_tiles
= job
->draw_tiles_x
;
129 config
.height_in_tiles
= job
->draw_tiles_y
;
131 config
.number_of_render_targets
=
132 MAX2(v3d
->framebuffer
.nr_cbufs
, 1);
134 config
.multisample_mode_4x
= job
->msaa
;
136 config
.maximum_bpp_of_all_render_targets
= job
->internal_bpp
;
138 #endif /* V3D_VERSION < 40 */
140 /* There's definitely nothing in the VCD cache we want. */
141 cl_emit(&job
->bcl
, FLUSH_VCD_CACHE
, bin
);
143 /* Disable any leftover OQ state from another job. */
144 cl_emit(&job
->bcl
, OCCLUSION_QUERY_COUNTER
, counter
);
146 /* "Binning mode lists must have a Start Tile Binning item (6) after
147 * any prefix state data before the binning list proper starts."
149 cl_emit(&job
->bcl
, START_TILE_BINNING
, bin
);
151 job
->needs_flush
= true;
152 job
->draw_width
= v3d
->framebuffer
.width
;
153 job
->draw_height
= v3d
->framebuffer
.height
;
154 job
->num_layers
= fb_layers
;
158 v3d_predraw_check_stage_inputs(struct pipe_context
*pctx
,
159 enum pipe_shader_type s
)
161 struct v3d_context
*v3d
= v3d_context(pctx
);
163 /* Flush writes to textures we're sampling. */
164 for (int i
= 0; i
< v3d
->tex
[s
].num_textures
; i
++) {
165 struct pipe_sampler_view
*pview
= v3d
->tex
[s
].textures
[i
];
168 struct v3d_sampler_view
*view
= v3d_sampler_view(pview
);
170 if (view
->texture
!= view
->base
.texture
&&
171 view
->base
.format
!= PIPE_FORMAT_X32_S8X24_UINT
)
172 v3d_update_shadow_texture(pctx
, &view
->base
);
174 v3d_flush_jobs_writing_resource(v3d
, view
->texture
,
176 s
== PIPE_SHADER_COMPUTE
);
179 /* Flush writes to UBOs. */
180 foreach_bit(i
, v3d
->constbuf
[s
].enabled_mask
) {
181 struct pipe_constant_buffer
*cb
= &v3d
->constbuf
[s
].cb
[i
];
183 v3d_flush_jobs_writing_resource(v3d
, cb
->buffer
,
185 s
== PIPE_SHADER_COMPUTE
);
189 /* Flush reads/writes to our SSBOs */
190 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
191 struct pipe_shader_buffer
*sb
= &v3d
->ssbo
[s
].sb
[i
];
193 v3d_flush_jobs_reading_resource(v3d
, sb
->buffer
,
194 V3D_FLUSH_NOT_CURRENT_JOB
,
195 s
== PIPE_SHADER_COMPUTE
);
199 /* Flush reads/writes to our image views */
200 foreach_bit(i
, v3d
->shaderimg
[s
].enabled_mask
) {
201 struct v3d_image_view
*view
= &v3d
->shaderimg
[s
].si
[i
];
203 v3d_flush_jobs_reading_resource(v3d
, view
->base
.resource
,
204 V3D_FLUSH_NOT_CURRENT_JOB
,
205 s
== PIPE_SHADER_COMPUTE
);
208 /* Flush writes to our vertex buffers (i.e. from transform feedback) */
209 if (s
== PIPE_SHADER_VERTEX
) {
210 foreach_bit(i
, v3d
->vertexbuf
.enabled_mask
) {
211 struct pipe_vertex_buffer
*vb
= &v3d
->vertexbuf
.vb
[i
];
213 v3d_flush_jobs_writing_resource(v3d
, vb
->buffer
.resource
,
221 v3d_predraw_check_outputs(struct pipe_context
*pctx
)
223 struct v3d_context
*v3d
= v3d_context(pctx
);
225 /* Flush jobs reading from TF buffers that we are about to write. */
226 if (v3d_transform_feedback_enabled(v3d
)) {
227 struct v3d_streamout_stateobj
*so
= &v3d
->streamout
;
229 for (int i
= 0; i
< so
->num_targets
; i
++) {
233 const struct pipe_stream_output_target
*target
=
235 v3d_flush_jobs_reading_resource(v3d
, target
->buffer
,
243 * Checks if the state for the current draw reads a particular resource
244 * in the given shader stage.
247 v3d_state_reads_resource(struct v3d_context
*v3d
,
248 struct pipe_resource
*prsc
,
249 enum pipe_shader_type s
)
251 struct v3d_resource
*rsc
= v3d_resource(prsc
);
254 if (s
== PIPE_SHADER_VERTEX
) {
255 foreach_bit(i
, v3d
->vertexbuf
.enabled_mask
) {
256 struct pipe_vertex_buffer
*vb
= &v3d
->vertexbuf
.vb
[i
];
257 if (!vb
->buffer
.resource
)
260 struct v3d_resource
*vb_rsc
=
261 v3d_resource(vb
->buffer
.resource
);
262 if (rsc
->bo
== vb_rsc
->bo
)
267 /* Constant buffers */
268 foreach_bit(i
, v3d
->constbuf
[s
].enabled_mask
) {
269 struct pipe_constant_buffer
*cb
= &v3d
->constbuf
[s
].cb
[i
];
273 struct v3d_resource
*cb_rsc
= v3d_resource(cb
->buffer
);
274 if (rsc
->bo
== cb_rsc
->bo
)
278 /* Shader storage buffers */
279 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
280 struct pipe_shader_buffer
*sb
= &v3d
->ssbo
[s
].sb
[i
];
284 struct v3d_resource
*sb_rsc
= v3d_resource(sb
->buffer
);
285 if (rsc
->bo
== sb_rsc
->bo
)
290 for (int i
= 0; i
< v3d
->tex
[s
].num_textures
; i
++) {
291 struct pipe_sampler_view
*pview
= v3d
->tex
[s
].textures
[i
];
295 struct v3d_sampler_view
*view
= v3d_sampler_view(pview
);
296 struct v3d_resource
*v_rsc
= v3d_resource(view
->texture
);
297 if (rsc
->bo
== v_rsc
->bo
)
305 v3d_emit_wait_for_tf(struct v3d_job
*job
)
307 /* XXX: we might be able to skip this in some cases, for now we
310 cl_emit(&job
->bcl
, FLUSH_TRANSFORM_FEEDBACK_DATA
, flush
);
312 cl_emit(&job
->bcl
, WAIT_FOR_TRANSFORM_FEEDBACK
, wait
) {
313 /* XXX: Wait for all outstanding writes... maybe we can do
314 * better in some cases.
316 wait
.block_count
= 255;
319 /* We have just flushed all our outstanding TF work in this job so make
320 * sure we don't emit TF flushes again for any of it again.
322 _mesa_set_clear(job
->tf_write_prscs
, NULL
);
326 v3d_emit_wait_for_tf_if_needed(struct v3d_context
*v3d
, struct v3d_job
*job
)
328 if (!job
->tf_enabled
)
331 set_foreach(job
->tf_write_prscs
, entry
) {
332 struct pipe_resource
*prsc
= (struct pipe_resource
*)entry
->key
;
333 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++) {
334 /* Fragment shaders can only start executing after all
335 * binning (and thus TF) is complete.
337 * XXX: For VS/GS/TES, if the binning shader does not
338 * read the resource then we could also avoid emitting
341 if (s
== PIPE_SHADER_FRAGMENT
)
344 if (v3d_state_reads_resource(v3d
, prsc
, s
)) {
345 v3d_emit_wait_for_tf(job
);
362 #if V3D_VERSION >= 41
364 v3d_emit_gs_state_record(struct v3d_job
*job
,
365 struct v3d_compiled_shader
*gs_bin
,
366 struct v3d_cl_reloc gs_bin_uniforms
,
367 struct v3d_compiled_shader
*gs
,
368 struct v3d_cl_reloc gs_render_uniforms
)
370 cl_emit(&job
->indirect
, GEOMETRY_SHADER_STATE_RECORD
, shader
) {
371 shader
.geometry_bin_mode_shader_code_address
=
372 cl_address(v3d_resource(gs_bin
->resource
)->bo
,
374 shader
.geometry_bin_mode_shader_4_way_threadable
=
375 gs_bin
->prog_data
.gs
->base
.threads
== 4;
376 shader
.geometry_bin_mode_shader_start_in_final_thread_section
=
377 gs_bin
->prog_data
.gs
->base
.single_seg
;
378 shader
.geometry_bin_mode_shader_propagate_nans
= true;
379 shader
.geometry_bin_mode_shader_uniforms_address
=
382 shader
.geometry_render_mode_shader_code_address
=
383 cl_address(v3d_resource(gs
->resource
)->bo
, gs
->offset
);
384 shader
.geometry_render_mode_shader_4_way_threadable
=
385 gs
->prog_data
.gs
->base
.threads
== 4;
386 shader
.geometry_render_mode_shader_start_in_final_thread_section
=
387 gs
->prog_data
.gs
->base
.single_seg
;
388 shader
.geometry_render_mode_shader_propagate_nans
= true;
389 shader
.geometry_render_mode_shader_uniforms_address
=
395 v3d_gs_output_primitive(uint32_t prim_type
)
399 return GEOMETRY_SHADER_POINTS
;
401 return GEOMETRY_SHADER_LINE_STRIP
;
402 case GL_TRIANGLE_STRIP
:
403 return GEOMETRY_SHADER_TRI_STRIP
;
405 unreachable("Unsupported primitive type");
410 v3d_emit_tes_gs_common_params(struct v3d_job
*job
,
411 uint8_t gs_out_prim_type
,
412 uint8_t gs_num_invocations
)
414 /* This, and v3d_emit_tes_gs_shader_params below, fill in default
415 * values for tessellation fields even though we don't support
416 * tessellation yet because our packing functions (and the simulator)
417 * complain if we don't.
419 cl_emit(&job
->indirect
, TESSELLATION_GEOMETRY_COMMON_PARAMS
, shader
) {
420 shader
.tessellation_type
= TESSELLATION_TYPE_TRIANGLE
;
421 shader
.tessellation_point_mode
= false;
422 shader
.tessellation_edge_spacing
= TESSELLATION_EDGE_SPACING_EVEN
;
423 shader
.tessellation_clockwise
= true;
424 shader
.tessellation_invocations
= 1;
426 shader
.geometry_shader_output_format
=
427 v3d_gs_output_primitive(gs_out_prim_type
);
428 shader
.geometry_shader_instances
= gs_num_invocations
& 0x1F;
433 simd_width_to_gs_pack_mode(uint32_t width
)
437 return V3D_PACK_MODE_16_WAY
;
439 return V3D_PACK_MODE_8_WAY
;
441 return V3D_PACK_MODE_4_WAY
;
443 return V3D_PACK_MODE_1_WAY
;
445 unreachable("Invalid SIMD width");
450 v3d_emit_tes_gs_shader_params(struct v3d_job
*job
,
452 uint32_t gs_vpm_output_size
,
453 uint32_t gs_max_vpm_input_size_per_batch
)
455 cl_emit(&job
->indirect
, TESSELLATION_GEOMETRY_SHADER_PARAMS
, shader
) {
456 shader
.tcs_batch_flush_mode
= V3D_TCS_FLUSH_MODE_FULLY_PACKED
;
457 shader
.per_patch_data_column_depth
= 1;
458 shader
.tcs_output_segment_size_in_sectors
= 1;
459 shader
.tcs_output_segment_pack_mode
= V3D_PACK_MODE_16_WAY
;
460 shader
.tes_output_segment_size_in_sectors
= 1;
461 shader
.tes_output_segment_pack_mode
= V3D_PACK_MODE_16_WAY
;
462 shader
.gs_output_segment_size_in_sectors
= gs_vpm_output_size
;
463 shader
.gs_output_segment_pack_mode
=
464 simd_width_to_gs_pack_mode(gs_simd
);
465 shader
.tbg_max_patches_per_tcs_batch
= 1;
466 shader
.tbg_max_extra_vertex_segs_for_patches_after_first
= 0;
467 shader
.tbg_min_tcs_output_segments_required_in_play
= 1;
468 shader
.tbg_min_per_patch_data_segments_required_in_play
= 1;
469 shader
.tpg_max_patches_per_tes_batch
= 1;
470 shader
.tpg_max_vertex_segments_per_tes_batch
= 0;
471 shader
.tpg_max_tcs_output_segments_per_tes_batch
= 1;
472 shader
.tpg_min_tes_output_segments_required_in_play
= 1;
473 shader
.gbg_max_tes_output_vertex_segments_per_gs_batch
=
474 gs_max_vpm_input_size_per_batch
;
475 shader
.gbg_min_gs_output_segments_required_in_play
= 1;
479 static inline uint32_t
480 compute_vpm_size_in_sectors(const struct v3d_device_info
*devinfo
)
482 assert(devinfo
->vpm_size
> 0);
483 const uint32_t sector_size
= V3D_CHANNELS
* sizeof(uint32_t) * 8;
484 return devinfo
->vpm_size
/ sector_size
;
487 /* Computes various parameters affecting VPM memory configuration for programs
488 * involving geometry shaders to ensure the program fits in memory and honors
489 * requirements described in section "VPM usage" of the programming manual.
492 compute_vpm_config_gs(struct v3d_device_info
*devinfo
,
493 struct v3d_vs_prog_data
*vs
,
494 struct v3d_gs_prog_data
*gs
,
495 struct vpm_config
*vpm_cfg_out
)
497 const uint32_t A
= vs
->separate_segments
? 1 : 0;
498 const uint32_t Ad
= vs
->vpm_input_size
;
499 const uint32_t Vd
= vs
->vpm_output_size
;
501 const uint32_t vpm_size
= compute_vpm_size_in_sectors(devinfo
);
503 /* Try to fit program into our VPM memory budget by adjusting
504 * configurable parameters iteratively. We do this in two phases:
505 * the first phase tries to fit the program into the total available
506 * VPM memory. If we succeed at that, then the second phase attempts
507 * to fit the program into half of that budget so we can run bin and
508 * render programs in parallel.
510 struct vpm_config vpm_cfg
[2];
511 struct vpm_config
*final_vpm_cfg
= NULL
;
514 vpm_cfg
[phase
].As
= 1;
515 vpm_cfg
[phase
].Gs
= 1;
516 vpm_cfg
[phase
].Gd
= gs
->vpm_output_size
;
517 vpm_cfg
[phase
].gs_width
= gs
->simd_width
;
519 /* While there is a requirement that Vc >= [Vn / 16], this is
520 * always the case when tessellation is not present because in that
521 * case Vn can only be 6 at most (when input primitive is triangles
524 * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
525 * and Broadcom has not found it worth it to increase it beyond this
526 * in general. Increasing Vc also increases VPM memory pressure which
527 * can turn up being detrimental for performance in some scenarios.
529 vpm_cfg
[phase
].Vc
= 2;
531 /* Gv is a constraint on the hardware to not exceed the
532 * specified number of vertex segments per GS batch. If adding a
533 * new primitive to a GS batch would result in a range of more
534 * than Gv vertex segments being referenced by the batch, then
535 * the hardware will flush the batch and start a new one. This
536 * means that we can choose any value we want, we just need to
537 * be aware that larger values improve GS batch utilization
538 * at the expense of more VPM memory pressure (which can affect
539 * other performance aspects, such as GS dispatch width).
540 * We start with the largest value, and will reduce it if we
541 * find that total memory pressure is too high.
543 vpm_cfg
[phase
].Gv
= 3;
545 /* When GS is present in absence of TES, then we need to satisfy
546 * that Ve >= Gv. We go with the smallest value of Ve to avoid
547 * increasing memory pressure.
549 vpm_cfg
[phase
].Ve
= vpm_cfg
[phase
].Gv
;
551 uint32_t vpm_sectors
=
552 A
* vpm_cfg
[phase
].As
* Ad
+
553 (vpm_cfg
[phase
].Vc
+ vpm_cfg
[phase
].Ve
) * Vd
+
554 vpm_cfg
[phase
].Gs
* vpm_cfg
[phase
].Gd
;
556 /* Ideally we want to use no more than half of the available
557 * memory so we can execute a bin and render program in parallel
558 * without stalls. If we achieved that then we are done.
560 if (vpm_sectors
<= vpm_size
/ 2) {
561 final_vpm_cfg
= &vpm_cfg
[phase
];
565 /* At the very least, we should not allocate more than the
566 * total available VPM memory. If we have a configuration that
567 * succeeds at this we save it and continue to see if we can
568 * meet the half-memory-use criteria too.
570 if (phase
== 0 && vpm_sectors
<= vpm_size
) {
571 vpm_cfg
[1] = vpm_cfg
[0];
575 /* Try lowering Gv */
576 if (vpm_cfg
[phase
].Gv
> 0) {
581 /* Try lowering GS dispatch width */
582 if (vpm_cfg
[phase
].gs_width
> 1) {
584 vpm_cfg
[phase
].gs_width
>>= 1;
586 align(vpm_cfg
[phase
].Gd
, 2) / 2;
587 } while (vpm_cfg
[phase
].gs_width
== 2);
589 /* Reset Gv to max after dropping dispatch width */
590 vpm_cfg
[phase
].Gv
= 3;
594 /* We ran out of options to reduce memory pressure. If we
595 * are at phase 1 we have at least a valid configuration, so we
599 final_vpm_cfg
= &vpm_cfg
[0];
603 if (!final_vpm_cfg
) {
604 /* FIXME: maybe return a boolean to indicate failure and use
605 * that to stop the submission for this draw call.
607 fprintf(stderr
, "Failed to allocate VPM memory.\n");
611 assert(final_vpm_cfg
);
612 assert(final_vpm_cfg
->Gd
<= 16);
613 assert(final_vpm_cfg
->Gv
< 4);
614 assert(final_vpm_cfg
->Ve
< 4);
615 assert(final_vpm_cfg
->Vc
>= 2 && final_vpm_cfg
->Vc
<= 4);
616 assert(final_vpm_cfg
->gs_width
== 1 ||
617 final_vpm_cfg
->gs_width
== 4 ||
618 final_vpm_cfg
->gs_width
== 8 ||
619 final_vpm_cfg
->gs_width
== 16);
621 *vpm_cfg_out
= *final_vpm_cfg
;
626 v3d_emit_gl_shader_state(struct v3d_context
*v3d
,
627 const struct pipe_draw_info
*info
)
629 struct v3d_job
*job
= v3d
->job
;
630 /* VC5_DIRTY_VTXSTATE */
631 struct v3d_vertex_stateobj
*vtx
= v3d
->vtx
;
632 /* VC5_DIRTY_VTXBUF */
633 struct v3d_vertexbuf_stateobj
*vertexbuf
= &v3d
->vertexbuf
;
635 /* Upload the uniforms to the indirect CL first */
636 struct v3d_cl_reloc fs_uniforms
=
637 v3d_write_uniforms(v3d
, job
, v3d
->prog
.fs
,
638 PIPE_SHADER_FRAGMENT
);
640 struct v3d_cl_reloc gs_uniforms
= { NULL
, 0 };
641 struct v3d_cl_reloc gs_bin_uniforms
= { NULL
, 0 };
643 gs_uniforms
= v3d_write_uniforms(v3d
, job
, v3d
->prog
.gs
,
644 PIPE_SHADER_GEOMETRY
);
646 if (v3d
->prog
.gs_bin
) {
647 gs_bin_uniforms
= v3d_write_uniforms(v3d
, job
, v3d
->prog
.gs_bin
,
648 PIPE_SHADER_GEOMETRY
);
651 struct v3d_cl_reloc vs_uniforms
=
652 v3d_write_uniforms(v3d
, job
, v3d
->prog
.vs
,
654 struct v3d_cl_reloc cs_uniforms
=
655 v3d_write_uniforms(v3d
, job
, v3d
->prog
.cs
,
658 /* Update the cache dirty flag based on the shader progs data */
659 job
->tmu_dirty_rcl
|= v3d
->prog
.cs
->prog_data
.vs
->base
.tmu_dirty_rcl
;
660 job
->tmu_dirty_rcl
|= v3d
->prog
.vs
->prog_data
.vs
->base
.tmu_dirty_rcl
;
661 if (v3d
->prog
.gs_bin
) {
662 job
->tmu_dirty_rcl
|=
663 v3d
->prog
.gs_bin
->prog_data
.gs
->base
.tmu_dirty_rcl
;
666 job
->tmu_dirty_rcl
|=
667 v3d
->prog
.gs
->prog_data
.gs
->base
.tmu_dirty_rcl
;
669 job
->tmu_dirty_rcl
|= v3d
->prog
.fs
->prog_data
.fs
->base
.tmu_dirty_rcl
;
671 /* See GFXH-930 workaround below */
672 uint32_t num_elements_to_emit
= MAX2(vtx
->num_elements
, 1);
674 uint32_t shader_state_record_length
=
675 cl_packet_length(GL_SHADER_STATE_RECORD
);
676 #if V3D_VERSION >= 41
678 shader_state_record_length
+=
679 cl_packet_length(GEOMETRY_SHADER_STATE_RECORD
) +
680 cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS
) +
681 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS
);
685 uint32_t shader_rec_offset
=
686 v3d_cl_ensure_space(&job
->indirect
,
687 shader_state_record_length
+
688 num_elements_to_emit
*
689 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD
),
692 /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
693 * compile time, so that we mostly just have to OR the VS and FS
694 * records together at draw time.
697 struct vpm_config vpm_cfg_bin
, vpm_cfg
;
699 assert(v3d
->screen
->devinfo
.ver
>= 41 || !v3d
->prog
.gs
);
703 vpm_cfg_bin
.Vc
= v3d
->prog
.cs
->prog_data
.vs
->vcm_cache_size
;
707 vpm_cfg
.Vc
= v3d
->prog
.vs
->prog_data
.vs
->vcm_cache_size
;
710 #if V3D_VERSION >= 41
711 v3d_emit_gs_state_record(v3d
->job
,
712 v3d
->prog
.gs_bin
, gs_bin_uniforms
,
713 v3d
->prog
.gs
, gs_uniforms
);
715 struct v3d_gs_prog_data
*gs
= v3d
->prog
.gs
->prog_data
.gs
;
716 struct v3d_gs_prog_data
*gs_bin
= v3d
->prog
.gs_bin
->prog_data
.gs
;
718 v3d_emit_tes_gs_common_params(v3d
->job
,
720 gs
->num_invocations
);
722 /* Bin Tes/Gs params */
723 struct v3d_vs_prog_data
*vs_bin
= v3d
->prog
.cs
->prog_data
.vs
;
724 compute_vpm_config_gs(&v3d
->screen
->devinfo
,
725 vs_bin
, gs_bin
, &vpm_cfg_bin
);
727 v3d_emit_tes_gs_shader_params(v3d
->job
,
728 vpm_cfg_bin
.gs_width
,
732 /* Render Tes/Gs params */
733 struct v3d_vs_prog_data
*vs
= v3d
->prog
.vs
->prog_data
.vs
;
734 compute_vpm_config_gs(&v3d
->screen
->devinfo
,
737 v3d_emit_tes_gs_shader_params(v3d
->job
,
742 unreachable("No GS support pre-4.1");
746 cl_emit(&job
->indirect
, GL_SHADER_STATE_RECORD
, shader
) {
747 shader
.enable_clipping
= true;
748 /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
749 shader
.point_size_in_shaded_vertex_data
=
750 (info
->mode
== PIPE_PRIM_POINTS
&&
751 v3d
->rasterizer
->base
.point_size_per_vertex
);
753 /* Must be set if the shader modifies Z, discards, or modifies
754 * the sample mask. For any of these cases, the fragment
755 * shader needs to write the Z value (even just discards).
757 shader
.fragment_shader_does_z_writes
=
758 v3d
->prog
.fs
->prog_data
.fs
->writes_z
;
759 /* Set if the EZ test must be disabled (due to shader side
760 * effects and the early_z flag not being present in the
763 shader
.turn_off_early_z_test
=
764 v3d
->prog
.fs
->prog_data
.fs
->disable_ez
;
766 shader
.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2
=
767 v3d
->prog
.fs
->prog_data
.fs
->uses_center_w
;
769 #if V3D_VERSION >= 41
770 shader
.any_shader_reads_hardware_written_primitive_id
=
771 v3d
->prog
.gs
? v3d
->prog
.gs
->prog_data
.gs
->uses_pid
:
775 #if V3D_VERSION >= 40
776 shader
.do_scoreboard_wait_on_first_thread_switch
=
777 v3d
->prog
.fs
->prog_data
.fs
->lock_scoreboard_on_first_thrsw
;
778 shader
.disable_implicit_point_line_varyings
=
779 !v3d
->prog
.fs
->prog_data
.fs
->uses_implicit_point_line_varyings
;
782 shader
.number_of_varyings_in_fragment_shader
=
783 v3d
->prog
.fs
->prog_data
.fs
->num_inputs
;
785 shader
.coordinate_shader_propagate_nans
= true;
786 shader
.vertex_shader_propagate_nans
= true;
787 shader
.fragment_shader_propagate_nans
= true;
789 shader
.coordinate_shader_code_address
=
790 cl_address(v3d_resource(v3d
->prog
.cs
->resource
)->bo
,
791 v3d
->prog
.cs
->offset
);
792 shader
.vertex_shader_code_address
=
793 cl_address(v3d_resource(v3d
->prog
.vs
->resource
)->bo
,
794 v3d
->prog
.vs
->offset
);
795 shader
.fragment_shader_code_address
=
796 cl_address(v3d_resource(v3d
->prog
.fs
->resource
)->bo
,
797 v3d
->prog
.fs
->offset
);
799 /* XXX: Use combined input/output size flag in the common
802 shader
.coordinate_shader_has_separate_input_and_output_vpm_blocks
=
803 v3d
->prog
.cs
->prog_data
.vs
->separate_segments
;
804 shader
.vertex_shader_has_separate_input_and_output_vpm_blocks
=
805 v3d
->prog
.vs
->prog_data
.vs
->separate_segments
;
807 shader
.coordinate_shader_input_vpm_segment_size
=
808 v3d
->prog
.cs
->prog_data
.vs
->separate_segments
?
809 v3d
->prog
.cs
->prog_data
.vs
->vpm_input_size
: 1;
810 shader
.vertex_shader_input_vpm_segment_size
=
811 v3d
->prog
.vs
->prog_data
.vs
->separate_segments
?
812 v3d
->prog
.vs
->prog_data
.vs
->vpm_input_size
: 1;
814 shader
.coordinate_shader_output_vpm_segment_size
=
815 v3d
->prog
.cs
->prog_data
.vs
->vpm_output_size
;
816 shader
.vertex_shader_output_vpm_segment_size
=
817 v3d
->prog
.vs
->prog_data
.vs
->vpm_output_size
;
819 shader
.coordinate_shader_uniforms_address
= cs_uniforms
;
820 shader
.vertex_shader_uniforms_address
= vs_uniforms
;
821 shader
.fragment_shader_uniforms_address
= fs_uniforms
;
823 #if V3D_VERSION >= 41
824 shader
.min_coord_shader_input_segments_required_in_play
=
826 shader
.min_vertex_shader_input_segments_required_in_play
=
829 shader
.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size
=
831 shader
.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size
=
834 shader
.coordinate_shader_4_way_threadable
=
835 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 4;
836 shader
.vertex_shader_4_way_threadable
=
837 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 4;
838 shader
.fragment_shader_4_way_threadable
=
839 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 4;
841 shader
.coordinate_shader_start_in_final_thread_section
=
842 v3d
->prog
.cs
->prog_data
.vs
->base
.single_seg
;
843 shader
.vertex_shader_start_in_final_thread_section
=
844 v3d
->prog
.vs
->prog_data
.vs
->base
.single_seg
;
845 shader
.fragment_shader_start_in_final_thread_section
=
846 v3d
->prog
.fs
->prog_data
.fs
->base
.single_seg
;
848 shader
.coordinate_shader_4_way_threadable
=
849 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 4;
850 shader
.coordinate_shader_2_way_threadable
=
851 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 2;
852 shader
.vertex_shader_4_way_threadable
=
853 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 4;
854 shader
.vertex_shader_2_way_threadable
=
855 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 2;
856 shader
.fragment_shader_4_way_threadable
=
857 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 4;
858 shader
.fragment_shader_2_way_threadable
=
859 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 2;
862 shader
.vertex_id_read_by_coordinate_shader
=
863 v3d
->prog
.cs
->prog_data
.vs
->uses_vid
;
864 shader
.instance_id_read_by_coordinate_shader
=
865 v3d
->prog
.cs
->prog_data
.vs
->uses_iid
;
866 shader
.vertex_id_read_by_vertex_shader
=
867 v3d
->prog
.vs
->prog_data
.vs
->uses_vid
;
868 shader
.instance_id_read_by_vertex_shader
=
869 v3d
->prog
.vs
->prog_data
.vs
->uses_iid
;
871 shader
.address_of_default_attribute_values
=
872 cl_address(v3d_resource(vtx
->defaults
)->bo
,
873 vtx
->defaults_offset
);
876 bool cs_loaded_any
= false;
877 for (int i
= 0; i
< vtx
->num_elements
; i
++) {
878 struct pipe_vertex_element
*elem
= &vtx
->pipe
[i
];
879 struct pipe_vertex_buffer
*vb
=
880 &vertexbuf
->vb
[elem
->vertex_buffer_index
];
881 struct v3d_resource
*rsc
= v3d_resource(vb
->buffer
.resource
);
883 const uint32_t size
=
884 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD
);
885 cl_emit_with_prepacked(&job
->indirect
,
886 GL_SHADER_STATE_ATTRIBUTE_RECORD
,
887 &vtx
->attrs
[i
* size
], attr
) {
888 attr
.stride
= vb
->stride
;
889 attr
.address
= cl_address(rsc
->bo
,
892 attr
.number_of_values_read_by_coordinate_shader
=
893 v3d
->prog
.cs
->prog_data
.vs
->vattr_sizes
[i
];
894 attr
.number_of_values_read_by_vertex_shader
=
895 v3d
->prog
.vs
->prog_data
.vs
->vattr_sizes
[i
];
897 /* GFXH-930: At least one attribute must be enabled
898 * and read by CS and VS. If we have attributes being
899 * consumed by the VS but not the CS, then set up a
900 * dummy load of the last attribute into the CS's VPM
901 * inputs. (Since CS is just dead-code-elimination
902 * compared to VS, we can't have CS loading but not
905 if (v3d
->prog
.cs
->prog_data
.vs
->vattr_sizes
[i
])
906 cs_loaded_any
= true;
907 if (i
== vtx
->num_elements
- 1 && !cs_loaded_any
) {
908 attr
.number_of_values_read_by_coordinate_shader
= 1;
910 #if V3D_VERSION >= 41
911 attr
.maximum_index
= 0xffffff;
914 STATIC_ASSERT(sizeof(vtx
->attrs
) >= V3D_MAX_VS_INPUTS
/ 4 * size
);
917 if (vtx
->num_elements
== 0) {
918 /* GFXH-930: At least one attribute must be enabled and read
919 * by CS and VS. If we have no attributes being consumed by
920 * the shader, set up a dummy to be loaded into the VPM.
922 cl_emit(&job
->indirect
, GL_SHADER_STATE_ATTRIBUTE_RECORD
, attr
) {
923 /* Valid address of data whose value will be unused. */
924 attr
.address
= cl_address(job
->indirect
.bo
, 0);
926 attr
.type
= ATTRIBUTE_FLOAT
;
930 attr
.number_of_values_read_by_coordinate_shader
= 1;
931 attr
.number_of_values_read_by_vertex_shader
= 1;
935 cl_emit(&job
->bcl
, VCM_CACHE_SIZE
, vcm
) {
936 vcm
.number_of_16_vertex_batches_for_binning
= vpm_cfg_bin
.Vc
;
937 vcm
.number_of_16_vertex_batches_for_rendering
= vpm_cfg
.Vc
;
940 #if V3D_VERSION >= 41
942 cl_emit(&job
->bcl
, GL_SHADER_STATE_INCLUDING_GS
, state
) {
943 state
.address
= cl_address(job
->indirect
.bo
,
945 state
.number_of_attribute_arrays
= num_elements_to_emit
;
948 cl_emit(&job
->bcl
, GL_SHADER_STATE
, state
) {
949 state
.address
= cl_address(job
->indirect
.bo
,
951 state
.number_of_attribute_arrays
= num_elements_to_emit
;
955 assert(!v3d
->prog
.gs
);
956 cl_emit(&job
->bcl
, GL_SHADER_STATE
, state
) {
957 state
.address
= cl_address(job
->indirect
.bo
, shader_rec_offset
);
958 state
.number_of_attribute_arrays
= num_elements_to_emit
;
962 v3d_bo_unreference(&cs_uniforms
.bo
);
963 v3d_bo_unreference(&vs_uniforms
.bo
);
965 v3d_bo_unreference(&gs_uniforms
.bo
);
966 if (gs_bin_uniforms
.bo
)
967 v3d_bo_unreference(&gs_bin_uniforms
.bo
);
968 v3d_bo_unreference(&fs_uniforms
.bo
);
972 * Updates the number of primitives generated from the number of vertices
973 * to draw. This only works when no GS is present, since otherwise the number
974 * of primitives generated cannot be determined in advance and we need to
975 * use the PRIMITIVE_COUNTS_FEEDBACK command instead, however, that requires
976 * a sync wait for the draw to complete, so we only use that when GS is present.
979 v3d_update_primitives_generated_counter(struct v3d_context
*v3d
,
980 const struct pipe_draw_info
*info
)
982 assert(!v3d
->prog
.gs
);
984 if (!v3d
->active_queries
)
987 uint32_t prims
= u_prims_for_vertices(info
->mode
, info
->count
);
988 v3d
->prims_generated
+= prims
;
992 v3d_update_job_ez(struct v3d_context
*v3d
, struct v3d_job
*job
)
994 switch (v3d
->zsa
->ez_state
) {
995 case VC5_EZ_UNDECIDED
:
996 /* If the Z/S state didn't pick a direction but didn't
997 * disable, then go along with the current EZ state. This
998 * allows EZ optimization for Z func == EQUAL or NEVER.
1004 /* If the Z/S state picked a direction, then it needs to match
1005 * the current direction if we've decided on one.
1007 if (job
->ez_state
== VC5_EZ_UNDECIDED
)
1008 job
->ez_state
= v3d
->zsa
->ez_state
;
1009 else if (job
->ez_state
!= v3d
->zsa
->ez_state
)
1010 job
->ez_state
= VC5_EZ_DISABLED
;
1013 case VC5_EZ_DISABLED
:
1014 /* If the current Z/S state disables EZ because of a bad Z
1015 * func or stencil operation, then we can't do any more EZ in
1018 job
->ez_state
= VC5_EZ_DISABLED
;
1022 /* If the FS affects the Z of the pixels, then it may update against
1023 * the chosen EZ direction (though we could use
1024 * ARB_conservative_depth's hints to avoid this)
1026 if (v3d
->prog
.fs
->prog_data
.fs
->writes_z
) {
1027 job
->ez_state
= VC5_EZ_DISABLED
;
1030 if (job
->first_ez_state
== VC5_EZ_UNDECIDED
&&
1031 (job
->ez_state
!= VC5_EZ_DISABLED
|| job
->draw_calls_queued
== 0))
1032 job
->first_ez_state
= job
->ez_state
;
1036 v3d_hw_prim_type(enum pipe_prim_type prim_type
)
1038 switch (prim_type
) {
1039 case PIPE_PRIM_POINTS
:
1040 case PIPE_PRIM_LINES
:
1041 case PIPE_PRIM_LINE_LOOP
:
1042 case PIPE_PRIM_LINE_STRIP
:
1043 case PIPE_PRIM_TRIANGLES
:
1044 case PIPE_PRIM_TRIANGLE_STRIP
:
1045 case PIPE_PRIM_TRIANGLE_FAN
:
1048 case PIPE_PRIM_LINES_ADJACENCY
:
1049 case PIPE_PRIM_LINE_STRIP_ADJACENCY
:
1050 case PIPE_PRIM_TRIANGLES_ADJACENCY
:
1051 case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY
:
1052 return 8 + (prim_type
- PIPE_PRIM_LINES_ADJACENCY
);
1055 unreachable("Unsupported primitive type");
1060 v3d_check_compiled_shaders(struct v3d_context
*v3d
)
1062 static bool warned
[5] = { 0 };
1064 uint32_t failed_stage
= MESA_SHADER_NONE
;
1065 if (!v3d
->prog
.vs
->resource
|| !v3d
->prog
.cs
->resource
) {
1066 failed_stage
= MESA_SHADER_VERTEX
;
1067 } else if ((v3d
->prog
.gs_bin
&& !v3d
->prog
.gs_bin
->resource
) ||
1068 (v3d
->prog
.gs
&& !v3d
->prog
.gs
->resource
)) {
1069 failed_stage
= MESA_SHADER_GEOMETRY
;
1070 } else if (v3d
->prog
.fs
&& !v3d
->prog
.fs
->resource
) {
1071 failed_stage
= MESA_SHADER_FRAGMENT
;
1074 if (likely(failed_stage
== MESA_SHADER_NONE
))
1077 if (!warned
[failed_stage
]) {
1079 "%s shader failed to compile. Expect corruption.\n",
1080 _mesa_shader_stage_to_string(failed_stage
));
1081 warned
[failed_stage
] = true;
1087 v3d_draw_vbo(struct pipe_context
*pctx
, const struct pipe_draw_info
*info
)
1089 struct v3d_context
*v3d
= v3d_context(pctx
);
1091 if (!info
->count_from_stream_output
&& !info
->indirect
&&
1092 !info
->primitive_restart
&&
1093 !u_trim_pipe_prim(info
->mode
, (unsigned*)&info
->count
))
1096 /* Fall back for weird desktop GL primitive restart values. */
1097 if (info
->primitive_restart
&&
1101 switch (info
->index_size
) {
1110 if (info
->restart_index
!= mask
) {
1111 util_draw_vbo_without_prim_restart(pctx
, info
);
1116 if (info
->mode
>= PIPE_PRIM_QUADS
&& info
->mode
<= PIPE_PRIM_POLYGON
) {
1117 util_primconvert_save_rasterizer_state(v3d
->primconvert
, &v3d
->rasterizer
->base
);
1118 util_primconvert_draw_vbo(v3d
->primconvert
, info
);
1119 perf_debug("Fallback conversion for %d %s vertices\n",
1120 info
->count
, u_prim_name(info
->mode
));
1124 /* Before setting up the draw, flush anything writing to the resources
1125 * that we read from or reading from resources we write to.
1127 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++)
1128 v3d_predraw_check_stage_inputs(pctx
, s
);
1130 if (info
->indirect
) {
1131 v3d_flush_jobs_writing_resource(v3d
, info
->indirect
->buffer
,
1132 V3D_FLUSH_DEFAULT
, false);
1135 v3d_predraw_check_outputs(pctx
);
1137 /* If transform feedback is active and we are switching primitive type
1138 * we need to submit the job before drawing and update the vertex count
1139 * written to TF based on the primitive type since we will need to
1140 * know the exact vertex count if the application decides to call
1141 * glDrawTransformFeedback() later.
1143 if (v3d
->streamout
.num_targets
> 0 &&
1144 u_base_prim_type(info
->mode
) != u_base_prim_type(v3d
->prim_mode
)) {
1145 v3d_update_primitive_counters(v3d
);
1148 struct v3d_job
*job
= v3d_get_job_for_fbo(v3d
);
1150 /* If vertex texturing depends on the output of rendering, we need to
1151 * ensure that that rendering is complete before we run a coordinate
1152 * shader that depends on it.
1154 * Given that doing that is unusual, for now we just block the binner
1155 * on the last submitted render, rather than tracking the last
1156 * rendering to each texture's BO.
1158 if (v3d
->tex
[PIPE_SHADER_VERTEX
].num_textures
|| info
->indirect
) {
1159 perf_debug("Blocking binner on last render "
1160 "due to vertex texturing or indirect drawing.\n");
1161 job
->submit
.in_sync_bcl
= v3d
->out_sync
;
1164 /* We also need to ensure that compute is complete when render depends
1165 * on resources written by it.
1167 if (v3d
->sync_on_last_compute_job
) {
1168 job
->submit
.in_sync_bcl
= v3d
->out_sync
;
1169 v3d
->sync_on_last_compute_job
= false;
1172 /* Mark SSBOs and images as being written. We don't actually know
1173 * which ones are read vs written, so just assume the worst.
1175 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++) {
1176 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
1177 v3d_job_add_write_resource(job
,
1178 v3d
->ssbo
[s
].sb
[i
].buffer
);
1179 job
->tmu_dirty_rcl
= true;
1182 foreach_bit(i
, v3d
->shaderimg
[s
].enabled_mask
) {
1183 v3d_job_add_write_resource(job
,
1184 v3d
->shaderimg
[s
].si
[i
].base
.resource
);
1185 job
->tmu_dirty_rcl
= true;
1189 /* Get space to emit our draw call into the BCL, using a branch to
1190 * jump to a new BO if necessary.
1192 v3d_cl_ensure_space_with_branch(&job
->bcl
, 256 /* XXX */);
1194 if (v3d
->prim_mode
!= info
->mode
) {
1195 v3d
->prim_mode
= info
->mode
;
1196 v3d
->dirty
|= VC5_DIRTY_PRIM_MODE
;
1199 v3d_start_draw(v3d
);
1200 v3d_update_compiled_shaders(v3d
, info
->mode
);
1201 if (!v3d_check_compiled_shaders(v3d
))
1203 v3d_update_job_ez(v3d
, job
);
1205 /* If this job was writing to transform feedback buffers before this
1206 * draw and we are reading from them here, then we need to wait for TF
1207 * to complete before we emit this draw.
1209 * Notice this check needs to happen before we emit state for the
1210 * current draw call, where we update job->tf_enabled, so we can ensure
1211 * that we only check TF writes for prior draws.
1213 v3d_emit_wait_for_tf_if_needed(v3d
, job
);
1215 #if V3D_VERSION >= 41
1216 v3d41_emit_state(pctx
);
1218 v3d33_emit_state(pctx
);
1221 if (v3d
->dirty
& (VC5_DIRTY_VTXBUF
|
1222 VC5_DIRTY_VTXSTATE
|
1223 VC5_DIRTY_PRIM_MODE
|
1224 VC5_DIRTY_RASTERIZER
|
1225 VC5_DIRTY_COMPILED_CS
|
1226 VC5_DIRTY_COMPILED_VS
|
1227 VC5_DIRTY_COMPILED_GS_BIN
|
1228 VC5_DIRTY_COMPILED_GS
|
1229 VC5_DIRTY_COMPILED_FS
|
1230 v3d
->prog
.cs
->uniform_dirty_bits
|
1231 v3d
->prog
.vs
->uniform_dirty_bits
|
1233 v3d
->prog
.gs_bin
->uniform_dirty_bits
: 0) |
1235 v3d
->prog
.gs
->uniform_dirty_bits
: 0) |
1236 v3d
->prog
.fs
->uniform_dirty_bits
)) {
1237 v3d_emit_gl_shader_state(v3d
, info
);
1242 /* The Base Vertex/Base Instance packet sets those values to nonzero
1243 * for the next draw call only.
1245 if (info
->index_bias
|| info
->start_instance
) {
1246 cl_emit(&job
->bcl
, BASE_VERTEX_BASE_INSTANCE
, base
) {
1247 base
.base_instance
= info
->start_instance
;
1248 base
.base_vertex
= info
->index_bias
;
1252 uint32_t prim_tf_enable
= 0;
1253 #if V3D_VERSION < 40
1254 /* V3D 3.x: The HW only processes transform feedback on primitives
1255 * with the flag set.
1257 if (v3d
->streamout
.num_targets
)
1258 prim_tf_enable
= (V3D_PRIM_POINTS_TF
- V3D_PRIM_POINTS
);
1262 v3d_update_primitives_generated_counter(v3d
, info
);
1264 uint32_t hw_prim_type
= v3d_hw_prim_type(info
->mode
);
1265 if (info
->index_size
) {
1266 uint32_t index_size
= info
->index_size
;
1267 uint32_t offset
= info
->start
* index_size
;
1268 struct pipe_resource
*prsc
;
1269 if (info
->has_user_indices
) {
1271 u_upload_data(v3d
->uploader
, 0,
1272 info
->count
* info
->index_size
, 4,
1276 prsc
= info
->index
.resource
;
1278 struct v3d_resource
*rsc
= v3d_resource(prsc
);
1280 #if V3D_VERSION >= 40
1281 cl_emit(&job
->bcl
, INDEX_BUFFER_SETUP
, ib
) {
1282 ib
.address
= cl_address(rsc
->bo
, 0);
1283 ib
.size
= rsc
->bo
->size
;
1287 if (info
->indirect
) {
1288 cl_emit(&job
->bcl
, INDIRECT_INDEXED_INSTANCED_PRIM_LIST
, prim
) {
1289 prim
.index_type
= ffs(info
->index_size
) - 1;
1290 #if V3D_VERSION < 40
1291 prim
.address_of_indices_list
=
1292 cl_address(rsc
->bo
, offset
);
1293 #endif /* V3D_VERSION < 40 */
1294 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1295 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1297 prim
.number_of_draw_indirect_indexed_records
= info
->indirect
->draw_count
;
1299 prim
.stride_in_multiples_of_4_bytes
= info
->indirect
->stride
>> 2;
1300 prim
.address
= cl_address(v3d_resource(info
->indirect
->buffer
)->bo
,
1301 info
->indirect
->offset
);
1303 } else if (info
->instance_count
> 1) {
1304 cl_emit(&job
->bcl
, INDEXED_INSTANCED_PRIM_LIST
, prim
) {
1305 prim
.index_type
= ffs(info
->index_size
) - 1;
1306 #if V3D_VERSION >= 40
1307 prim
.index_offset
= offset
;
1308 #else /* V3D_VERSION < 40 */
1309 prim
.maximum_index
= (1u << 31) - 1; /* XXX */
1310 prim
.address_of_indices_list
=
1311 cl_address(rsc
->bo
, offset
);
1312 #endif /* V3D_VERSION < 40 */
1313 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1314 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1316 prim
.number_of_instances
= info
->instance_count
;
1317 prim
.instance_length
= info
->count
;
1320 cl_emit(&job
->bcl
, INDEXED_PRIM_LIST
, prim
) {
1321 prim
.index_type
= ffs(info
->index_size
) - 1;
1322 prim
.length
= info
->count
;
1323 #if V3D_VERSION >= 40
1324 prim
.index_offset
= offset
;
1325 #else /* V3D_VERSION < 40 */
1326 prim
.maximum_index
= (1u << 31) - 1; /* XXX */
1327 prim
.address_of_indices_list
=
1328 cl_address(rsc
->bo
, offset
);
1329 #endif /* V3D_VERSION < 40 */
1330 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1331 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1335 if (info
->has_user_indices
)
1336 pipe_resource_reference(&prsc
, NULL
);
1338 if (info
->indirect
) {
1339 cl_emit(&job
->bcl
, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS
, prim
) {
1340 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1341 prim
.number_of_draw_indirect_array_records
= info
->indirect
->draw_count
;
1343 prim
.stride_in_multiples_of_4_bytes
= info
->indirect
->stride
>> 2;
1344 prim
.address
= cl_address(v3d_resource(info
->indirect
->buffer
)->bo
,
1345 info
->indirect
->offset
);
1347 } else if (info
->instance_count
> 1) {
1348 struct pipe_stream_output_target
*so
=
1349 info
->count_from_stream_output
;
1350 uint32_t vert_count
= so
?
1351 v3d_stream_output_target_get_vertex_count(so
) :
1353 cl_emit(&job
->bcl
, VERTEX_ARRAY_INSTANCED_PRIMS
, prim
) {
1354 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1355 prim
.index_of_first_vertex
= info
->start
;
1356 prim
.number_of_instances
= info
->instance_count
;
1357 prim
.instance_length
= vert_count
;
1360 struct pipe_stream_output_target
*so
=
1361 info
->count_from_stream_output
;
1362 uint32_t vert_count
= so
?
1363 v3d_stream_output_target_get_vertex_count(so
) :
1365 cl_emit(&job
->bcl
, VERTEX_ARRAY_PRIMS
, prim
) {
1366 prim
.mode
= hw_prim_type
| prim_tf_enable
;
1367 prim
.length
= vert_count
;
1368 prim
.index_of_first_vertex
= info
->start
;
1373 /* A flush is required in between a TF draw and any following TF specs
1374 * packet, or the GPU may hang. Just flush each time for now.
1376 if (v3d
->streamout
.num_targets
)
1377 cl_emit(&job
->bcl
, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT
, flush
);
1379 job
->draw_calls_queued
++;
1380 if (v3d
->streamout
.num_targets
)
1381 job
->tf_draw_calls_queued
++;
1383 /* Increment the TF offsets by how many verts we wrote. XXX: This
1384 * needs some clamping to the buffer size.
1386 for (int i
= 0; i
< v3d
->streamout
.num_targets
; i
++)
1387 v3d
->streamout
.offsets
[i
] += info
->count
;
1389 if (v3d
->zsa
&& job
->zsbuf
&& v3d
->zsa
->base
.depth
.enabled
) {
1390 struct v3d_resource
*rsc
= v3d_resource(job
->zsbuf
->texture
);
1391 v3d_job_add_bo(job
, rsc
->bo
);
1393 job
->load
|= PIPE_CLEAR_DEPTH
& ~job
->clear
;
1394 if (v3d
->zsa
->base
.depth
.writemask
)
1395 job
->store
|= PIPE_CLEAR_DEPTH
;
1396 rsc
->initialized_buffers
= PIPE_CLEAR_DEPTH
;
1399 if (v3d
->zsa
&& job
->zsbuf
&& v3d
->zsa
->base
.stencil
[0].enabled
) {
1400 struct v3d_resource
*rsc
= v3d_resource(job
->zsbuf
->texture
);
1401 if (rsc
->separate_stencil
)
1402 rsc
= rsc
->separate_stencil
;
1404 v3d_job_add_bo(job
, rsc
->bo
);
1406 job
->load
|= PIPE_CLEAR_STENCIL
& ~job
->clear
;
1407 if (v3d
->zsa
->base
.stencil
[0].writemask
||
1408 v3d
->zsa
->base
.stencil
[1].writemask
) {
1409 job
->store
|= PIPE_CLEAR_STENCIL
;
1411 rsc
->initialized_buffers
|= PIPE_CLEAR_STENCIL
;
1414 for (int i
= 0; i
< V3D_MAX_DRAW_BUFFERS
; i
++) {
1415 uint32_t bit
= PIPE_CLEAR_COLOR0
<< i
;
1416 int blend_rt
= v3d
->blend
->base
.independent_blend_enable
? i
: 0;
1418 if (job
->store
& bit
|| !job
->cbufs
[i
])
1420 struct v3d_resource
*rsc
= v3d_resource(job
->cbufs
[i
]->texture
);
1422 job
->load
|= bit
& ~job
->clear
;
1423 if (v3d
->blend
->base
.rt
[blend_rt
].colormask
)
1425 v3d_job_add_bo(job
, rsc
->bo
);
1428 if (job
->referenced_size
> 768 * 1024 * 1024) {
1429 perf_debug("Flushing job with %dkb to try to free up memory\n",
1430 job
->referenced_size
/ 1024);
1434 if (V3D_DEBUG
& V3D_DEBUG_ALWAYS_FLUSH
)
1438 #if V3D_VERSION >= 41
1439 #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
1440 #define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
1441 /* Allow this dispatch to start while the last one is still running. */
1442 #define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
1443 /* Maximum supergroup ID. 6 bits. */
1444 #define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
1445 /* Batches per supergroup minus 1. 8 bits. */
1446 #define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
1447 /* Workgroups per supergroup, 0 means 16 */
1448 #define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
1449 #define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
1451 #define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
1452 #define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
1453 #define V3D_CSD_CFG5_THREADING (1 << 0)
1456 v3d_launch_grid(struct pipe_context
*pctx
, const struct pipe_grid_info
*info
)
1458 struct v3d_context
*v3d
= v3d_context(pctx
);
1459 struct v3d_screen
*screen
= v3d
->screen
;
1461 v3d_predraw_check_stage_inputs(pctx
, PIPE_SHADER_COMPUTE
);
1463 v3d_update_compiled_cs(v3d
);
1465 if (!v3d
->prog
.compute
->resource
) {
1466 static bool warned
= false;
1469 "Compute shader failed to compile. "
1470 "Expect corruption.\n");
1476 /* Some of the units of scale:
1478 * - Batches of 16 work items (shader invocations) that will be queued
1479 * to the run on a QPU at once.
1481 * - Workgroups composed of work items based on the shader's layout
1484 * - Supergroups of 1-16 workgroups. There can only be 16 supergroups
1485 * running at a time on the core, so we want to keep them large to
1486 * keep the QPUs busy, but a whole supergroup will sync at a barrier
1487 * so we want to keep them small if one is present.
1489 struct drm_v3d_submit_csd submit
= { 0 };
1490 struct v3d_job
*job
= v3d_job_create(v3d
);
1492 /* Set up the actual number of workgroups, synchronously mapping the
1493 * indirect buffer if necessary to get the dimensions.
1495 if (info
->indirect
) {
1496 struct pipe_transfer
*transfer
;
1497 uint32_t *map
= pipe_buffer_map_range(pctx
, info
->indirect
,
1498 info
->indirect_offset
,
1499 3 * sizeof(uint32_t),
1502 memcpy(v3d
->compute_num_workgroups
, map
, 3 * sizeof(uint32_t));
1503 pipe_buffer_unmap(pctx
, transfer
);
1505 if (v3d
->compute_num_workgroups
[0] == 0 ||
1506 v3d
->compute_num_workgroups
[1] == 0 ||
1507 v3d
->compute_num_workgroups
[2] == 0) {
1508 /* Nothing to dispatch, so skip the draw (CSD can't
1509 * handle 0 workgroups).
1514 v3d
->compute_num_workgroups
[0] = info
->grid
[0];
1515 v3d
->compute_num_workgroups
[1] = info
->grid
[1];
1516 v3d
->compute_num_workgroups
[2] = info
->grid
[2];
1519 for (int i
= 0; i
< 3; i
++) {
1520 submit
.cfg
[i
] |= (v3d
->compute_num_workgroups
[i
] <<
1521 V3D_CSD_CFG012_WG_COUNT_SHIFT
);
1524 perf_debug("CSD only using single WG per SG currently, "
1525 "should increase that when possible.");
1527 int wg_size
= info
->block
[0] * info
->block
[1] * info
->block
[2];
1528 submit
.cfg
[3] |= wgs_per_sg
<< V3D_CSD_CFG3_WGS_PER_SG_SHIFT
;
1529 submit
.cfg
[3] |= ((DIV_ROUND_UP(wgs_per_sg
* wg_size
, 16) - 1) <<
1530 V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT
);
1531 submit
.cfg
[3] |= (wg_size
& 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT
;
1533 int batches_per_wg
= DIV_ROUND_UP(wg_size
, 16);
1534 /* Number of batches the dispatch will invoke (minus 1). */
1535 submit
.cfg
[4] = batches_per_wg
* (v3d
->compute_num_workgroups
[0] *
1536 v3d
->compute_num_workgroups
[1] *
1537 v3d
->compute_num_workgroups
[2]) - 1;
1539 /* Make sure we didn't accidentally underflow. */
1540 assert(submit
.cfg
[4] != ~0);
1542 v3d_job_add_bo(job
, v3d_resource(v3d
->prog
.compute
->resource
)->bo
);
1543 submit
.cfg
[5] = (v3d_resource(v3d
->prog
.compute
->resource
)->bo
->offset
+
1544 v3d
->prog
.compute
->offset
);
1545 submit
.cfg
[5] |= V3D_CSD_CFG5_PROPAGATE_NANS
;
1546 if (v3d
->prog
.compute
->prog_data
.base
->single_seg
)
1547 submit
.cfg
[5] |= V3D_CSD_CFG5_SINGLE_SEG
;
1548 if (v3d
->prog
.compute
->prog_data
.base
->threads
== 4)
1549 submit
.cfg
[5] |= V3D_CSD_CFG5_THREADING
;
1551 if (v3d
->prog
.compute
->prog_data
.compute
->shared_size
) {
1552 v3d
->compute_shared_memory
=
1553 v3d_bo_alloc(v3d
->screen
,
1554 v3d
->prog
.compute
->prog_data
.compute
->shared_size
*
1559 struct v3d_cl_reloc uniforms
= v3d_write_uniforms(v3d
, job
,
1561 PIPE_SHADER_COMPUTE
);
1562 v3d_job_add_bo(job
, uniforms
.bo
);
1563 submit
.cfg
[6] = uniforms
.bo
->offset
+ uniforms
.offset
;
1565 /* Pull some job state that was stored in a SUBMIT_CL struct out to
1566 * our SUBMIT_CSD struct
1568 submit
.bo_handles
= job
->submit
.bo_handles
;
1569 submit
.bo_handle_count
= job
->submit
.bo_handle_count
;
1571 /* Serialize this in the rest of our command stream. */
1572 submit
.in_sync
= v3d
->out_sync
;
1573 submit
.out_sync
= v3d
->out_sync
;
1575 if (!(V3D_DEBUG
& V3D_DEBUG_NORAST
)) {
1576 int ret
= v3d_ioctl(screen
->fd
, DRM_IOCTL_V3D_SUBMIT_CSD
,
1578 static bool warned
= false;
1579 if (ret
&& !warned
) {
1580 fprintf(stderr
, "CSD submit call returned %s. "
1581 "Expect corruption.\n", strerror(errno
));
1586 v3d_job_free(v3d
, job
);
1588 /* Mark SSBOs as being written.. we don't actually know which ones are
1589 * read vs written, so just assume the worst
1591 foreach_bit(i
, v3d
->ssbo
[PIPE_SHADER_COMPUTE
].enabled_mask
) {
1592 struct v3d_resource
*rsc
= v3d_resource(
1593 v3d
->ssbo
[PIPE_SHADER_COMPUTE
].sb
[i
].buffer
);
1595 rsc
->compute_written
= true;
1598 foreach_bit(i
, v3d
->shaderimg
[PIPE_SHADER_COMPUTE
].enabled_mask
) {
1599 struct v3d_resource
*rsc
= v3d_resource(
1600 v3d
->shaderimg
[PIPE_SHADER_COMPUTE
].si
[i
].base
.resource
);
1602 rsc
->compute_written
= true;
1605 v3d_bo_unreference(&uniforms
.bo
);
1606 v3d_bo_unreference(&v3d
->compute_shared_memory
);
1611 * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles.
1614 v3d_draw_clear(struct v3d_context
*v3d
,
1616 const union pipe_color_union
*color
,
1617 double depth
, unsigned stencil
)
1619 static const union pipe_color_union dummy_color
= {};
1621 /* The blitter util dereferences the color regardless, even though the
1622 * gallium clear API may not pass one in when only Z/S are cleared.
1625 color
= &dummy_color
;
1627 v3d_blitter_save(v3d
);
1628 util_blitter_clear(v3d
->blitter
,
1629 v3d
->framebuffer
.width
,
1630 v3d
->framebuffer
.height
,
1631 util_framebuffer_get_num_layers(&v3d
->framebuffer
),
1632 buffers
, color
, depth
, stencil
,
1633 util_framebuffer_get_num_samples(&v3d
->framebuffer
) > 1);
1637 * Attempts to perform the GL clear by using the TLB's fast clear at the start
1641 v3d_tlb_clear(struct v3d_job
*job
, unsigned buffers
,
1642 const union pipe_color_union
*color
,
1643 double depth
, unsigned stencil
)
1645 struct v3d_context
*v3d
= job
->v3d
;
1647 if (job
->draw_calls_queued
) {
1648 /* If anything in the CL has drawn using the buffer, then the
1649 * TLB clear we're trying to add now would happen before that
1652 buffers
&= ~(job
->load
| job
->store
);
1655 /* GFXH-1461: If we were to emit a load of just depth or just stencil,
1656 * then the clear for the other may get lost. We need to decide now
1657 * if it would be possible to need to emit a load of just one after
1658 * we've set up our TLB clears.
1660 if (buffers
& PIPE_CLEAR_DEPTHSTENCIL
&&
1661 (buffers
& PIPE_CLEAR_DEPTHSTENCIL
) != PIPE_CLEAR_DEPTHSTENCIL
&&
1663 util_format_is_depth_and_stencil(job
->zsbuf
->texture
->format
)) {
1664 buffers
&= ~PIPE_CLEAR_DEPTHSTENCIL
;
1667 for (int i
= 0; i
< V3D_MAX_DRAW_BUFFERS
; i
++) {
1668 uint32_t bit
= PIPE_CLEAR_COLOR0
<< i
;
1669 if (!(buffers
& bit
))
1672 struct pipe_surface
*psurf
= v3d
->framebuffer
.cbufs
[i
];
1673 struct v3d_surface
*surf
= v3d_surface(psurf
);
1674 struct v3d_resource
*rsc
= v3d_resource(psurf
->texture
);
1676 union util_color uc
;
1677 uint32_t internal_size
= 4 << surf
->internal_bpp
;
1679 static union pipe_color_union swapped_color
;
1680 if (v3d
->swap_color_rb
& (1 << i
)) {
1681 swapped_color
.f
[0] = color
->f
[2];
1682 swapped_color
.f
[1] = color
->f
[1];
1683 swapped_color
.f
[2] = color
->f
[0];
1684 swapped_color
.f
[3] = color
->f
[3];
1685 color
= &swapped_color
;
1688 switch (surf
->internal_type
) {
1689 case V3D_INTERNAL_TYPE_8
:
1690 util_pack_color(color
->f
, PIPE_FORMAT_R8G8B8A8_UNORM
,
1692 memcpy(job
->clear_color
[i
], uc
.ui
, internal_size
);
1694 case V3D_INTERNAL_TYPE_8I
:
1695 case V3D_INTERNAL_TYPE_8UI
:
1696 job
->clear_color
[i
][0] = ((color
->ui
[0] & 0xff) |
1697 (color
->ui
[1] & 0xff) << 8 |
1698 (color
->ui
[2] & 0xff) << 16 |
1699 (color
->ui
[3] & 0xff) << 24);
1701 case V3D_INTERNAL_TYPE_16F
:
1702 util_pack_color(color
->f
, PIPE_FORMAT_R16G16B16A16_FLOAT
,
1704 memcpy(job
->clear_color
[i
], uc
.ui
, internal_size
);
1706 case V3D_INTERNAL_TYPE_16I
:
1707 case V3D_INTERNAL_TYPE_16UI
:
1708 job
->clear_color
[i
][0] = ((color
->ui
[0] & 0xffff) |
1709 color
->ui
[1] << 16);
1710 job
->clear_color
[i
][1] = ((color
->ui
[2] & 0xffff) |
1711 color
->ui
[3] << 16);
1713 case V3D_INTERNAL_TYPE_32F
:
1714 case V3D_INTERNAL_TYPE_32I
:
1715 case V3D_INTERNAL_TYPE_32UI
:
1716 memcpy(job
->clear_color
[i
], color
->ui
, internal_size
);
1720 rsc
->initialized_buffers
|= bit
;
1723 unsigned zsclear
= buffers
& PIPE_CLEAR_DEPTHSTENCIL
;
1725 struct v3d_resource
*rsc
=
1726 v3d_resource(v3d
->framebuffer
.zsbuf
->texture
);
1728 if (zsclear
& PIPE_CLEAR_DEPTH
)
1729 job
->clear_z
= depth
;
1730 if (zsclear
& PIPE_CLEAR_STENCIL
)
1731 job
->clear_s
= stencil
;
1733 rsc
->initialized_buffers
|= zsclear
;
1736 job
->draw_min_x
= 0;
1737 job
->draw_min_y
= 0;
1738 job
->draw_max_x
= v3d
->framebuffer
.width
;
1739 job
->draw_max_y
= v3d
->framebuffer
.height
;
1740 job
->clear
|= buffers
;
1741 job
->store
|= buffers
;
1743 v3d_start_draw(v3d
);
/* Implements pipe_context::clear: tries the fast TLB clear first, then
 * falls back to a draw-based clear for any buffers it couldn't handle.
 */
static void
v3d_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor_state *scissor_state,
          const union pipe_color_union *color, double depth, unsigned stencil)
{
        struct v3d_context *v3d = v3d_context(pctx);
        struct v3d_job *job = v3d_get_job_for_fbo(v3d);

        buffers &= ~v3d_tlb_clear(job, buffers, color, depth, stencil);

        if (buffers)
                v3d_draw_clear(v3d, buffers, color, depth, stencil);
}
/* pipe_context::clear_render_target hook — not implemented for v3d;
 * only logs a warning.
 */
static void
v3d_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear RT\n");
}
/* pipe_context::clear_depth_stencil hook — not implemented for v3d;
 * only logs a warning.
 */
static void
v3d_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear DS\n");
}
1780 v3dX(draw_init
)(struct pipe_context
*pctx
)
1782 pctx
->draw_vbo
= v3d_draw_vbo
;
1783 pctx
->clear
= v3d_clear
;
1784 pctx
->clear_render_target
= v3d_clear_render_target
;
1785 pctx
->clear_depth_stencil
= v3d_clear_depth_stencil
;
1786 #if V3D_VERSION >= 41
1787 if (v3d_context(pctx
)->screen
->has_csd
)
1788 pctx
->launch_grid
= v3d_launch_grid
;