2 * Copyright © 2014-2017 Broadcom
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 #include "util/u_blitter.h"
25 #include "util/u_prim.h"
26 #include "util/format/u_format.h"
27 #include "util/u_pack_color.h"
28 #include "util/u_prim_restart.h"
29 #include "util/u_upload_mgr.h"
30 #include "indices/u_primconvert.h"
32 #include "v3d_context.h"
33 #include "v3d_resource.h"
35 #include "broadcom/compiler/v3d_compiler.h"
36 #include "broadcom/common/v3d_macros.h"
37 #include "broadcom/cle/v3dx_pack.h"
40 * Does the initial binning command list setup for drawing to a given FBO.
/* NOTE(review): this file appears to be a lossy extraction of a Mesa v3d
 * Gallium driver draw file: statements are hard-wrapped across lines, a
 * stale line-number gutter is embedded in the text, and several original
 * lines (the function's storage-class/return-type line, braces, and some
 * call arguments) are missing.  Only comments are added here; the
 * surviving text is left byte-identical.
 *
 * Sets up the binning command list (BCL) for drawing to the current FBO:
 * reserves BCL space, allocates the tile-alloc and tile-state BOs, emits
 * the tile-binning mode configuration, flushes the VCD cache, clears any
 * leftover occlusion-query state, and issues START_TILE_BINNING.
 */
43 v3d_start_draw(struct v3d_context
*v3d
)
45 struct v3d_job
*job
= v3d
->job
;
/* Reserve BCL space up front; a branch jumps to a new BO if needed. */
50 /* Get space to emit our BCL state, using a branch to jump to a new BO
53 v3d_cl_ensure_space_with_branch(&job
->bcl
, 256 /* XXX */);
55 job
->submit
.bcl_start
= job
->bcl
.bo
->offset
;
56 v3d_job_add_bo(job
, job
->bcl
.bo
);
/* Size the tile-alloc BO: 64 bytes per tile, rounded to 4k chunks,
 * plus headroom so the PTB's first allocations don't hit OOM.
 */
58 /* The PTB will request the tile alloc initial size per tile at start
61 uint32_t tile_alloc_size
= (job
->draw_tiles_x
*
62 job
->draw_tiles_y
) * 64;
63 /* The PTB allocates in aligned 4k chunks after the initial setup. */
64 tile_alloc_size
= align(tile_alloc_size
, 4096);
66 /* Include the first two chunk allocations that the PTB does so that
67 * we definitely clear the OOM condition before triggering one (the HW
68 * won't trigger OOM during the first allocations).
70 tile_alloc_size
+= 8192;
72 /* For performance, allocate some extra initial memory after the PTB's
73 * minimal allocations, so that we hopefully don't have to block the
74 * GPU on the kernel handling an OOM signal.
76 tile_alloc_size
+= 512 * 1024;
/* NOTE(review): the trailing argument(s) of this v3d_bo_alloc call
 * (presumably the BO debug name) are missing from this extraction.
 */
78 job
->tile_alloc
= v3d_bo_alloc(v3d
->screen
, tile_alloc_size
,
/* Tile state data array: per-tile size differs by HW version. */
80 uint32_t tsda_per_tile_size
= v3d
->screen
->devinfo
.ver
>= 40 ? 256 : 64;
/* NOTE(review): the size argument(s) of this allocation are missing
 * from this extraction.
 */
81 job
->tile_state
= v3d_bo_alloc(v3d
->screen
,
/* V3D >= 41 path: single TILE_BINNING_MODE_CFG packet. */
88 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG
, config
) {
89 config
.width_in_pixels
= v3d
->framebuffer
.width
;
90 config
.height_in_pixels
= v3d
->framebuffer
.height
;
91 config
.number_of_render_targets
=
92 MAX2(v3d
->framebuffer
.nr_cbufs
, 1);
94 config
.multisample_mode_4x
= job
->msaa
;
96 config
.maximum_bpp_of_all_render_targets
= job
->internal_bpp
;
98 #else /* V3D_VERSION < 40 */
/* Pre-4.x path: PART2 must be emitted before PART1, since PART1
 * signals the end of the binning config setup.
 */
99 /* "Binning mode lists start with a Tile Binning Mode Configuration
102 * Part1 signals the end of binning config setup.
104 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG_PART2
, config
) {
105 config
.tile_allocation_memory_address
=
106 cl_address(job
->tile_alloc
, 0);
107 config
.tile_allocation_memory_size
= job
->tile_alloc
->size
;
110 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CFG_PART1
, config
) {
111 config
.tile_state_data_array_base_address
=
112 cl_address(job
->tile_state
, 0);
114 config
.width_in_tiles
= job
->draw_tiles_x
;
115 config
.height_in_tiles
= job
->draw_tiles_y
;
117 config
.number_of_render_targets
=
118 MAX2(v3d
->framebuffer
.nr_cbufs
, 1);
120 config
.multisample_mode_4x
= job
->msaa
;
122 config
.maximum_bpp_of_all_render_targets
= job
->internal_bpp
;
124 #endif /* V3D_VERSION < 40 */
126 /* There's definitely nothing in the VCD cache we want. */
127 cl_emit(&job
->bcl
, FLUSH_VCD_CACHE
, bin
);
129 /* Disable any leftover OQ state from another job. */
130 cl_emit(&job
->bcl
, OCCLUSION_QUERY_COUNTER
, counter
);
132 /* "Binning mode lists must have a Start Tile Binning item (6) after
133 * any prefix state data before the binning list proper starts."
135 cl_emit(&job
->bcl
, START_TILE_BINNING
, bin
);
/* Record the FB size this job was set up for, so later draws can
 * detect a framebuffer change.
 */
137 job
->needs_flush
= true;
138 job
->draw_width
= v3d
->framebuffer
.width
;
139 job
->draw_height
= v3d
->framebuffer
.height
;
/* Flushes any pending jobs that write resources read by shader stage 's'
 * in the upcoming draw: sampled textures (updating shadow copies where
 * needed), UBOs, SSBOs, shader images, and — for the vertex stage —
 * vertex buffers (e.g. written by transform feedback).
 * NOTE(review): extraction is lossy; some call arguments and braces from
 * the original are absent from this view.
 */
143 v3d_predraw_check_stage_inputs(struct pipe_context
*pctx
,
144 enum pipe_shader_type s
)
146 struct v3d_context
*v3d
= v3d_context(pctx
);
148 /* Flush writes to textures we're sampling. */
149 for (int i
= 0; i
< v3d
->tex
[s
].num_textures
; i
++) {
150 struct pipe_sampler_view
*pview
= v3d
->tex
[s
].textures
[i
];
153 struct v3d_sampler_view
*view
= v3d_sampler_view(pview
);
/* Shadow textures track a separate copy of the resource; refresh it
 * before sampling (X32_S8X24 is excluded).
 */
155 if (view
->texture
!= view
->base
.texture
&&
156 view
->base
.format
!= PIPE_FORMAT_X32_S8X24_UINT
)
157 v3d_update_shadow_texture(pctx
, &view
->base
);
159 v3d_flush_jobs_writing_resource(v3d
, view
->texture
,
163 /* Flush writes to UBOs. */
164 foreach_bit(i
, v3d
->constbuf
[s
].enabled_mask
) {
165 struct pipe_constant_buffer
*cb
= &v3d
->constbuf
[s
].cb
[i
];
167 v3d_flush_jobs_writing_resource(v3d
, cb
->buffer
,
172 /* Flush reads/writes to our SSBOs */
173 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
174 struct pipe_shader_buffer
*sb
= &v3d
->ssbo
[s
].sb
[i
];
176 v3d_flush_jobs_reading_resource(v3d
, sb
->buffer
,
177 V3D_FLUSH_NOT_CURRENT_JOB
);
181 /* Flush reads/writes to our image views */
182 foreach_bit(i
, v3d
->shaderimg
[s
].enabled_mask
) {
183 struct v3d_image_view
*view
= &v3d
->shaderimg
[s
].si
[i
];
185 v3d_flush_jobs_reading_resource(v3d
, view
->base
.resource
,
186 V3D_FLUSH_NOT_CURRENT_JOB
);
189 /* Flush writes to our vertex buffers (i.e. from transform feedback) */
190 if (s
== PIPE_SHADER_VERTEX
) {
191 foreach_bit(i
, v3d
->vertexbuf
.enabled_mask
) {
192 struct pipe_vertex_buffer
*vb
= &v3d
->vertexbuf
.vb
[i
];
194 v3d_flush_jobs_writing_resource(v3d
, vb
->buffer
.resource
,
/* Flushes any pending jobs that read the transform-feedback buffers this
 * draw is about to write, so readers see consistent data.
 * NOTE(review): lossy extraction — trailing call arguments and braces
 * from the original are absent from this view.
 */
201 v3d_predraw_check_outputs(struct pipe_context
*pctx
)
203 struct v3d_context
*v3d
= v3d_context(pctx
);
205 /* Flush jobs reading from TF buffers that we are about to write. */
206 if (v3d_transform_feedback_enabled(v3d
)) {
207 struct v3d_streamout_stateobj
*so
= &v3d
->streamout
;
209 for (int i
= 0; i
< so
->num_targets
; i
++) {
213 const struct pipe_stream_output_target
*target
=
215 v3d_flush_jobs_reading_resource(v3d
, target
->buffer
,
222 * Checks if the state for the current draw reads a particular resource in
223 * the given shader stage.
/* Returns whether the current draw state reads 'prsc' in shader stage 's',
 * by comparing the resource's BO against the BOs bound as vertex buffers
 * (vertex stage only), constant buffers, SSBOs, and sampled textures.
 * NOTE(review): the original's return statements are absent from this
 * lossy extraction; presumably each BO match returns true and the
 * fall-through returns false — confirm against the upstream source.
 */
226 v3d_state_reads_resource(struct v3d_context
*v3d
,
227 struct pipe_resource
*prsc
,
228 enum pipe_shader_type s
)
230 struct v3d_resource
*rsc
= v3d_resource(prsc
);
/* Vertex buffers (vertex stage only). */
233 if (s
== PIPE_SHADER_VERTEX
) {
234 foreach_bit(i
, v3d
->vertexbuf
.enabled_mask
) {
235 struct pipe_vertex_buffer
*vb
= &v3d
->vertexbuf
.vb
[i
];
236 if (!vb
->buffer
.resource
)
239 struct v3d_resource
*vb_rsc
=
240 v3d_resource(vb
->buffer
.resource
);
241 if (rsc
->bo
== vb_rsc
->bo
)
246 /* Constant buffers */
247 foreach_bit(i
, v3d
->constbuf
[s
].enabled_mask
) {
248 struct pipe_constant_buffer
*cb
= &v3d
->constbuf
[s
].cb
[i
];
252 struct v3d_resource
*cb_rsc
= v3d_resource(cb
->buffer
);
253 if (rsc
->bo
== cb_rsc
->bo
)
257 /* Shader storage buffers */
258 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
259 struct pipe_shader_buffer
*sb
= &v3d
->ssbo
[s
].sb
[i
];
263 struct v3d_resource
*sb_rsc
= v3d_resource(sb
->buffer
);
264 if (rsc
->bo
== sb_rsc
->bo
)
/* Sampled textures. */
269 for (int i
= 0; i
< v3d
->tex
[s
].num_textures
; i
++) {
270 struct pipe_sampler_view
*pview
= v3d
->tex
[s
].textures
[i
];
274 struct v3d_sampler_view
*view
= v3d_sampler_view(pview
);
275 struct v3d_resource
*v_rsc
= v3d_resource(view
->texture
);
276 if (rsc
->bo
== v_rsc
->bo
)
/* Emits a flush of transform-feedback data followed by a wait on it into
 * the job's BCL, then clears the set of TF-written resources so the same
 * work is not waited on again.
 */
284 v3d_emit_wait_for_tf(struct v3d_job
*job
)
286 /* XXX: we might be able to skip this in some cases, for now we
289 cl_emit(&job
->bcl
, FLUSH_TRANSFORM_FEEDBACK_DATA
, flush
);
291 cl_emit(&job
->bcl
, WAIT_FOR_TRANSFORM_FEEDBACK
, wait
) {
292 /* XXX: Wait for all outstanding writes... maybe we can do
293 * better in some cases.
295 wait
.block_count
= 255;
298 /* We have just flushed all our outstanding TF work in this job so make
299 * sure we don't emit TF flushes for any of it again.
301 _mesa_set_clear(job
->tf_write_prscs
, NULL
);
/* If this job has TF enabled and any stage of the current draw state
 * (other than the fragment stage, which always runs after binning/TF)
 * reads a resource the job's TF has written, emits a TF flush + wait.
 */
305 v3d_emit_wait_for_tf_if_needed(struct v3d_context
*v3d
, struct v3d_job
*job
)
307 if (!job
->tf_enabled
)
310 set_foreach(job
->tf_write_prscs
, entry
) {
311 struct pipe_resource
*prsc
= (struct pipe_resource
*)entry
->key
;
312 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++) {
313 /* Fragment shaders can only start executing after all
314 * binning (and thus TF) is complete.
316 * XXX: For VS/GS/TES, if the binning shader does not
317 * read the resource then we could also avoid emitting
320 if (s
== PIPE_SHADER_FRAGMENT
)
323 if (v3d_state_reads_resource(v3d
, prsc
, s
)) {
324 v3d_emit_wait_for_tf(job
);
331 #if V3D_VERSION >= 41
/* Emits the GEOMETRY_SHADER_STATE_RECORD into the job's indirect CL,
 * filling in code addresses, threading mode, and uniform addresses for
 * both the bin-mode and render-mode geometry shaders (V3D >= 41 only).
 * NOTE(review): some continuation lines (e.g. the offset argument of the
 * bin-mode cl_address and the uniform-address right-hand sides) are
 * absent from this lossy extraction.
 */
333 v3d_emit_gs_state_record(struct v3d_job
*job
,
334 struct v3d_compiled_shader
*gs_bin
,
335 struct v3d_cl_reloc gs_bin_uniforms
,
336 struct v3d_compiled_shader
*gs
,
337 struct v3d_cl_reloc gs_render_uniforms
)
339 cl_emit(&job
->indirect
, GEOMETRY_SHADER_STATE_RECORD
, shader
) {
340 shader
.geometry_bin_mode_shader_code_address
=
341 cl_address(v3d_resource(gs_bin
->resource
)->bo
,
343 shader
.geometry_bin_mode_shader_4_way_threadable
=
344 gs_bin
->prog_data
.gs
->base
.threads
== 4;
345 shader
.geometry_bin_mode_shader_start_in_final_thread_section
=
346 gs_bin
->prog_data
.gs
->base
.single_seg
;
347 shader
.geometry_bin_mode_shader_propagate_nans
= true;
348 shader
.geometry_bin_mode_shader_uniforms_address
=
351 shader
.geometry_render_mode_shader_code_address
=
352 cl_address(v3d_resource(gs
->resource
)->bo
, gs
->offset
);
353 shader
.geometry_render_mode_shader_4_way_threadable
=
354 gs
->prog_data
.gs
->base
.threads
== 4;
355 shader
.geometry_render_mode_shader_start_in_final_thread_section
=
356 gs
->prog_data
.gs
->base
.single_seg
;
357 shader
.geometry_render_mode_shader_propagate_nans
= true;
358 shader
.geometry_render_mode_shader_uniforms_address
=
/* Maps a GS output primitive type to the hardware GEOMETRY_SHADER_*
 * output-format enum; unsupported types hit unreachable().
 * NOTE(review): the switch header and some case labels are absent from
 * this lossy extraction.
 */
364 v3d_gs_output_primitive(uint32_t prim_type
)
368 return GEOMETRY_SHADER_POINTS
;
370 return GEOMETRY_SHADER_LINE_STRIP
;
371 case GL_TRIANGLE_STRIP
:
372 return GEOMETRY_SHADER_TRI_STRIP
;
374 unreachable("Unsupported primitive type");
/* Emits TESSELLATION_GEOMETRY_COMMON_PARAMS into the indirect CL.
 * Tessellation fields get placeholder defaults (no TES support yet);
 * the GS output format and instance count come from the arguments.
 */
379 v3d_emit_tes_gs_common_params(struct v3d_job
*job
,
380 uint8_t gs_out_prim_type
,
381 uint8_t gs_num_invocations
)
383 /* This, and v3d_emit_tes_gs_shader_params below, fill in default
384 * values for tessellation fields even though we don't support
385 * tessellation yet because our packing functions (and the simulator)
386 * complain if we don't.
388 cl_emit(&job
->indirect
, TESSELLATION_GEOMETRY_COMMON_PARAMS
, shader
) {
389 shader
.tessellation_type
= TESSELLATION_TYPE_TRIANGLE
;
390 shader
.tessellation_point_mode
= false;
391 shader
.tessellation_edge_spacing
= TESSELLATION_EDGE_SPACING_EVEN
;
392 shader
.tessellation_clockwise
= true;
393 shader
.tessellation_invocations
= 1;
395 shader
.geometry_shader_output_format
=
396 v3d_gs_output_primitive(gs_out_prim_type
);
/* Hardware field is 5 bits wide, hence the mask. */
397 shader
.geometry_shader_instances
= gs_num_invocations
& 0x1F;
/* Emits TESSELLATION_GEOMETRY_SHADER_PARAMS into the indirect CL.
 * TCS/TES fields are placeholder defaults (tessellation unsupported);
 * GS segment sizing comes from the compiled GS prog data.
 * NOTE(review): the right-hand side of gs_output_segment_size_in_sectors
 * is absent from this lossy extraction.
 */
402 v3d_emit_tes_gs_shader_params(struct v3d_job
*job
,
403 struct v3d_gs_prog_data
*gs
)
405 cl_emit(&job
->indirect
, TESSELLATION_GEOMETRY_SHADER_PARAMS
, shader
) {
406 shader
.tcs_batch_flush_mode
= V3D_TCS_FLUSH_MODE_FULLY_PACKED
;
407 shader
.per_patch_data_column_depth
= 1;
408 shader
.tcs_output_segment_size_in_sectors
= 1;
409 shader
.tcs_output_segment_pack_mode
= V3D_PACK_MODE_16_WAY
;
410 shader
.tes_output_segment_size_in_sectors
= 1;
411 shader
.tes_output_segment_pack_mode
= V3D_PACK_MODE_16_WAY
;
412 shader
.gs_output_segment_size_in_sectors
=
414 shader
.gs_output_segment_pack_mode
= V3D_PACK_MODE_16_WAY
; /* FIXME*/
415 shader
.tbg_max_patches_per_tcs_batch
= 1;
416 shader
.tbg_max_extra_vertex_segs_for_patches_after_first
= 0;
417 shader
.tbg_min_tcs_output_segments_required_in_play
= 1;
418 shader
.tbg_min_per_patch_data_segments_required_in_play
= 1;
419 shader
.tpg_max_patches_per_tes_batch
= 1;
420 shader
.tpg_max_vertex_segments_per_tes_batch
= 0;
421 shader
.tpg_max_tcs_output_segments_per_tes_batch
= 1;
422 shader
.tpg_min_tes_output_segments_required_in_play
= 1;
423 shader
.gbg_max_tes_output_vertex_segments_per_gs_batch
= 0;
424 shader
.gbg_min_gs_output_segments_required_in_play
= 1;
/* Builds and emits the GL shader state for a draw: uploads uniforms for
 * all active stages into the indirect CL, updates the job's TMU-dirty
 * flag, emits the GS records (V3D >= 41, when a GS is bound), the
 * GL_SHADER_STATE_RECORD, one attribute record per vertex element (with
 * the GFXH-930 dummy-attribute workaround), the VCM cache size, and the
 * GL_SHADER_STATE packet pointing at the records; finally drops the
 * local uniform BO references.
 * NOTE(review): lossy extraction — braces, #else/#endif lines, and some
 * continuation arguments are absent from this view; comments below are
 * added without touching the surviving text.
 */
431 v3d_emit_gl_shader_state(struct v3d_context
*v3d
,
432 const struct pipe_draw_info
*info
)
434 struct v3d_job
*job
= v3d
->job
;
435 /* VC5_DIRTY_VTXSTATE */
436 struct v3d_vertex_stateobj
*vtx
= v3d
->vtx
;
437 /* VC5_DIRTY_VTXBUF */
438 struct v3d_vertexbuf_stateobj
*vertexbuf
= &v3d
->vertexbuf
;
440 /* Upload the uniforms to the indirect CL first */
441 struct v3d_cl_reloc fs_uniforms
=
442 v3d_write_uniforms(v3d
, job
, v3d
->prog
.fs
,
443 PIPE_SHADER_FRAGMENT
);
445 struct v3d_cl_reloc gs_uniforms
= { NULL
, 0 };
446 struct v3d_cl_reloc gs_bin_uniforms
= { NULL
, 0 };
448 gs_uniforms
= v3d_write_uniforms(v3d
, job
, v3d
->prog
.gs
,
449 PIPE_SHADER_GEOMETRY
);
451 if (v3d
->prog
.gs_bin
) {
452 gs_bin_uniforms
= v3d_write_uniforms(v3d
, job
, v3d
->prog
.gs_bin
,
453 PIPE_SHADER_GEOMETRY
);
456 struct v3d_cl_reloc vs_uniforms
=
457 v3d_write_uniforms(v3d
, job
, v3d
->prog
.vs
,
459 struct v3d_cl_reloc cs_uniforms
=
460 v3d_write_uniforms(v3d
, job
, v3d
->prog
.cs
,
463 /* Update the cache dirty flag based on the shader progs data */
464 job
->tmu_dirty_rcl
|= v3d
->prog
.cs
->prog_data
.vs
->base
.tmu_dirty_rcl
;
465 job
->tmu_dirty_rcl
|= v3d
->prog
.vs
->prog_data
.vs
->base
.tmu_dirty_rcl
;
466 if (v3d
->prog
.gs_bin
) {
467 job
->tmu_dirty_rcl
|=
468 v3d
->prog
.gs_bin
->prog_data
.gs
->base
.tmu_dirty_rcl
;
471 job
->tmu_dirty_rcl
|=
472 v3d
->prog
.gs
->prog_data
.gs
->base
.tmu_dirty_rcl
;
474 job
->tmu_dirty_rcl
|= v3d
->prog
.fs
->prog_data
.fs
->base
.tmu_dirty_rcl
;
476 /* See GFXH-930 workaround below */
477 uint32_t num_elements_to_emit
= MAX2(vtx
->num_elements
, 1);
479 uint32_t shader_state_record_length
=
480 cl_packet_length(GL_SHADER_STATE_RECORD
);
481 #if V3D_VERSION >= 41
483 shader_state_record_length
+=
484 cl_packet_length(GEOMETRY_SHADER_STATE_RECORD
) +
485 cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS
) +
486 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS
);
/* Reserve indirect-CL space for the records plus per-element
 * attribute records.
 */
490 uint32_t shader_rec_offset
=
491 v3d_cl_ensure_space(&job
->indirect
,
492 shader_state_record_length
+
493 num_elements_to_emit
*
494 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD
),
497 /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
498 * compile time, so that we mostly just have to OR the VS and FS
499 * records together at draw time.
501 #if V3D_VERSION >= 41
503 v3d_emit_gs_state_record(v3d
->job
,
504 v3d
->prog
.gs_bin
, gs_bin_uniforms
,
505 v3d
->prog
.gs
, gs_uniforms
);
507 struct v3d_gs_prog_data
*gs
= v3d
->prog
.gs
->prog_data
.gs
;
508 struct v3d_gs_prog_data
*gs_bin
= v3d
->prog
.gs_bin
->prog_data
.gs
;
510 v3d_emit_tes_gs_common_params(v3d
->job
,
512 gs
->num_invocations
);
513 v3d_emit_tes_gs_shader_params(v3d
->job
, gs_bin
);
514 v3d_emit_tes_gs_shader_params(v3d
->job
, gs
);
518 cl_emit(&job
->indirect
, GL_SHADER_STATE_RECORD
, shader
) {
519 shader
.enable_clipping
= true;
520 /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
521 shader
.point_size_in_shaded_vertex_data
=
522 (info
->mode
== PIPE_PRIM_POINTS
&&
523 v3d
->rasterizer
->base
.point_size_per_vertex
);
525 /* Must be set if the shader modifies Z, discards, or modifies
526 * the sample mask. For any of these cases, the fragment
527 * shader needs to write the Z value (even just discards).
529 shader
.fragment_shader_does_z_writes
=
530 v3d
->prog
.fs
->prog_data
.fs
->writes_z
;
531 /* Set if the EZ test must be disabled (due to shader side
532 * effects and the early_z flag not being present in the
535 shader
.turn_off_early_z_test
=
536 v3d
->prog
.fs
->prog_data
.fs
->disable_ez
;
538 shader
.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2
=
539 v3d
->prog
.fs
->prog_data
.fs
->uses_center_w
;
541 #if V3D_VERSION >= 41
542 shader
.any_shader_reads_hardware_written_primitive_id
=
543 v3d
->prog
.gs
? v3d
->prog
.gs
->prog_data
.gs
->uses_pid
:
547 #if V3D_VERSION >= 40
548 shader
.do_scoreboard_wait_on_first_thread_switch
=
549 v3d
->prog
.fs
->prog_data
.fs
->lock_scoreboard_on_first_thrsw
;
550 shader
.disable_implicit_point_line_varyings
=
551 !v3d
->prog
.fs
->prog_data
.fs
->uses_implicit_point_line_varyings
;
554 shader
.number_of_varyings_in_fragment_shader
=
555 v3d
->prog
.fs
->prog_data
.fs
->num_inputs
;
557 shader
.coordinate_shader_propagate_nans
= true;
558 shader
.vertex_shader_propagate_nans
= true;
559 shader
.fragment_shader_propagate_nans
= true;
561 shader
.coordinate_shader_code_address
=
562 cl_address(v3d_resource(v3d
->prog
.cs
->resource
)->bo
,
563 v3d
->prog
.cs
->offset
);
564 shader
.vertex_shader_code_address
=
565 cl_address(v3d_resource(v3d
->prog
.vs
->resource
)->bo
,
566 v3d
->prog
.vs
->offset
);
567 shader
.fragment_shader_code_address
=
568 cl_address(v3d_resource(v3d
->prog
.fs
->resource
)->bo
,
569 v3d
->prog
.fs
->offset
);
571 /* XXX: Use combined input/output size flag in the common
574 shader
.coordinate_shader_has_separate_input_and_output_vpm_blocks
=
575 v3d
->prog
.cs
->prog_data
.vs
->separate_segments
;
576 shader
.vertex_shader_has_separate_input_and_output_vpm_blocks
=
577 v3d
->prog
.vs
->prog_data
.vs
->separate_segments
;
579 shader
.coordinate_shader_input_vpm_segment_size
=
580 v3d
->prog
.cs
->prog_data
.vs
->separate_segments
?
581 v3d
->prog
.cs
->prog_data
.vs
->vpm_input_size
: 1;
582 shader
.vertex_shader_input_vpm_segment_size
=
583 v3d
->prog
.vs
->prog_data
.vs
->separate_segments
?
584 v3d
->prog
.vs
->prog_data
.vs
->vpm_input_size
: 1;
586 shader
.coordinate_shader_output_vpm_segment_size
=
587 v3d
->prog
.cs
->prog_data
.vs
->vpm_output_size
;
588 shader
.vertex_shader_output_vpm_segment_size
=
589 v3d
->prog
.vs
->prog_data
.vs
->vpm_output_size
;
591 shader
.coordinate_shader_uniforms_address
= cs_uniforms
;
592 shader
.vertex_shader_uniforms_address
= vs_uniforms
;
593 shader
.fragment_shader_uniforms_address
= fs_uniforms
;
595 #if V3D_VERSION >= 41
596 shader
.min_coord_shader_input_segments_required_in_play
= 1;
597 shader
.min_vertex_shader_input_segments_required_in_play
= 1;
599 shader
.coordinate_shader_4_way_threadable
=
600 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 4;
601 shader
.vertex_shader_4_way_threadable
=
602 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 4;
603 shader
.fragment_shader_4_way_threadable
=
604 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 4;
606 shader
.coordinate_shader_start_in_final_thread_section
=
607 v3d
->prog
.cs
->prog_data
.vs
->base
.single_seg
;
608 shader
.vertex_shader_start_in_final_thread_section
=
609 v3d
->prog
.vs
->prog_data
.vs
->base
.single_seg
;
610 shader
.fragment_shader_start_in_final_thread_section
=
611 v3d
->prog
.fs
->prog_data
.fs
->base
.single_seg
;
/* Pre-4.1 variant of the threading fields (the #else line is
 * absent from this extraction).
 */
613 shader
.coordinate_shader_4_way_threadable
=
614 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 4;
615 shader
.coordinate_shader_2_way_threadable
=
616 v3d
->prog
.cs
->prog_data
.vs
->base
.threads
== 2;
617 shader
.vertex_shader_4_way_threadable
=
618 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 4;
619 shader
.vertex_shader_2_way_threadable
=
620 v3d
->prog
.vs
->prog_data
.vs
->base
.threads
== 2;
621 shader
.fragment_shader_4_way_threadable
=
622 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 4;
623 shader
.fragment_shader_2_way_threadable
=
624 v3d
->prog
.fs
->prog_data
.fs
->base
.threads
== 2;
627 shader
.vertex_id_read_by_coordinate_shader
=
628 v3d
->prog
.cs
->prog_data
.vs
->uses_vid
;
629 shader
.instance_id_read_by_coordinate_shader
=
630 v3d
->prog
.cs
->prog_data
.vs
->uses_iid
;
631 shader
.vertex_id_read_by_vertex_shader
=
632 v3d
->prog
.vs
->prog_data
.vs
->uses_vid
;
633 shader
.instance_id_read_by_vertex_shader
=
634 v3d
->prog
.vs
->prog_data
.vs
->uses_iid
;
636 shader
.address_of_default_attribute_values
=
637 cl_address(v3d_resource(vtx
->defaults
)->bo
,
638 vtx
->defaults_offset
);
/* Emit one attribute record per vertex element, tracking whether
 * the coordinate shader reads any attribute (GFXH-930).
 */
641 bool cs_loaded_any
= false;
642 for (int i
= 0; i
< vtx
->num_elements
; i
++) {
643 struct pipe_vertex_element
*elem
= &vtx
->pipe
[i
];
644 struct pipe_vertex_buffer
*vb
=
645 &vertexbuf
->vb
[elem
->vertex_buffer_index
];
646 struct v3d_resource
*rsc
= v3d_resource(vb
->buffer
.resource
);
648 const uint32_t size
=
649 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD
);
650 cl_emit_with_prepacked(&job
->indirect
,
651 GL_SHADER_STATE_ATTRIBUTE_RECORD
,
652 &vtx
->attrs
[i
* size
], attr
) {
653 attr
.stride
= vb
->stride
;
654 attr
.address
= cl_address(rsc
->bo
,
657 attr
.number_of_values_read_by_coordinate_shader
=
658 v3d
->prog
.cs
->prog_data
.vs
->vattr_sizes
[i
];
659 attr
.number_of_values_read_by_vertex_shader
=
660 v3d
->prog
.vs
->prog_data
.vs
->vattr_sizes
[i
];
662 /* GFXH-930: At least one attribute must be enabled
663 * and read by CS and VS. If we have attributes being
664 * consumed by the VS but not the CS, then set up a
665 * dummy load of the last attribute into the CS's VPM
666 * inputs. (Since CS is just dead-code-elimination
667 * compared to VS, we can't have CS loading but not
670 if (v3d
->prog
.cs
->prog_data
.vs
->vattr_sizes
[i
])
671 cs_loaded_any
= true;
672 if (i
== vtx
->num_elements
- 1 && !cs_loaded_any
) {
673 attr
.number_of_values_read_by_coordinate_shader
= 1;
675 #if V3D_VERSION >= 41
676 attr
.maximum_index
= 0xffffff;
679 STATIC_ASSERT(sizeof(vtx
->attrs
) >= V3D_MAX_VS_INPUTS
/ 4 * size
);
682 if (vtx
->num_elements
== 0) {
683 /* GFXH-930: At least one attribute must be enabled and read
684 * by CS and VS. If we have no attributes being consumed by
685 * the shader, set up a dummy to be loaded into the VPM.
687 cl_emit(&job
->indirect
, GL_SHADER_STATE_ATTRIBUTE_RECORD
, attr
) {
688 /* Valid address of data whose value will be unused. */
689 attr
.address
= cl_address(job
->indirect
.bo
, 0);
691 attr
.type
= ATTRIBUTE_FLOAT
;
695 attr
.number_of_values_read_by_coordinate_shader
= 1;
696 attr
.number_of_values_read_by_vertex_shader
= 1;
700 cl_emit(&job
->bcl
, VCM_CACHE_SIZE
, vcm
) {
701 vcm
.number_of_16_vertex_batches_for_binning
=
702 v3d
->prog
.cs
->prog_data
.vs
->vcm_cache_size
;
703 vcm
.number_of_16_vertex_batches_for_rendering
=
704 v3d
->prog
.vs
->prog_data
.vs
->vcm_cache_size
;
/* Point the BCL at the shader record (GS-including variant on
 * V3D >= 41 when a GS is bound).
 */
707 #if V3D_VERSION >= 41
709 cl_emit(&job
->bcl
, GL_SHADER_STATE_INCLUDING_GS
, state
) {
710 state
.address
= cl_address(job
->indirect
.bo
,
712 state
.number_of_attribute_arrays
= num_elements_to_emit
;
715 cl_emit(&job
->bcl
, GL_SHADER_STATE
, state
) {
716 state
.address
= cl_address(job
->indirect
.bo
,
718 state
.number_of_attribute_arrays
= num_elements_to_emit
;
722 assert(!v3d
->prog
.gs
);
723 cl_emit(&job
->bcl
, GL_SHADER_STATE
, state
) {
724 state
.address
= cl_address(job
->indirect
.bo
, shader_rec_offset
);
725 state
.number_of_attribute_arrays
= num_elements_to_emit
;
/* Release the local references on the uniform stream BOs. */
729 v3d_bo_unreference(&cs_uniforms
.bo
);
730 v3d_bo_unreference(&vs_uniforms
.bo
);
732 v3d_bo_unreference(&gs_uniforms
.bo
);
733 if (gs_bin_uniforms
.bo
)
734 v3d_bo_unreference(&gs_bin_uniforms
.bo
);
735 v3d_bo_unreference(&fs_uniforms
.bo
);
739 * Updates the number of primitives generated from the number of vertices
740 * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because
741 * using the GPU packet for this might require sync waits and this is trivial
742 * to handle in the CPU instead.
/* Accumulates the primitives-generated query counter on the CPU from the
 * draw's vertex count (only while queries are active), avoiding the GPU
 * PRIMITIVE_COUNTS_FEEDBACK packet and its potential sync waits.
 */
745 v3d_update_primitives_generated_counter(struct v3d_context
*v3d
,
746 const struct pipe_draw_info
*info
)
748 if (!v3d
->active_queries
)
751 uint32_t prims
= u_prims_for_vertices(info
->mode
, info
->count
);
752 v3d
->prims_generated
+= prims
;
/* Reconciles the job's early-Z state with the bound Z/S state: keeps the
 * current direction when the Z/S state is undecided, disables EZ on a
 * direction conflict or when Z/S (or an FS that writes Z) forbids it,
 * and latches the first decided state for the job.
 * NOTE(review): case labels for the directional VC5_EZ_* values and some
 * break statements are absent from this lossy extraction.
 */
756 v3d_update_job_ez(struct v3d_context
*v3d
, struct v3d_job
*job
)
758 switch (v3d
->zsa
->ez_state
) {
759 case VC5_EZ_UNDECIDED
:
760 /* If the Z/S state didn't pick a direction but didn't
761 * disable, then go along with the current EZ state. This
762 * allows EZ optimization for Z func == EQUAL or NEVER.
768 /* If the Z/S state picked a direction, then it needs to match
769 * the current direction if we've decided on one.
771 if (job
->ez_state
== VC5_EZ_UNDECIDED
)
772 job
->ez_state
= v3d
->zsa
->ez_state
;
773 else if (job
->ez_state
!= v3d
->zsa
->ez_state
)
774 job
->ez_state
= VC5_EZ_DISABLED
;
777 case VC5_EZ_DISABLED
:
778 /* If the current Z/S state disables EZ because of a bad Z
779 * func or stencil operation, then we can't do any more EZ in
782 job
->ez_state
= VC5_EZ_DISABLED
;
786 /* If the FS affects the Z of the pixels, then it may update against
787 * the chosen EZ direction (though we could use
788 * ARB_conservative_depth's hints to avoid this)
790 if (v3d
->prog
.fs
->prog_data
.fs
->writes_z
) {
791 job
->ez_state
= VC5_EZ_DISABLED
;
794 if (job
->first_ez_state
== VC5_EZ_UNDECIDED
&&
795 (job
->ez_state
!= VC5_EZ_DISABLED
|| job
->draw_calls_queued
== 0))
796 job
->first_ez_state
= job
->ez_state
;
800 v3d_draw_vbo(struct pipe_context
*pctx
, const struct pipe_draw_info
*info
)
802 struct v3d_context
*v3d
= v3d_context(pctx
);
804 if (!info
->count_from_stream_output
&& !info
->indirect
&&
805 !info
->primitive_restart
&&
806 !u_trim_pipe_prim(info
->mode
, (unsigned*)&info
->count
))
809 /* Fall back for weird desktop GL primitive restart values. */
810 if (info
->primitive_restart
&&
814 switch (info
->index_size
) {
823 if (info
->restart_index
!= mask
) {
824 util_draw_vbo_without_prim_restart(pctx
, info
);
829 if (info
->mode
>= PIPE_PRIM_QUADS
) {
830 util_primconvert_save_rasterizer_state(v3d
->primconvert
, &v3d
->rasterizer
->base
);
831 util_primconvert_draw_vbo(v3d
->primconvert
, info
);
832 perf_debug("Fallback conversion for %d %s vertices\n",
833 info
->count
, u_prim_name(info
->mode
));
837 /* Before setting up the draw, flush anything writing to the resources
838 * that we read from or reading from resources we write to.
840 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++)
841 v3d_predraw_check_stage_inputs(pctx
, s
);
843 if (info
->indirect
) {
844 v3d_flush_jobs_writing_resource(v3d
, info
->indirect
->buffer
,
848 v3d_predraw_check_outputs(pctx
);
850 /* If transform feedback is active and we are switching primitive type
851 * we need to submit the job before drawing and update the vertex count
852 * written to TF based on the primitive type since we will need to
853 * know the exact vertex count if the application decides to call
854 * glDrawTransformFeedback() later.
856 if (v3d
->streamout
.num_targets
> 0 &&
857 u_base_prim_type(info
->mode
) != u_base_prim_type(v3d
->prim_mode
)) {
858 v3d_tf_update_counters(v3d
);
861 struct v3d_job
*job
= v3d_get_job_for_fbo(v3d
);
863 /* If vertex texturing depends on the output of rendering, we need to
864 * ensure that that rendering is complete before we run a coordinate
865 * shader that depends on it.
867 * Given that doing that is unusual, for now we just block the binner
868 * on the last submitted render, rather than tracking the last
869 * rendering to each texture's BO.
871 if (v3d
->tex
[PIPE_SHADER_VERTEX
].num_textures
|| info
->indirect
) {
872 perf_debug("Blocking binner on last render "
873 "due to vertex texturing or indirect drawing.\n");
874 job
->submit
.in_sync_bcl
= v3d
->out_sync
;
877 /* Mark SSBOs and images as being written. We don't actually know
878 * which ones are read vs written, so just assume the worst.
880 for (int s
= 0; s
< PIPE_SHADER_COMPUTE
; s
++) {
881 foreach_bit(i
, v3d
->ssbo
[s
].enabled_mask
) {
882 v3d_job_add_write_resource(job
,
883 v3d
->ssbo
[s
].sb
[i
].buffer
);
884 job
->tmu_dirty_rcl
= true;
887 foreach_bit(i
, v3d
->shaderimg
[s
].enabled_mask
) {
888 v3d_job_add_write_resource(job
,
889 v3d
->shaderimg
[s
].si
[i
].base
.resource
);
890 job
->tmu_dirty_rcl
= true;
894 /* Get space to emit our draw call into the BCL, using a branch to
895 * jump to a new BO if necessary.
897 v3d_cl_ensure_space_with_branch(&job
->bcl
, 256 /* XXX */);
899 if (v3d
->prim_mode
!= info
->mode
) {
900 v3d
->prim_mode
= info
->mode
;
901 v3d
->dirty
|= VC5_DIRTY_PRIM_MODE
;
905 v3d_update_compiled_shaders(v3d
, info
->mode
);
906 v3d_update_job_ez(v3d
, job
);
908 /* If this job was writing to transform feedback buffers before this
909 * draw and we are reading from them here, then we need to wait for TF
910 * to complete before we emit this draw.
912 * Notice this check needs to happen before we emit state for the
913 * current draw call, where we update job->tf_enabled, so we can ensure
914 * that we only check TF writes for prior draws.
916 v3d_emit_wait_for_tf_if_needed(v3d
, job
);
918 #if V3D_VERSION >= 41
919 v3d41_emit_state(pctx
);
921 v3d33_emit_state(pctx
);
924 if (v3d
->dirty
& (VC5_DIRTY_VTXBUF
|
926 VC5_DIRTY_PRIM_MODE
|
927 VC5_DIRTY_RASTERIZER
|
928 VC5_DIRTY_COMPILED_CS
|
929 VC5_DIRTY_COMPILED_VS
|
930 VC5_DIRTY_COMPILED_GS_BIN
|
931 VC5_DIRTY_COMPILED_GS
|
932 VC5_DIRTY_COMPILED_FS
|
933 v3d
->prog
.cs
->uniform_dirty_bits
|
934 v3d
->prog
.vs
->uniform_dirty_bits
|
936 v3d
->prog
.gs_bin
->uniform_dirty_bits
: 0) |
938 v3d
->prog
.gs
->uniform_dirty_bits
: 0) |
939 v3d
->prog
.fs
->uniform_dirty_bits
)) {
940 v3d_emit_gl_shader_state(v3d
, info
);
945 /* The Base Vertex/Base Instance packet sets those values to nonzero
946 * for the next draw call only.
948 if (info
->index_bias
|| info
->start_instance
) {
949 cl_emit(&job
->bcl
, BASE_VERTEX_BASE_INSTANCE
, base
) {
950 base
.base_instance
= info
->start_instance
;
951 base
.base_vertex
= info
->index_bias
;
955 uint32_t prim_tf_enable
= 0;
957 /* V3D 3.x: The HW only processes transform feedback on primitives
960 if (v3d
->streamout
.num_targets
)
961 prim_tf_enable
= (V3D_PRIM_POINTS_TF
- V3D_PRIM_POINTS
);
964 v3d_update_primitives_generated_counter(v3d
, info
);
966 /* Note that the primitive type fields match with OpenGL/gallium
967 * definitions, up to but not including QUADS.
969 if (info
->index_size
) {
970 uint32_t index_size
= info
->index_size
;
971 uint32_t offset
= info
->start
* index_size
;
972 struct pipe_resource
*prsc
;
973 if (info
->has_user_indices
) {
975 u_upload_data(v3d
->uploader
, 0,
976 info
->count
* info
->index_size
, 4,
980 prsc
= info
->index
.resource
;
982 struct v3d_resource
*rsc
= v3d_resource(prsc
);
984 #if V3D_VERSION >= 40
985 cl_emit(&job
->bcl
, INDEX_BUFFER_SETUP
, ib
) {
986 ib
.address
= cl_address(rsc
->bo
, 0);
987 ib
.size
= rsc
->bo
->size
;
991 if (info
->indirect
) {
992 cl_emit(&job
->bcl
, INDIRECT_INDEXED_INSTANCED_PRIM_LIST
, prim
) {
993 prim
.index_type
= ffs(info
->index_size
) - 1;
995 prim
.address_of_indices_list
=
996 cl_address(rsc
->bo
, offset
);
997 #endif /* V3D_VERSION < 40 */
998 prim
.mode
= info
->mode
| prim_tf_enable
;
999 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1001 prim
.number_of_draw_indirect_indexed_records
= info
->indirect
->draw_count
;
1003 prim
.stride_in_multiples_of_4_bytes
= info
->indirect
->stride
>> 2;
1004 prim
.address
= cl_address(v3d_resource(info
->indirect
->buffer
)->bo
,
1005 info
->indirect
->offset
);
1007 } else if (info
->instance_count
> 1) {
1008 cl_emit(&job
->bcl
, INDEXED_INSTANCED_PRIM_LIST
, prim
) {
1009 prim
.index_type
= ffs(info
->index_size
) - 1;
1010 #if V3D_VERSION >= 40
1011 prim
.index_offset
= offset
;
1012 #else /* V3D_VERSION < 40 */
1013 prim
.maximum_index
= (1u << 31) - 1; /* XXX */
1014 prim
.address_of_indices_list
=
1015 cl_address(rsc
->bo
, offset
);
1016 #endif /* V3D_VERSION < 40 */
1017 prim
.mode
= info
->mode
| prim_tf_enable
;
1018 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1020 prim
.number_of_instances
= info
->instance_count
;
1021 prim
.instance_length
= info
->count
;
1024 cl_emit(&job
->bcl
, INDEXED_PRIM_LIST
, prim
) {
1025 prim
.index_type
= ffs(info
->index_size
) - 1;
1026 prim
.length
= info
->count
;
1027 #if V3D_VERSION >= 40
1028 prim
.index_offset
= offset
;
1029 #else /* V3D_VERSION < 40 */
1030 prim
.maximum_index
= (1u << 31) - 1; /* XXX */
1031 prim
.address_of_indices_list
=
1032 cl_address(rsc
->bo
, offset
);
1033 #endif /* V3D_VERSION < 40 */
1034 prim
.mode
= info
->mode
| prim_tf_enable
;
1035 prim
.enable_primitive_restarts
= info
->primitive_restart
;
1039 if (info
->has_user_indices
)
1040 pipe_resource_reference(&prsc
, NULL
);
1042 if (info
->indirect
) {
1043 cl_emit(&job
->bcl
, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS
, prim
) {
1044 prim
.mode
= info
->mode
| prim_tf_enable
;
1045 prim
.number_of_draw_indirect_array_records
= info
->indirect
->draw_count
;
1047 prim
.stride_in_multiples_of_4_bytes
= info
->indirect
->stride
>> 2;
1048 prim
.address
= cl_address(v3d_resource(info
->indirect
->buffer
)->bo
,
1049 info
->indirect
->offset
);
1051 } else if (info
->instance_count
> 1) {
1052 struct pipe_stream_output_target
*so
=
1053 info
->count_from_stream_output
;
1054 uint32_t vert_count
= so
?
1055 v3d_stream_output_target_get_vertex_count(so
) :
1057 cl_emit(&job
->bcl
, VERTEX_ARRAY_INSTANCED_PRIMS
, prim
) {
1058 prim
.mode
= info
->mode
| prim_tf_enable
;
1059 prim
.index_of_first_vertex
= info
->start
;
1060 prim
.number_of_instances
= info
->instance_count
;
1061 prim
.instance_length
= vert_count
;
1064 struct pipe_stream_output_target
*so
=
1065 info
->count_from_stream_output
;
1066 uint32_t vert_count
= so
?
1067 v3d_stream_output_target_get_vertex_count(so
) :
1069 cl_emit(&job
->bcl
, VERTEX_ARRAY_PRIMS
, prim
) {
1070 prim
.mode
= info
->mode
| prim_tf_enable
;
1071 prim
.length
= vert_count
;
1072 prim
.index_of_first_vertex
= info
->start
;
1077 /* A flush is required in between a TF draw and any following TF specs
1078 * packet, or the GPU may hang. Just flush each time for now.
1080 if (v3d
->streamout
.num_targets
)
1081 cl_emit(&job
->bcl
, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT
, flush
);
1083 job
->draw_calls_queued
++;
1084 if (v3d
->streamout
.num_targets
)
1085 job
->tf_draw_calls_queued
++;
1087 /* Increment the TF offsets by how many verts we wrote. XXX: This
1088 * needs some clamping to the buffer size.
1090 for (int i
= 0; i
< v3d
->streamout
.num_targets
; i
++)
1091 v3d
->streamout
.offsets
[i
] += info
->count
;
1093 if (v3d
->zsa
&& job
->zsbuf
&& v3d
->zsa
->base
.depth
.enabled
) {
1094 struct v3d_resource
*rsc
= v3d_resource(job
->zsbuf
->texture
);
1095 v3d_job_add_bo(job
, rsc
->bo
);
1097 job
->load
|= PIPE_CLEAR_DEPTH
& ~job
->clear
;
1098 if (v3d
->zsa
->base
.depth
.writemask
)
1099 job
->store
|= PIPE_CLEAR_DEPTH
;
1100 rsc
->initialized_buffers
= PIPE_CLEAR_DEPTH
;
1103 if (v3d
->zsa
&& job
->zsbuf
&& v3d
->zsa
->base
.stencil
[0].enabled
) {
1104 struct v3d_resource
*rsc
= v3d_resource(job
->zsbuf
->texture
);
1105 if (rsc
->separate_stencil
)
1106 rsc
= rsc
->separate_stencil
;
1108 v3d_job_add_bo(job
, rsc
->bo
);
1110 job
->load
|= PIPE_CLEAR_STENCIL
& ~job
->clear
;
1111 if (v3d
->zsa
->base
.stencil
[0].writemask
||
1112 v3d
->zsa
->base
.stencil
[1].writemask
) {
1113 job
->store
|= PIPE_CLEAR_STENCIL
;
1115 rsc
->initialized_buffers
|= PIPE_CLEAR_STENCIL
;
1118 for (int i
= 0; i
< V3D_MAX_DRAW_BUFFERS
; i
++) {
1119 uint32_t bit
= PIPE_CLEAR_COLOR0
<< i
;
1120 int blend_rt
= v3d
->blend
->base
.independent_blend_enable
? i
: 0;
1122 if (job
->store
& bit
|| !job
->cbufs
[i
])
1124 struct v3d_resource
*rsc
= v3d_resource(job
->cbufs
[i
]->texture
);
1126 job
->load
|= bit
& ~job
->clear
;
1127 if (v3d
->blend
->base
.rt
[blend_rt
].colormask
)
1129 v3d_job_add_bo(job
, rsc
->bo
);
1132 if (job
->referenced_size
> 768 * 1024 * 1024) {
1133 perf_debug("Flushing job with %dkb to try to free up memory\n",
1134 job
->referenced_size
/ 1024);
1138 if (V3D_DEBUG
& V3D_DEBUG_ALWAYS_FLUSH
)
#if V3D_VERSION >= 41
/* Bitfield layout of the CSD (compute shader dispatch) submit.cfg[] words,
 * consumed by v3d_launch_grid() below.  cfg[0..2] carry the X/Y/Z workgroup
 * counts (and offsets).
 */
#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
/* Allow this dispatch to start while the last one is still running. */
#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
/* Maximum supergroup ID. 6 bits. */
#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
/* Batches per supergroup minus 1. 8 bits. */
#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
/* Workgroups per supergroup, 0 means 16 */
#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0

/* cfg[5] flags OR'd into the shader code address below. */
#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
#define V3D_CSD_CFG5_THREADING (1 << 0)
/**
 * Implements pipe_context::launch_grid by building a
 * DRM_IOCTL_V3D_SUBMIT_CSD (compute shader dispatch) job and handing it to
 * the kernel.
 *
 * NOTE(review): several dropped lines (braces, PIPE_TRANSFER_READ flag,
 * wgs_per_sg init, BO name string) were reconstructed — confirm against the
 * upstream file.
 */
static void
v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
{
        struct v3d_context *v3d = v3d_context(pctx);
        struct v3d_screen *screen = v3d->screen;

        v3d_predraw_check_stage_inputs(pctx, PIPE_SHADER_COMPUTE);

        v3d_update_compiled_cs(v3d);

        if (!v3d->prog.compute->resource) {
                /* Warn only once per process; the dispatch is dropped. */
                static bool warned = false;
                if (!warned) {
                        fprintf(stderr,
                                "Compute shader failed to compile. "
                                "Expect corruption.\n");
                        warned = true;
                }
                return;
        }

        /* Some of the units of scale:
         *
         * - Batches of 16 work items (shader invocations) that will be queued
         *   to the run on a QPU at once.
         *
         * - Workgroups composed of work items based on the shader's layout
         *   declaration.
         *
         * - Supergroups of 1-16 workgroups. There can only be 16 supergroups
         *   running at a time on the core, so we want to keep them large to
         *   keep the QPUs busy, but a whole supergroup will sync at a barrier
         *   so we want to keep them small if one is present.
         */
        struct drm_v3d_submit_csd submit = { 0 };
        struct v3d_job *job = v3d_job_create(v3d);

        /* Set up the actual number of workgroups, synchronously mapping the
         * indirect buffer if necessary to get the dimensions.
         */
        if (info->indirect) {
                struct pipe_transfer *transfer;
                /* Map just the three 32-bit dispatch dimensions. */
                uint32_t *map = pipe_buffer_map_range(pctx, info->indirect,
                                                      info->indirect_offset,
                                                      3 * sizeof(uint32_t),
                                                      PIPE_TRANSFER_READ,
                                                      &transfer);
                memcpy(v3d->compute_num_workgroups, map, 3 * sizeof(uint32_t));
                pipe_buffer_unmap(pctx, transfer);

                if (v3d->compute_num_workgroups[0] == 0 ||
                    v3d->compute_num_workgroups[1] == 0 ||
                    v3d->compute_num_workgroups[2] == 0) {
                        /* Nothing to dispatch, so skip the draw (CSD can't
                         * handle 0 workgroups).
                         */
                        return;
                }
        } else {
                v3d->compute_num_workgroups[0] = info->grid[0];
                v3d->compute_num_workgroups[1] = info->grid[1];
                v3d->compute_num_workgroups[2] = info->grid[2];
        }

        /* cfg[0..2]: workgroup counts for each dimension. */
        for (int i = 0; i < 3; i++) {
                submit.cfg[i] |= (v3d->compute_num_workgroups[i] <<
                                  V3D_CSD_CFG012_WG_COUNT_SHIFT);
        }

        perf_debug("CSD only using single WG per SG currently, "
                   "should increase that when possible.");
        int wgs_per_sg = 1;
        int wg_size = info->block[0] * info->block[1] * info->block[2];
        submit.cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
        submit.cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
                          V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
        submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;

        int batches_per_wg = DIV_ROUND_UP(wg_size, 16);
        /* Number of batches the dispatch will invoke (minus 1). */
        submit.cfg[4] = batches_per_wg * (v3d->compute_num_workgroups[0] *
                                          v3d->compute_num_workgroups[1] *
                                          v3d->compute_num_workgroups[2]) - 1;

        /* Make sure we didn't accidentally underflow. */
        assert(submit.cfg[4] != ~0);

        /* cfg[5]: shader code address plus execution-mode flags. */
        v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
        submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
                         v3d->prog.compute->offset);
        submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
        if (v3d->prog.compute->prog_data.base->single_seg)
                submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
        if (v3d->prog.compute->prog_data.base->threads == 4)
                submit.cfg[5] |= V3D_CSD_CFG5_THREADING;

        if (v3d->prog.compute->prog_data.compute->shared_size) {
                v3d->compute_shared_memory =
                        v3d_bo_alloc(v3d->screen,
                                     v3d->prog.compute->prog_data.compute->shared_size *
                                     wgs_per_sg,
                                     "shared_vars");
        }

        /* cfg[6]: uniforms address. */
        struct v3d_cl_reloc uniforms = v3d_write_uniforms(v3d, job,
                                                          v3d->prog.compute,
                                                          PIPE_SHADER_COMPUTE);
        v3d_job_add_bo(job, uniforms.bo);
        submit.cfg[6] = uniforms.bo->offset + uniforms.offset;

        /* Pull some job state that was stored in a SUBMIT_CL struct out to
         * our SUBMIT_CSD struct
         */
        submit.bo_handles = job->submit.bo_handles;
        submit.bo_handle_count = job->submit.bo_handle_count;

        /* Serialize this in the rest of our command stream. */
        submit.in_sync = v3d->out_sync;
        submit.out_sync = v3d->out_sync;

        if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) {
                int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD,
                                    &submit);
                static bool warned = false;
                if (ret && !warned) {
                        fprintf(stderr, "CSD submit call returned %s. "
                                "Expect corruption.\n", strerror(errno));
                        warned = true;
                }
        }

        v3d_job_free(v3d, job);

        /* Mark SSBOs as being written.. we don't actually know which ones are
         * read vs written, so just assume the worst
         */
        foreach_bit(i, v3d->ssbo[PIPE_SHADER_COMPUTE].enabled_mask) {
                struct v3d_resource *rsc = v3d_resource(
                        v3d->ssbo[PIPE_SHADER_COMPUTE].sb[i].buffer);
                rsc->writes++; /* XXX */
        }

        foreach_bit(i, v3d->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) {
                struct v3d_resource *rsc = v3d_resource(
                        v3d->shaderimg[PIPE_SHADER_COMPUTE].si[i].base.resource);
                rsc->writes++;
        }

        v3d_bo_unreference(&uniforms.bo);
        v3d_bo_unreference(&v3d->compute_shared_memory);
}
#endif /* V3D_VERSION >= 41 */
/**
 * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles.
 *
 * Fallback path used when the TLB fast clear in v3d_tlb_clear() couldn't
 * handle some of the requested buffers.
 */
static void
v3d_draw_clear(struct v3d_context *v3d,
               unsigned buffers,
               const union pipe_color_union *color,
               double depth, unsigned stencil)
{
        static const union pipe_color_union dummy_color = {};

        /* The blitter util dereferences the color regardless, even though the
         * gallium clear API may not pass one in when only Z/S are cleared.
         */
        if (!color)
                color = &dummy_color;

        v3d_blitter_save(v3d);
        util_blitter_clear(v3d->blitter,
                           v3d->framebuffer.width,
                           v3d->framebuffer.height,
                           util_framebuffer_get_num_layers(&v3d->framebuffer),
                           buffers, color, depth, stencil,
                           util_framebuffer_get_num_samples(&v3d->framebuffer) > 1);
}
/**
 * Attempts to perform the GL clear by using the TLB's fast clear at the start
 * of the frame.
 *
 * Returns the subset of \p buffers that was successfully cleared; the caller
 * must clear the remainder by drawing (v3d_draw_clear()).
 */
static unsigned
v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
              const union pipe_color_union *color,
              double depth, unsigned stencil)
{
        struct v3d_context *v3d = job->v3d;

        if (job->draw_calls_queued) {
                /* If anything in the CL has drawn using the buffer, then the
                 * TLB clear we're trying to add now would happen before that
                 * drawing.
                 */
                buffers &= ~(job->load | job->store);
        }

        /* GFXH-1461: If we were to emit a load of just depth or just stencil,
         * then the clear for the other may get lost. We need to decide now
         * if it would be possible to need to emit a load of just one after
         * we've set up our TLB clears.
         */
        if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
            (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
            job->zsbuf &&
            util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
                buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
        }

        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
                if (!(buffers & bit))
                        continue;

                struct pipe_surface *psurf = v3d->framebuffer.cbufs[i];
                struct v3d_surface *surf = v3d_surface(psurf);
                struct v3d_resource *rsc = v3d_resource(psurf->texture);

                union util_color uc;
                uint32_t internal_size = 4 << surf->internal_bpp;

                /* Swizzle R and B for BGRA-style render targets before
                 * packing the clear color.
                 */
                static union pipe_color_union swapped_color;
                if (v3d->swap_color_rb & (1 << i)) {
                        swapped_color.f[0] = color->f[2];
                        swapped_color.f[1] = color->f[1];
                        swapped_color.f[2] = color->f[0];
                        swapped_color.f[3] = color->f[3];
                        color = &swapped_color;
                }

                /* Pack the clear color into the TLB's internal format for
                 * this surface.
                 */
                switch (surf->internal_type) {
                case V3D_INTERNAL_TYPE_8:
                        util_pack_color(color->f, PIPE_FORMAT_R8G8B8A8_UNORM,
                                        &uc);
                        memcpy(job->clear_color[i], uc.ui, internal_size);
                        break;
                case V3D_INTERNAL_TYPE_8I:
                case V3D_INTERNAL_TYPE_8UI:
                        job->clear_color[i][0] = ((color->ui[0] & 0xff) |
                                                  (color->ui[1] & 0xff) << 8 |
                                                  (color->ui[2] & 0xff) << 16 |
                                                  (color->ui[3] & 0xff) << 24);
                        break;
                case V3D_INTERNAL_TYPE_16F:
                        util_pack_color(color->f, PIPE_FORMAT_R16G16B16A16_FLOAT,
                                        &uc);
                        memcpy(job->clear_color[i], uc.ui, internal_size);
                        break;
                case V3D_INTERNAL_TYPE_16I:
                case V3D_INTERNAL_TYPE_16UI:
                        job->clear_color[i][0] = ((color->ui[0] & 0xffff) |
                                                  color->ui[1] << 16);
                        job->clear_color[i][1] = ((color->ui[2] & 0xffff) |
                                                  color->ui[3] << 16);
                        break;
                case V3D_INTERNAL_TYPE_32F:
                case V3D_INTERNAL_TYPE_32I:
                case V3D_INTERNAL_TYPE_32UI:
                        memcpy(job->clear_color[i], color->ui, internal_size);
                        break;
                }

                rsc->initialized_buffers |= bit;
        }

        unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
        if (zsclear) {
                struct v3d_resource *rsc =
                        v3d_resource(v3d->framebuffer.zsbuf->texture);

                if (zsclear & PIPE_CLEAR_DEPTH)
                        job->clear_z = depth;
                if (zsclear & PIPE_CLEAR_STENCIL)
                        job->clear_s = stencil;

                rsc->initialized_buffers |= zsclear;
        }

        /* A TLB clear covers the whole framebuffer. */
        job->draw_min_x = 0;
        job->draw_min_y = 0;
        job->draw_max_x = v3d->framebuffer.width;
        job->draw_max_y = v3d->framebuffer.height;
        job->clear |= buffers;
        job->store |= buffers;

        v3d_start_draw(v3d);

        return buffers;
}
/* Implements pipe_context::clear: try the TLB fast clear first, then fall
 * back to a draw-based clear for whatever buffers remain.
 */
static void
v3d_clear(struct pipe_context *pctx, unsigned buffers,
          const union pipe_color_union *color, double depth, unsigned stencil)
{
        struct v3d_context *v3d = v3d_context(pctx);
        struct v3d_job *job = v3d_get_job_for_fbo(v3d);

        /* v3d_tlb_clear() returns the buffers it handled; mask them off. */
        buffers &= ~v3d_tlb_clear(job, buffers, color, depth, stencil);

        if (buffers)
                v3d_draw_clear(v3d, buffers, color, depth, stencil);
}
/* pipe_context::clear_render_target hook -- not implemented yet; only logs. */
static void
v3d_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear RT\n");
}
/* pipe_context::clear_depth_stencil hook -- not implemented yet; only logs. */
static void
v3d_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear DS\n");
}
/* Wires the draw/clear (and, where supported, compute dispatch) entry points
 * of this per-version file into the context's vtable.
 */
void
v3dX(draw_init)(struct pipe_context *pctx)
{
        pctx->draw_vbo = v3d_draw_vbo;
        pctx->clear = v3d_clear;
        pctx->clear_render_target = v3d_clear_render_target;
        pctx->clear_depth_stencil = v3d_clear_depth_stencil;
#if V3D_VERSION >= 41
        /* CSD (compute shader dispatch) requires kernel support. */
        if (v3d_context(pctx)->screen->has_csd)
                pctx->launch_grid = v3d_launch_grid;
#endif
}