2 * Copyright (c) 2014 Scott Mansell
3 * Copyright © 2014 Broadcom
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include "util/u_blitter.h"
26 #include "util/u_prim.h"
27 #include "util/format/u_format.h"
28 #include "util/u_pack_color.h"
29 #include "util/u_split_draw.h"
30 #include "util/u_upload_mgr.h"
31 #include "indices/u_primconvert.h"
33 #include "vc4_context.h"
34 #include "vc4_resource.h"
36 #define VC4_HW_2116_COUNT 0x1ef0
39 vc4_get_draw_cl_space(struct vc4_job
*job
, int vert_count
)
41 /* The SW-5891 workaround may cause us to emit multiple shader recs
44 int num_draws
= DIV_ROUND_UP(vert_count
, 65535 - 2) + 1;
46 /* Binner gets our packet state -- vc4_emit.c contents,
47 * and the primitive itself.
49 cl_ensure_space(&job
->bcl
,
50 256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE
+
51 VC4_PACKET_GL_SHADER_STATE_SIZE
) * num_draws
);
53 /* Nothing for rcl -- that's covered by vc4_context.c */
55 /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
56 * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
59 cl_ensure_space(&job
->shader_rec
,
60 (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws
);
62 /* Uniforms are covered by vc4_write_uniforms(). */
64 /* There could be up to 16 textures per stage, plus misc other
67 cl_ensure_space(&job
->bo_handles
, (2 * 16 + 20) * sizeof(uint32_t));
68 cl_ensure_space(&job
->bo_pointers
,
69 (2 * 16 + 20) * sizeof(struct vc4_bo
*));
 * Does the initial binning command list setup for drawing to a given FBO.
76 vc4_start_draw(struct vc4_context
*vc4
)
78 struct vc4_job
*job
= vc4
->job
;
83 vc4_get_draw_cl_space(job
, 0);
85 cl_emit(&job
->bcl
, TILE_BINNING_MODE_CONFIGURATION
, bin
) {
86 bin
.width_in_tiles
= job
->draw_tiles_x
;
87 bin
.height_in_tiles
= job
->draw_tiles_y
;
88 bin
.multisample_mode_4x
= job
->msaa
;
91 /* START_TILE_BINNING resets the statechange counters in the hardware,
92 * which are what is used when a primitive is binned to a tile to
93 * figure out what new state packets need to be written to that tile's
96 cl_emit(&job
->bcl
, START_TILE_BINNING
, start
);
98 /* Reset the current compressed primitives format. This gets modified
99 * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
100 * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
103 cl_emit(&job
->bcl
, PRIMITIVE_LIST_FORMAT
, list
) {
104 list
.data_type
= _16_BIT_INDEX
;
105 list
.primitive_type
= TRIANGLES_LIST
;
108 job
->needs_flush
= true;
109 job
->draw_width
= vc4
->framebuffer
.width
;
110 job
->draw_height
= vc4
->framebuffer
.height
;
114 vc4_predraw_check_textures(struct pipe_context
*pctx
,
115 struct vc4_texture_stateobj
*stage_tex
)
117 struct vc4_context
*vc4
= vc4_context(pctx
);
119 for (int i
= 0; i
< stage_tex
->num_textures
; i
++) {
120 struct vc4_sampler_view
*view
=
121 vc4_sampler_view(stage_tex
->textures
[i
]);
125 if (view
->texture
!= view
->base
.texture
)
126 vc4_update_shadow_baselevel_texture(pctx
, &view
->base
);
128 vc4_flush_jobs_writing_resource(vc4
, view
->texture
);
133 vc4_emit_gl_shader_state(struct vc4_context
*vc4
,
134 const struct pipe_draw_info
*info
,
135 uint32_t extra_index_bias
)
137 struct vc4_job
*job
= vc4
->job
;
138 /* VC4_DIRTY_VTXSTATE */
139 struct vc4_vertex_stateobj
*vtx
= vc4
->vtx
;
140 /* VC4_DIRTY_VTXBUF */
141 struct vc4_vertexbuf_stateobj
*vertexbuf
= &vc4
->vertexbuf
;
143 /* The simulator throws a fit if VS or CS don't read an attribute, so
144 * we emit a dummy read.
146 uint32_t num_elements_emit
= MAX2(vtx
->num_elements
, 1);
148 /* Emit the shader record. */
149 cl_start_shader_reloc(&job
->shader_rec
, 3 + num_elements_emit
);
151 cl_emit(&job
->shader_rec
, SHADER_RECORD
, rec
) {
152 rec
.enable_clipping
= true;
154 /* VC4_DIRTY_COMPILED_FS */
155 rec
.fragment_shader_is_single_threaded
=
156 !vc4
->prog
.fs
->fs_threaded
;
158 /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
159 rec
.point_size_included_in_shaded_vertex_data
=
160 (info
->mode
== PIPE_PRIM_POINTS
&&
161 vc4
->rasterizer
->base
.point_size_per_vertex
);
163 /* VC4_DIRTY_COMPILED_FS */
164 rec
.fragment_shader_number_of_varyings
=
165 vc4
->prog
.fs
->num_inputs
;
166 rec
.fragment_shader_code_address
=
167 cl_address(vc4
->prog
.fs
->bo
, 0);
169 rec
.coordinate_shader_attribute_array_select_bits
=
170 vc4
->prog
.cs
->vattrs_live
;
171 rec
.coordinate_shader_total_attributes_size
=
172 vc4
->prog
.cs
->vattr_offsets
[8];
173 rec
.coordinate_shader_code_address
=
174 cl_address(vc4
->prog
.cs
->bo
, 0);
176 rec
.vertex_shader_attribute_array_select_bits
=
177 vc4
->prog
.vs
->vattrs_live
;
178 rec
.vertex_shader_total_attributes_size
=
179 vc4
->prog
.vs
->vattr_offsets
[8];
180 rec
.vertex_shader_code_address
=
181 cl_address(vc4
->prog
.vs
->bo
, 0);
184 uint32_t max_index
= 0xffff;
185 for (int i
= 0; i
< vtx
->num_elements
; i
++) {
186 struct pipe_vertex_element
*elem
= &vtx
->pipe
[i
];
187 struct pipe_vertex_buffer
*vb
=
188 &vertexbuf
->vb
[elem
->vertex_buffer_index
];
189 struct vc4_resource
*rsc
= vc4_resource(vb
->buffer
.resource
);
190 /* not vc4->dirty tracked: vc4->last_index_bias */
191 uint32_t offset
= (vb
->buffer_offset
+
193 vb
->stride
* (info
->index_bias
+
195 uint32_t vb_size
= rsc
->bo
->size
- offset
;
197 util_format_get_blocksize(elem
->src_format
);
199 cl_emit(&job
->shader_rec
, ATTRIBUTE_RECORD
, attr
) {
200 attr
.address
= cl_address(rsc
->bo
, offset
);
201 attr
.number_of_bytes_minus_1
= elem_size
- 1;
202 attr
.stride
= vb
->stride
;
203 attr
.coordinate_shader_vpm_offset
=
204 vc4
->prog
.cs
->vattr_offsets
[i
];
205 attr
.vertex_shader_vpm_offset
=
206 vc4
->prog
.vs
->vattr_offsets
[i
];
209 if (vb
->stride
> 0) {
210 max_index
= MIN2(max_index
,
211 (vb_size
- elem_size
) / vb
->stride
);
215 if (vtx
->num_elements
== 0) {
216 assert(num_elements_emit
== 1);
217 struct vc4_bo
*bo
= vc4_bo_alloc(vc4
->screen
, 4096, "scratch VBO");
219 cl_emit(&job
->shader_rec
, ATTRIBUTE_RECORD
, attr
) {
220 attr
.address
= cl_address(bo
, 0);
221 attr
.number_of_bytes_minus_1
= 16 - 1;
223 attr
.coordinate_shader_vpm_offset
= 0;
224 attr
.vertex_shader_vpm_offset
= 0;
227 vc4_bo_unreference(&bo
);
230 cl_emit(&job
->bcl
, GL_SHADER_STATE
, shader_state
) {
231 /* Note that number of attributes == 0 in the packet means 8
232 * attributes. This field also contains the offset into
235 assert(vtx
->num_elements
<= 8);
236 shader_state
.number_of_attribute_arrays
=
237 num_elements_emit
& 0x7;
240 vc4_write_uniforms(vc4
, vc4
->prog
.fs
,
241 &vc4
->constbuf
[PIPE_SHADER_FRAGMENT
],
243 vc4_write_uniforms(vc4
, vc4
->prog
.vs
,
244 &vc4
->constbuf
[PIPE_SHADER_VERTEX
],
246 vc4_write_uniforms(vc4
, vc4
->prog
.cs
,
247 &vc4
->constbuf
[PIPE_SHADER_VERTEX
],
250 vc4
->last_index_bias
= info
->index_bias
+ extra_index_bias
;
251 vc4
->max_index
= max_index
;
252 job
->shader_rec_count
++;
256 * HW-2116 workaround: Flush the batch before triggering the hardware state
257 * counter wraparound behavior.
259 * State updates are tracked by a global counter which increments at the first
260 * state update after a draw or a START_BINNING. Tiles can then have their
261 * state updated at draw time with a set of cheap checks for whether the
262 * state's copy of the global counter matches the global counter the last time
263 * that state was written to the tile.
265 * The state counters are relatively small and wrap around quickly, so you
266 * could get false negatives for needing to update a particular state in the
267 * tile. To avoid this, the hardware attempts to write all of the state in
268 * the tile at wraparound time. This apparently is broken, so we just flush
269 * everything before that behavior is triggered. A batch flush is sufficient
270 * to get our current contents drawn and reset the counters to 0.
272 * Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the
273 * tiles with VC4_PACKET_RETURN_FROM_LIST.
276 vc4_hw_2116_workaround(struct pipe_context
*pctx
, int vert_count
)
278 struct vc4_context
*vc4
= vc4_context(pctx
);
279 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
281 if (job
->draw_calls_queued
+ vert_count
/ 65535 >= VC4_HW_2116_COUNT
) {
282 perf_debug("Flushing batch due to HW-2116 workaround "
283 "(too many draw calls per scene\n");
284 vc4_job_submit(vc4
, job
);
289 vc4_draw_vbo(struct pipe_context
*pctx
, const struct pipe_draw_info
*info
)
291 struct vc4_context
*vc4
= vc4_context(pctx
);
292 struct pipe_draw_info local_info
;
294 if (!info
->count_from_stream_output
&& !info
->indirect
&&
295 !info
->primitive_restart
&&
296 !u_trim_pipe_prim(info
->mode
, (unsigned*)&info
->count
))
299 if (info
->mode
>= PIPE_PRIM_QUADS
) {
300 if (info
->mode
== PIPE_PRIM_QUADS
&&
302 !vc4
->rasterizer
->base
.flatshade
) {
304 local_info
.mode
= PIPE_PRIM_TRIANGLE_FAN
;
307 util_primconvert_save_rasterizer_state(vc4
->primconvert
, &vc4
->rasterizer
->base
);
308 util_primconvert_draw_vbo(vc4
->primconvert
, info
);
309 perf_debug("Fallback conversion for %d %s vertices\n",
310 info
->count
, u_prim_name(info
->mode
));
315 /* Before setting up the draw, do any fixup blits necessary. */
316 vc4_predraw_check_textures(pctx
, &vc4
->verttex
);
317 vc4_predraw_check_textures(pctx
, &vc4
->fragtex
);
319 vc4_hw_2116_workaround(pctx
, info
->count
);
321 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
323 /* Make sure that the raster order flags haven't changed, which can
324 * only be set at job granularity.
326 if (job
->flags
!= vc4
->rasterizer
->tile_raster_order_flags
) {
327 vc4_job_submit(vc4
, job
);
328 job
= vc4_get_job_for_fbo(vc4
);
331 vc4_get_draw_cl_space(job
, info
->count
);
333 if (vc4
->prim_mode
!= info
->mode
) {
334 vc4
->prim_mode
= info
->mode
;
335 vc4
->dirty
|= VC4_DIRTY_PRIM_MODE
;
339 if (!vc4_update_compiled_shaders(vc4
, info
->mode
)) {
340 debug_warn_once("shader compile failed, skipping draw call.\n");
344 vc4_emit_state(pctx
);
346 bool needs_drawarrays_shader_state
= false;
348 if ((vc4
->dirty
& (VC4_DIRTY_VTXBUF
|
350 VC4_DIRTY_PRIM_MODE
|
351 VC4_DIRTY_RASTERIZER
|
352 VC4_DIRTY_COMPILED_CS
|
353 VC4_DIRTY_COMPILED_VS
|
354 VC4_DIRTY_COMPILED_FS
|
355 vc4
->prog
.cs
->uniform_dirty_bits
|
356 vc4
->prog
.vs
->uniform_dirty_bits
|
357 vc4
->prog
.fs
->uniform_dirty_bits
)) ||
358 vc4
->last_index_bias
!= info
->index_bias
) {
359 if (info
->index_size
)
360 vc4_emit_gl_shader_state(vc4
, info
, 0);
362 needs_drawarrays_shader_state
= true;
367 /* Note that the primitive type fields match with OpenGL/gallium
368 * definitions, up to but not including QUADS.
370 if (info
->index_size
) {
371 uint32_t index_size
= info
->index_size
;
372 uint32_t offset
= info
->start
* index_size
;
373 struct pipe_resource
*prsc
;
374 if (info
->index_size
== 4) {
375 prsc
= vc4_get_shadow_index_buffer(pctx
, info
,
377 info
->count
, &offset
);
380 if (info
->has_user_indices
) {
382 u_upload_data(vc4
->uploader
, 0,
383 info
->count
* index_size
, 4,
387 prsc
= info
->index
.resource
;
390 struct vc4_resource
*rsc
= vc4_resource(prsc
);
392 struct vc4_cl_out
*bcl
= cl_start(&job
->bcl
);
394 /* The original design for the VC4 kernel UABI had multiple
395 * packets that used relocations in the BCL (some of which
396 * needed two BOs), but later modifications eliminated all but
397 * this one usage. We have an arbitrary 32-bit offset value,
398 * and need to also supply an arbitrary 32-bit index buffer
399 * GEM handle, so we have this fake packet we emit in our BCL
400 * to be validated, which the kernel uses at validation time
401 * to perform the relocation in the IB packet (without
402 * emitting to the actual HW).
404 uint32_t hindex
= vc4_gem_hindex(job
, rsc
->bo
);
405 if (job
->last_gem_handle_hindex
!= hindex
) {
406 cl_u8(&bcl
, VC4_PACKET_GEM_HANDLES
);
407 cl_u32(&bcl
, hindex
);
409 job
->last_gem_handle_hindex
= hindex
;
412 cl_u8(&bcl
, VC4_PACKET_GL_INDEXED_PRIMITIVE
);
416 VC4_INDEX_BUFFER_U16
:
417 VC4_INDEX_BUFFER_U8
));
418 cl_u32(&bcl
, info
->count
);
419 cl_u32(&bcl
, offset
);
420 cl_u32(&bcl
, vc4
->max_index
);
422 cl_end(&job
->bcl
, bcl
);
423 job
->draw_calls_queued
++;
425 if (info
->index_size
== 4 || info
->has_user_indices
)
426 pipe_resource_reference(&prsc
, NULL
);
428 uint32_t count
= info
->count
;
429 uint32_t start
= info
->start
;
430 uint32_t extra_index_bias
= 0;
431 static const uint32_t max_verts
= 65535;
433 /* GFXH-515 / SW-5891: The binner emits 16 bit indices for
434 * drawarrays, which means that if start + count > 64k it
435 * would truncate the top bits. Work around this by emitting
436 * a limited number of primitives at a time and reemitting the
437 * shader state pointing farther down the vertex attribute
440 * To do this properly for line loops or trifans, we'd need to
441 * make a new VB containing the first vertex plus whatever
444 if (start
+ count
> max_verts
) {
445 extra_index_bias
= start
;
447 needs_drawarrays_shader_state
= true;
451 uint32_t this_count
= count
;
454 if (needs_drawarrays_shader_state
) {
455 vc4_emit_gl_shader_state(vc4
, info
,
459 u_split_draw(info
, max_verts
, &this_count
, &step
);
461 cl_emit(&job
->bcl
, VERTEX_ARRAY_PRIMITIVES
, array
) {
462 array
.primitive_mode
= info
->mode
;
463 array
.length
= this_count
;
464 array
.index_of_first_vertex
= start
;
466 job
->draw_calls_queued
++;
469 extra_index_bias
+= start
+ step
;
471 needs_drawarrays_shader_state
= true;
475 /* We shouldn't have tripped the HW_2116 bug with the GFXH-515
478 assert(job
->draw_calls_queued
<= VC4_HW_2116_COUNT
);
480 if (vc4
->zsa
&& vc4
->framebuffer
.zsbuf
) {
481 struct vc4_resource
*rsc
=
482 vc4_resource(vc4
->framebuffer
.zsbuf
->texture
);
484 if (vc4
->zsa
->base
.depth
.enabled
) {
485 job
->resolve
|= PIPE_CLEAR_DEPTH
;
486 rsc
->initialized_buffers
= PIPE_CLEAR_DEPTH
;
489 if (vc4
->zsa
->base
.stencil
[0].enabled
) {
490 job
->resolve
|= PIPE_CLEAR_STENCIL
;
491 rsc
->initialized_buffers
|= PIPE_CLEAR_STENCIL
;
495 job
->resolve
|= PIPE_CLEAR_COLOR0
;
497 /* If we've used half of the presumably 256MB CMA area, flush the job
498 * so that we don't accumulate a job that will end up not being
501 if (job
->bo_space
> 128 * 1024 * 1024)
504 if (vc4_debug
& VC4_DEBUG_ALWAYS_FLUSH
)
509 pack_rgba(enum pipe_format format
, const float *rgba
)
512 util_pack_color(rgba
, format
, &uc
);
513 if (util_format_get_blocksize(format
) == 2)
520 vc4_clear(struct pipe_context
*pctx
, unsigned buffers
, const struct pipe_scissor_state
*scissor_state
,
521 const union pipe_color_union
*color
, double depth
, unsigned stencil
)
523 struct vc4_context
*vc4
= vc4_context(pctx
);
524 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
526 if (buffers
& PIPE_CLEAR_DEPTHSTENCIL
) {
527 struct vc4_resource
*rsc
=
528 vc4_resource(vc4
->framebuffer
.zsbuf
->texture
);
529 unsigned zsclear
= buffers
& PIPE_CLEAR_DEPTHSTENCIL
;
531 /* Clearing ZS will clear both Z and stencil, so if we're
532 * trying to clear just one then we need to draw a quad to do
533 * it instead. We need to do this before setting up
534 * tile-based clears in vc4->job, because the blitter may
535 * submit the current job.
537 if ((zsclear
== PIPE_CLEAR_DEPTH
||
538 zsclear
== PIPE_CLEAR_STENCIL
) &&
539 (rsc
->initialized_buffers
& ~(zsclear
| job
->cleared
)) &&
540 util_format_is_depth_and_stencil(vc4
->framebuffer
.zsbuf
->format
)) {
541 static const union pipe_color_union dummy_color
= {};
543 perf_debug("Partial clear of Z+stencil buffer, "
544 "drawing a quad instead of fast clearing\n");
545 vc4_blitter_save(vc4
);
546 util_blitter_clear(vc4
->blitter
,
547 vc4
->framebuffer
.width
,
548 vc4
->framebuffer
.height
,
551 &dummy_color
, depth
, stencil
,
556 job
= vc4_get_job_for_fbo(vc4
);
560 /* We can't flag new buffers for clearing once we've queued draws. We
561 * could avoid this by using the 3d engine to clear.
563 if (job
->draw_calls_queued
) {
564 perf_debug("Flushing rendering to process new clear.\n");
565 vc4_job_submit(vc4
, job
);
566 job
= vc4_get_job_for_fbo(vc4
);
569 if (buffers
& PIPE_CLEAR_COLOR0
) {
570 struct vc4_resource
*rsc
=
571 vc4_resource(vc4
->framebuffer
.cbufs
[0]->texture
);
572 uint32_t clear_color
;
574 if (vc4_rt_format_is_565(vc4
->framebuffer
.cbufs
[0]->format
)) {
575 /* In 565 mode, the hardware will be packing our color
578 clear_color
= pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM
,
581 /* Otherwise, we need to do this packing because we
582 * support multiple swizzlings of RGBA8888.
585 pack_rgba(vc4
->framebuffer
.cbufs
[0]->format
,
588 job
->clear_color
[0] = job
->clear_color
[1] = clear_color
;
589 rsc
->initialized_buffers
|= (buffers
& PIPE_CLEAR_COLOR0
);
592 if (buffers
& PIPE_CLEAR_DEPTHSTENCIL
) {
593 struct vc4_resource
*rsc
=
594 vc4_resource(vc4
->framebuffer
.zsbuf
->texture
);
596 /* Though the depth buffer is stored with Z in the high 24,
597 * for this field we just need to store it in the low 24.
599 if (buffers
& PIPE_CLEAR_DEPTH
) {
600 job
->clear_depth
= util_pack_z(PIPE_FORMAT_Z24X8_UNORM
,
603 if (buffers
& PIPE_CLEAR_STENCIL
)
604 job
->clear_stencil
= stencil
;
606 rsc
->initialized_buffers
|= (buffers
& PIPE_CLEAR_DEPTHSTENCIL
);
611 job
->draw_max_x
= vc4
->framebuffer
.width
;
612 job
->draw_max_y
= vc4
->framebuffer
.height
;
613 job
->cleared
|= buffers
;
614 job
->resolve
|= buffers
;
/* pipe_context::clear_render_target — not implemented on vc4. */
static void
vc4_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear RT\n");
}
/* pipe_context::clear_depth_stencil — not implemented on vc4. */
static void
vc4_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear DS\n");
}
638 vc4_draw_init(struct pipe_context
*pctx
)
640 pctx
->draw_vbo
= vc4_draw_vbo
;
641 pctx
->clear
= vc4_clear
;
642 pctx
->clear_render_target
= vc4_clear_render_target
;
643 pctx
->clear_depth_stencil
= vc4_clear_depth_stencil
;