/*
 * Copyright (c) 2014 Scott Mansell
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
25 #include "util/u_blitter.h"
26 #include "util/u_prim.h"
27 #include "util/u_format.h"
28 #include "util/u_pack_color.h"
29 #include "util/u_upload_mgr.h"
30 #include "indices/u_primconvert.h"
32 #include "vc4_context.h"
33 #include "vc4_resource.h"
35 #define VC4_HW_2116_COUNT 0x1ef0
38 vc4_get_draw_cl_space(struct vc4_job
*job
, int vert_count
)
40 /* The SW-5891 workaround may cause us to emit multiple shader recs
43 int num_draws
= DIV_ROUND_UP(vert_count
, 65535) + 1;
45 /* Binner gets our packet state -- vc4_emit.c contents,
46 * and the primitive itself.
48 cl_ensure_space(&job
->bcl
,
49 256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE
+
50 VC4_PACKET_GL_SHADER_STATE_SIZE
) * num_draws
);
52 /* Nothing for rcl -- that's covered by vc4_context.c */
54 /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
55 * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
58 cl_ensure_space(&job
->shader_rec
,
59 (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws
);
61 /* Uniforms are covered by vc4_write_uniforms(). */
63 /* There could be up to 16 textures per stage, plus misc other
66 cl_ensure_space(&job
->bo_handles
, (2 * 16 + 20) * sizeof(uint32_t));
67 cl_ensure_space(&job
->bo_pointers
,
68 (2 * 16 + 20) * sizeof(struct vc4_bo
*));
/*
 * Does the initial binning command list setup for drawing to a given FBO.
 */
75 vc4_start_draw(struct vc4_context
*vc4
)
77 struct vc4_job
*job
= vc4
->job
;
82 vc4_get_draw_cl_space(job
, 0);
84 struct vc4_cl_out
*bcl
= cl_start(&job
->bcl
);
85 // Tile state data is 48 bytes per tile, I think it can be thrown away
86 // as soon as binning is finished.
87 cl_u8(&bcl
, VC4_PACKET_TILE_BINNING_MODE_CONFIG
);
88 cl_u32(&bcl
, 0); /* tile alloc addr, filled by kernel */
89 cl_u32(&bcl
, 0); /* tile alloc size, filled by kernel */
90 cl_u32(&bcl
, 0); /* tile state addr, filled by kernel */
91 cl_u8(&bcl
, job
->draw_tiles_x
);
92 cl_u8(&bcl
, job
->draw_tiles_y
);
93 /* Other flags are filled by kernel. */
94 cl_u8(&bcl
, job
->msaa
? VC4_BIN_CONFIG_MS_MODE_4X
: 0);
96 /* START_TILE_BINNING resets the statechange counters in the hardware,
97 * which are what is used when a primitive is binned to a tile to
98 * figure out what new state packets need to be written to that tile's
101 cl_u8(&bcl
, VC4_PACKET_START_TILE_BINNING
);
103 /* Reset the current compressed primitives format. This gets modified
104 * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
105 * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
108 cl_u8(&bcl
, VC4_PACKET_PRIMITIVE_LIST_FORMAT
);
109 cl_u8(&bcl
, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX
|
110 VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES
));
112 job
->needs_flush
= true;
113 job
->draw_width
= vc4
->framebuffer
.width
;
114 job
->draw_height
= vc4
->framebuffer
.height
;
116 cl_end(&job
->bcl
, bcl
);
120 vc4_predraw_check_textures(struct pipe_context
*pctx
,
121 struct vc4_texture_stateobj
*stage_tex
)
123 struct vc4_context
*vc4
= vc4_context(pctx
);
125 for (int i
= 0; i
< stage_tex
->num_textures
; i
++) {
126 struct pipe_sampler_view
*view
= stage_tex
->textures
[i
];
129 struct vc4_resource
*rsc
= vc4_resource(view
->texture
);
130 if (rsc
->shadow_parent
)
131 vc4_update_shadow_baselevel_texture(pctx
, view
);
133 vc4_flush_jobs_writing_resource(vc4
, view
->texture
);
138 vc4_emit_gl_shader_state(struct vc4_context
*vc4
,
139 const struct pipe_draw_info
*info
,
140 uint32_t extra_index_bias
)
142 struct vc4_job
*job
= vc4
->job
;
143 /* VC4_DIRTY_VTXSTATE */
144 struct vc4_vertex_stateobj
*vtx
= vc4
->vtx
;
145 /* VC4_DIRTY_VTXBUF */
146 struct vc4_vertexbuf_stateobj
*vertexbuf
= &vc4
->vertexbuf
;
148 /* The simulator throws a fit if VS or CS don't read an attribute, so
149 * we emit a dummy read.
151 uint32_t num_elements_emit
= MAX2(vtx
->num_elements
, 1);
152 /* Emit the shader record. */
153 struct vc4_cl_out
*shader_rec
=
154 cl_start_shader_reloc(&job
->shader_rec
, 3 + num_elements_emit
);
155 /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
157 VC4_SHADER_FLAG_ENABLE_CLIPPING
|
158 VC4_SHADER_FLAG_FS_SINGLE_THREAD
|
159 ((info
->mode
== PIPE_PRIM_POINTS
&&
160 vc4
->rasterizer
->base
.point_size_per_vertex
) ?
161 VC4_SHADER_FLAG_VS_POINT_SIZE
: 0));
163 /* VC4_DIRTY_COMPILED_FS */
164 cl_u8(&shader_rec
, 0); /* fs num uniforms (unused) */
165 cl_u8(&shader_rec
, vc4
->prog
.fs
->num_inputs
);
166 cl_reloc(job
, &job
->shader_rec
, &shader_rec
, vc4
->prog
.fs
->bo
, 0);
167 cl_u32(&shader_rec
, 0); /* UBO offset written by kernel */
169 /* VC4_DIRTY_COMPILED_VS */
170 cl_u16(&shader_rec
, 0); /* vs num uniforms */
171 cl_u8(&shader_rec
, vc4
->prog
.vs
->vattrs_live
);
172 cl_u8(&shader_rec
, vc4
->prog
.vs
->vattr_offsets
[8]);
173 cl_reloc(job
, &job
->shader_rec
, &shader_rec
, vc4
->prog
.vs
->bo
, 0);
174 cl_u32(&shader_rec
, 0); /* UBO offset written by kernel */
176 /* VC4_DIRTY_COMPILED_CS */
177 cl_u16(&shader_rec
, 0); /* cs num uniforms */
178 cl_u8(&shader_rec
, vc4
->prog
.cs
->vattrs_live
);
179 cl_u8(&shader_rec
, vc4
->prog
.cs
->vattr_offsets
[8]);
180 cl_reloc(job
, &job
->shader_rec
, &shader_rec
, vc4
->prog
.cs
->bo
, 0);
181 cl_u32(&shader_rec
, 0); /* UBO offset written by kernel */
183 uint32_t max_index
= 0xffff;
184 for (int i
= 0; i
< vtx
->num_elements
; i
++) {
185 struct pipe_vertex_element
*elem
= &vtx
->pipe
[i
];
186 struct pipe_vertex_buffer
*vb
=
187 &vertexbuf
->vb
[elem
->vertex_buffer_index
];
188 struct vc4_resource
*rsc
= vc4_resource(vb
->buffer
);
189 /* not vc4->dirty tracked: vc4->last_index_bias */
190 uint32_t offset
= (vb
->buffer_offset
+
192 vb
->stride
* (info
->index_bias
+
194 uint32_t vb_size
= rsc
->bo
->size
- offset
;
196 util_format_get_blocksize(elem
->src_format
);
198 cl_reloc(job
, &job
->shader_rec
, &shader_rec
, rsc
->bo
, offset
);
199 cl_u8(&shader_rec
, elem_size
- 1);
200 cl_u8(&shader_rec
, vb
->stride
);
201 cl_u8(&shader_rec
, vc4
->prog
.vs
->vattr_offsets
[i
]);
202 cl_u8(&shader_rec
, vc4
->prog
.cs
->vattr_offsets
[i
]);
204 if (vb
->stride
> 0) {
205 max_index
= MIN2(max_index
,
206 (vb_size
- elem_size
) / vb
->stride
);
210 if (vtx
->num_elements
== 0) {
211 assert(num_elements_emit
== 1);
212 struct vc4_bo
*bo
= vc4_bo_alloc(vc4
->screen
, 4096, "scratch VBO");
213 cl_reloc(job
, &job
->shader_rec
, &shader_rec
, bo
, 0);
214 cl_u8(&shader_rec
, 16 - 1); /* element size */
215 cl_u8(&shader_rec
, 0); /* stride */
216 cl_u8(&shader_rec
, 0); /* VS VPM offset */
217 cl_u8(&shader_rec
, 0); /* CS VPM offset */
218 vc4_bo_unreference(&bo
);
220 cl_end(&job
->shader_rec
, shader_rec
);
222 struct vc4_cl_out
*bcl
= cl_start(&job
->bcl
);
223 /* the actual draw call. */
224 cl_u8(&bcl
, VC4_PACKET_GL_SHADER_STATE
);
225 assert(vtx
->num_elements
<= 8);
226 /* Note that number of attributes == 0 in the packet means 8
227 * attributes. This field also contains the offset into shader_rec.
229 cl_u32(&bcl
, num_elements_emit
& 0x7);
230 cl_end(&job
->bcl
, bcl
);
232 vc4_write_uniforms(vc4
, vc4
->prog
.fs
,
233 &vc4
->constbuf
[PIPE_SHADER_FRAGMENT
],
235 vc4_write_uniforms(vc4
, vc4
->prog
.vs
,
236 &vc4
->constbuf
[PIPE_SHADER_VERTEX
],
238 vc4_write_uniforms(vc4
, vc4
->prog
.cs
,
239 &vc4
->constbuf
[PIPE_SHADER_VERTEX
],
242 vc4
->last_index_bias
= info
->index_bias
+ extra_index_bias
;
243 vc4
->max_index
= max_index
;
244 job
->shader_rec_count
++;
/*
 * HW-2116 workaround: Flush the batch before triggering the hardware state
 * counter wraparound behavior.
 *
 * State updates are tracked by a global counter which increments at the first
 * state update after a draw or a START_BINNING.  Tiles can then have their
 * state updated at draw time with a set of cheap checks for whether the
 * state's copy of the global counter matches the global counter the last time
 * that state was written to the tile.
 *
 * The state counters are relatively small and wrap around quickly, so you
 * could get false negatives for needing to update a particular state in the
 * tile.  To avoid this, the hardware attempts to write all of the state in
 * the tile at wraparound time.  This apparently is broken, so we just flush
 * everything before that behavior is triggered.  A batch flush is sufficient
 * to get our current contents drawn and reset the counters to 0.
 *
 * Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the
 * tiles with VC4_PACKET_RETURN_FROM_LIST.
 */
268 vc4_hw_2116_workaround(struct pipe_context
*pctx
, int vert_count
)
270 struct vc4_context
*vc4
= vc4_context(pctx
);
271 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
273 if (job
->draw_calls_queued
+ vert_count
/ 65535 >= VC4_HW_2116_COUNT
) {
274 perf_debug("Flushing batch due to HW-2116 workaround "
275 "(too many draw calls per scene\n");
276 vc4_job_submit(vc4
, job
);
281 vc4_draw_vbo(struct pipe_context
*pctx
, const struct pipe_draw_info
*info
)
283 struct vc4_context
*vc4
= vc4_context(pctx
);
285 if (info
->mode
>= PIPE_PRIM_QUADS
) {
286 util_primconvert_save_index_buffer(vc4
->primconvert
, &vc4
->indexbuf
);
287 util_primconvert_save_rasterizer_state(vc4
->primconvert
, &vc4
->rasterizer
->base
);
288 util_primconvert_draw_vbo(vc4
->primconvert
, info
);
289 perf_debug("Fallback conversion for %d %s vertices\n",
290 info
->count
, u_prim_name(info
->mode
));
294 /* Before setting up the draw, do any fixup blits necessary. */
295 vc4_predraw_check_textures(pctx
, &vc4
->verttex
);
296 vc4_predraw_check_textures(pctx
, &vc4
->fragtex
);
298 vc4_hw_2116_workaround(pctx
, info
->count
);
300 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
302 vc4_get_draw_cl_space(job
, info
->count
);
304 if (vc4
->prim_mode
!= info
->mode
) {
305 vc4
->prim_mode
= info
->mode
;
306 vc4
->dirty
|= VC4_DIRTY_PRIM_MODE
;
310 if (!vc4_update_compiled_shaders(vc4
, info
->mode
)) {
311 debug_warn_once("shader compile failed, skipping draw call.\n");
315 vc4_emit_state(pctx
);
317 if ((vc4
->dirty
& (VC4_DIRTY_VTXBUF
|
319 VC4_DIRTY_PRIM_MODE
|
320 VC4_DIRTY_RASTERIZER
|
321 VC4_DIRTY_COMPILED_CS
|
322 VC4_DIRTY_COMPILED_VS
|
323 VC4_DIRTY_COMPILED_FS
|
324 vc4
->prog
.cs
->uniform_dirty_bits
|
325 vc4
->prog
.vs
->uniform_dirty_bits
|
326 vc4
->prog
.fs
->uniform_dirty_bits
)) ||
327 vc4
->last_index_bias
!= info
->index_bias
) {
328 vc4_emit_gl_shader_state(vc4
, info
, 0);
333 /* Note that the primitive type fields match with OpenGL/gallium
334 * definitions, up to but not including QUADS.
336 struct vc4_cl_out
*bcl
= cl_start(&job
->bcl
);
338 uint32_t offset
= vc4
->indexbuf
.offset
;
339 uint32_t index_size
= vc4
->indexbuf
.index_size
;
340 struct pipe_resource
*prsc
;
341 if (vc4
->indexbuf
.index_size
== 4) {
342 prsc
= vc4_get_shadow_index_buffer(pctx
, &vc4
->indexbuf
,
343 info
->count
, &offset
);
346 if (vc4
->indexbuf
.user_buffer
) {
348 u_upload_data(vc4
->uploader
, 0,
349 info
->count
* index_size
, 4,
350 vc4
->indexbuf
.user_buffer
,
353 prsc
= vc4
->indexbuf
.buffer
;
356 struct vc4_resource
*rsc
= vc4_resource(prsc
);
358 cl_start_reloc(&job
->bcl
, &bcl
, 1);
359 cl_u8(&bcl
, VC4_PACKET_GL_INDEXED_PRIMITIVE
);
363 VC4_INDEX_BUFFER_U16
:
364 VC4_INDEX_BUFFER_U8
));
365 cl_u32(&bcl
, info
->count
);
366 cl_reloc(job
, &job
->bcl
, &bcl
, rsc
->bo
, offset
);
367 cl_u32(&bcl
, vc4
->max_index
);
368 job
->draw_calls_queued
++;
370 if (vc4
->indexbuf
.index_size
== 4 || vc4
->indexbuf
.user_buffer
)
371 pipe_resource_reference(&prsc
, NULL
);
373 uint32_t count
= info
->count
;
374 uint32_t start
= info
->start
;
375 uint32_t extra_index_bias
= 0;
378 uint32_t this_count
= count
;
379 uint32_t step
= count
;
380 static const uint32_t max_verts
= 65535;
382 /* GFXH-515 / SW-5891: The binner emits 16 bit indices
383 * for drawarrays, which means that if start + count >
384 * 64k it would truncate the top bits. Work around
385 * this by emitting a limited number of primitives at
386 * a time and reemitting the shader state pointing
387 * farther down the vertex attribute arrays.
389 * To do this properly for line loops or trifans, we'd
390 * need to make a new VB containing the first vertex
391 * plus whatever remainder.
393 if (extra_index_bias
) {
394 cl_end(&job
->bcl
, bcl
);
395 vc4_emit_gl_shader_state(vc4
, info
,
397 bcl
= cl_start(&job
->bcl
);
400 if (start
+ count
> max_verts
) {
401 switch (info
->mode
) {
402 case PIPE_PRIM_POINTS
:
403 this_count
= step
= max_verts
;
405 case PIPE_PRIM_LINES
:
406 this_count
= step
= max_verts
- (max_verts
% 2);
408 case PIPE_PRIM_LINE_STRIP
:
409 this_count
= max_verts
;
410 step
= max_verts
- 1;
412 case PIPE_PRIM_LINE_LOOP
:
413 this_count
= max_verts
;
414 step
= max_verts
- 1;
415 debug_warn_once("unhandled line loop "
416 "looping behavior with "
419 case PIPE_PRIM_TRIANGLES
:
420 this_count
= step
= max_verts
- (max_verts
% 3);
422 case PIPE_PRIM_TRIANGLE_STRIP
:
423 this_count
= max_verts
;
424 step
= max_verts
- 2;
427 debug_warn_once("unhandled primitive "
428 "max vert count, truncating\n");
429 this_count
= step
= max_verts
;
433 cl_u8(&bcl
, VC4_PACKET_GL_ARRAY_PRIMITIVE
);
434 cl_u8(&bcl
, info
->mode
);
435 cl_u32(&bcl
, this_count
);
437 job
->draw_calls_queued
++;
440 extra_index_bias
+= start
+ step
;
444 cl_end(&job
->bcl
, bcl
);
446 /* We shouldn't have tripped the HW_2116 bug with the GFXH-515
449 assert(job
->draw_calls_queued
<= VC4_HW_2116_COUNT
);
451 if (vc4
->zsa
&& vc4
->framebuffer
.zsbuf
) {
452 struct vc4_resource
*rsc
=
453 vc4_resource(vc4
->framebuffer
.zsbuf
->texture
);
455 if (vc4
->zsa
->base
.depth
.enabled
) {
456 job
->resolve
|= PIPE_CLEAR_DEPTH
;
457 rsc
->initialized_buffers
= PIPE_CLEAR_DEPTH
;
460 if (vc4
->zsa
->base
.stencil
[0].enabled
) {
461 job
->resolve
|= PIPE_CLEAR_STENCIL
;
462 rsc
->initialized_buffers
|= PIPE_CLEAR_STENCIL
;
466 job
->resolve
|= PIPE_CLEAR_COLOR0
;
468 if (vc4_debug
& VC4_DEBUG_ALWAYS_FLUSH
)
473 pack_rgba(enum pipe_format format
, const float *rgba
)
476 util_pack_color(rgba
, format
, &uc
);
477 if (util_format_get_blocksize(format
) == 2)
484 vc4_clear(struct pipe_context
*pctx
, unsigned buffers
,
485 const union pipe_color_union
*color
, double depth
, unsigned stencil
)
487 struct vc4_context
*vc4
= vc4_context(pctx
);
488 struct vc4_job
*job
= vc4_get_job_for_fbo(vc4
);
490 /* We can't flag new buffers for clearing once we've queued draws. We
491 * could avoid this by using the 3d engine to clear.
493 if (job
->draw_calls_queued
) {
494 perf_debug("Flushing rendering to process new clear.\n");
495 vc4_job_submit(vc4
, job
);
496 job
= vc4_get_job_for_fbo(vc4
);
499 if (buffers
& PIPE_CLEAR_COLOR0
) {
500 struct vc4_resource
*rsc
=
501 vc4_resource(vc4
->framebuffer
.cbufs
[0]->texture
);
502 uint32_t clear_color
;
504 if (vc4_rt_format_is_565(vc4
->framebuffer
.cbufs
[0]->format
)) {
505 /* In 565 mode, the hardware will be packing our color
508 clear_color
= pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM
,
511 /* Otherwise, we need to do this packing because we
512 * support multiple swizzlings of RGBA8888.
515 pack_rgba(vc4
->framebuffer
.cbufs
[0]->format
,
518 job
->clear_color
[0] = job
->clear_color
[1] = clear_color
;
519 rsc
->initialized_buffers
|= (buffers
& PIPE_CLEAR_COLOR0
);
522 if (buffers
& PIPE_CLEAR_DEPTHSTENCIL
) {
523 struct vc4_resource
*rsc
=
524 vc4_resource(vc4
->framebuffer
.zsbuf
->texture
);
525 unsigned zsclear
= buffers
& PIPE_CLEAR_DEPTHSTENCIL
;
527 /* Clearing ZS will clear both Z and stencil, so if we're
528 * trying to clear just one then we need to draw a quad to do
531 if ((zsclear
== PIPE_CLEAR_DEPTH
||
532 zsclear
== PIPE_CLEAR_STENCIL
) &&
533 (rsc
->initialized_buffers
& ~(zsclear
| job
->cleared
)) &&
534 util_format_is_depth_and_stencil(vc4
->framebuffer
.zsbuf
->format
)) {
535 perf_debug("Partial clear of Z+stencil buffer, "
536 "drawing a quad instead of fast clearing\n");
537 vc4_blitter_save(vc4
);
538 util_blitter_clear(vc4
->blitter
,
539 vc4
->framebuffer
.width
,
540 vc4
->framebuffer
.height
,
543 NULL
, depth
, stencil
);
549 /* Though the depth buffer is stored with Z in the high 24,
550 * for this field we just need to store it in the low 24.
552 if (buffers
& PIPE_CLEAR_DEPTH
) {
553 job
->clear_depth
= util_pack_z(PIPE_FORMAT_Z24X8_UNORM
,
556 if (buffers
& PIPE_CLEAR_STENCIL
)
557 job
->clear_stencil
= stencil
;
559 rsc
->initialized_buffers
|= zsclear
;
564 job
->draw_max_x
= vc4
->framebuffer
.width
;
565 job
->draw_max_y
= vc4
->framebuffer
.height
;
566 job
->cleared
|= buffers
;
567 job
->resolve
|= buffers
;
/* pipe->clear_render_target() hook: not implemented for vc4. */
static void
vc4_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear RT\n");
}
/* pipe->clear_depth_stencil() hook: not implemented for vc4. */
static void
vc4_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear DS\n");
}
591 vc4_draw_init(struct pipe_context
*pctx
)
593 pctx
->draw_vbo
= vc4_draw_vbo
;
594 pctx
->clear
= vc4_clear
;
595 pctx
->clear_render_target
= vc4_clear_render_target
;
596 pctx
->clear_depth_stencil
= vc4_clear_depth_stencil
;