/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors: Marek Olšák <maraeo@gmail.com>
 */
30 #include "radeon/r600_cs.h"
32 #include "util/u_memory.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);

/* Atomically retarget a si_streamout_target pointer, adjusting refcounts.
 * Thin type-punning wrapper around the gallium helper so callers can use
 * the driver-private target type directly. */
static inline void si_so_target_reference(struct si_streamout_target **dst,
					  struct pipe_stream_output_target *src)
{
	pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
42 static struct pipe_stream_output_target
*
43 si_create_so_target(struct pipe_context
*ctx
,
44 struct pipe_resource
*buffer
,
45 unsigned buffer_offset
,
48 struct si_context
*sctx
= (struct si_context
*)ctx
;
49 struct si_streamout_target
*t
;
50 struct r600_resource
*rbuffer
= (struct r600_resource
*)buffer
;
52 t
= CALLOC_STRUCT(si_streamout_target
);
57 u_suballocator_alloc(sctx
->b
.allocator_zeroed_memory
, 4, 4,
58 &t
->buf_filled_size_offset
,
59 (struct pipe_resource
**)&t
->buf_filled_size
);
60 if (!t
->buf_filled_size
) {
65 t
->b
.reference
.count
= 1;
67 pipe_resource_reference(&t
->b
.buffer
, buffer
);
68 t
->b
.buffer_offset
= buffer_offset
;
69 t
->b
.buffer_size
= buffer_size
;
71 util_range_add(&rbuffer
->valid_buffer_range
, buffer_offset
,
72 buffer_offset
+ buffer_size
);
76 static void si_so_target_destroy(struct pipe_context
*ctx
,
77 struct pipe_stream_output_target
*target
)
79 struct si_streamout_target
*t
= (struct si_streamout_target
*)target
;
80 pipe_resource_reference(&t
->b
.buffer
, NULL
);
81 r600_resource_reference(&t
->buf_filled_size
, NULL
);
85 void si_streamout_buffers_dirty(struct si_context
*sctx
)
87 if (!sctx
->streamout
.enabled_mask
)
90 si_mark_atom_dirty(sctx
, &sctx
->streamout
.begin_atom
);
91 si_set_streamout_enable(sctx
, true);
94 static void si_set_streamout_targets(struct pipe_context
*ctx
,
96 struct pipe_stream_output_target
**targets
,
97 const unsigned *offsets
)
99 struct si_context
*sctx
= (struct si_context
*)ctx
;
100 struct si_buffer_resources
*buffers
= &sctx
->rw_buffers
;
101 struct si_descriptors
*descs
= &sctx
->descriptors
[SI_DESCS_RW_BUFFERS
];
102 unsigned old_num_targets
= sctx
->streamout
.num_targets
;
105 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
106 if (sctx
->streamout
.num_targets
&& sctx
->streamout
.begin_emitted
) {
107 /* Since streamout uses vector writes which go through TC L2
108 * and most other clients can use TC L2 as well, we don't need
111 * The only cases which requires flushing it is VGT DMA index
112 * fetching (on <= CIK) and indirect draw data, which are rare
113 * cases. Thus, flag the TC L2 dirtiness in the resource and
114 * handle it at draw call time.
116 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++)
117 if (sctx
->streamout
.targets
[i
])
118 r600_resource(sctx
->streamout
.targets
[i
]->b
.buffer
)->TC_L2_dirty
= true;
120 /* Invalidate the scalar cache in case a streamout buffer is
121 * going to be used as a constant buffer.
123 * Invalidate TC L1, because streamout bypasses it (done by
124 * setting GLC=1 in the store instruction), but it can contain
125 * outdated data of streamout buffers.
127 * VS_PARTIAL_FLUSH is required if the buffers are going to be
128 * used as an input immediately.
130 sctx
->b
.flags
|= SI_CONTEXT_INV_SMEM_L1
|
131 SI_CONTEXT_INV_VMEM_L1
|
132 SI_CONTEXT_VS_PARTIAL_FLUSH
;
135 /* All readers of the streamout targets need to be finished before we can
136 * start writing to the targets.
139 sctx
->b
.flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
140 SI_CONTEXT_CS_PARTIAL_FLUSH
;
142 /* Streamout buffers must be bound in 2 places:
143 * 1) in VGT by setting the VGT_STRMOUT registers
144 * 2) as shader resources
147 /* Stop streamout. */
148 if (sctx
->streamout
.num_targets
&& sctx
->streamout
.begin_emitted
)
149 si_emit_streamout_end(sctx
);
151 /* Set the new targets. */
152 unsigned enabled_mask
= 0, append_bitmask
= 0;
153 for (i
= 0; i
< num_targets
; i
++) {
154 si_so_target_reference(&sctx
->streamout
.targets
[i
], targets
[i
]);
158 r600_context_add_resource_size(ctx
, targets
[i
]->buffer
);
159 enabled_mask
|= 1 << i
;
161 if (offsets
[i
] == ((unsigned)-1))
162 append_bitmask
|= 1 << i
;
165 for (; i
< sctx
->streamout
.num_targets
; i
++)
166 si_so_target_reference(&sctx
->streamout
.targets
[i
], NULL
);
168 sctx
->streamout
.enabled_mask
= enabled_mask
;
169 sctx
->streamout
.num_targets
= num_targets
;
170 sctx
->streamout
.append_bitmask
= append_bitmask
;
172 /* Update dirty state bits. */
174 si_streamout_buffers_dirty(sctx
);
176 si_set_atom_dirty(sctx
, &sctx
->streamout
.begin_atom
, false);
177 si_set_streamout_enable(sctx
, false);
180 /* Set the shader resources.*/
181 for (i
= 0; i
< num_targets
; i
++) {
182 bufidx
= SI_VS_STREAMOUT_BUF0
+ i
;
185 struct pipe_resource
*buffer
= targets
[i
]->buffer
;
186 uint64_t va
= r600_resource(buffer
)->gpu_address
;
188 /* Set the descriptor.
190 * On VI, the format must be non-INVALID, otherwise
191 * the buffer will be considered not bound and store
192 * instructions will be no-ops.
194 uint32_t *desc
= descs
->list
+ bufidx
*4;
196 desc
[1] = S_008F04_BASE_ADDRESS_HI(va
>> 32);
197 desc
[2] = 0xffffffff;
198 desc
[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) |
199 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
200 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) |
201 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
) |
202 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
);
204 /* Set the resource. */
205 pipe_resource_reference(&buffers
->buffers
[bufidx
],
207 radeon_add_to_buffer_list_check_mem(&sctx
->b
, &sctx
->b
.gfx
,
208 (struct r600_resource
*)buffer
,
209 buffers
->shader_usage
,
210 RADEON_PRIO_SHADER_RW_BUFFER
,
212 r600_resource(buffer
)->bind_history
|= PIPE_BIND_STREAM_OUTPUT
;
214 buffers
->enabled_mask
|= 1u << bufidx
;
216 /* Clear the descriptor and unset the resource. */
217 memset(descs
->list
+ bufidx
*4, 0,
218 sizeof(uint32_t) * 4);
219 pipe_resource_reference(&buffers
->buffers
[bufidx
],
221 buffers
->enabled_mask
&= ~(1u << bufidx
);
224 for (; i
< old_num_targets
; i
++) {
225 bufidx
= SI_VS_STREAMOUT_BUF0
+ i
;
226 /* Clear the descriptor and unset the resource. */
227 memset(descs
->list
+ bufidx
*4, 0, sizeof(uint32_t) * 4);
228 pipe_resource_reference(&buffers
->buffers
[bufidx
], NULL
);
229 buffers
->enabled_mask
&= ~(1u << bufidx
);
232 sctx
->descriptors_dirty
|= 1u << SI_DESCS_RW_BUFFERS
;
235 static void si_flush_vgt_streamout(struct si_context
*sctx
)
237 struct radeon_winsys_cs
*cs
= sctx
->b
.gfx
.cs
;
238 unsigned reg_strmout_cntl
;
240 /* The register is at different places on different ASICs. */
241 if (sctx
->b
.chip_class
>= CIK
) {
242 reg_strmout_cntl
= R_0300FC_CP_STRMOUT_CNTL
;
243 radeon_set_uconfig_reg(cs
, reg_strmout_cntl
, 0);
245 reg_strmout_cntl
= R_0084FC_CP_STRMOUT_CNTL
;
246 radeon_set_config_reg(cs
, reg_strmout_cntl
, 0);
249 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
250 radeon_emit(cs
, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH
) | EVENT_INDEX(0));
252 radeon_emit(cs
, PKT3(PKT3_WAIT_REG_MEM
, 5, 0));
253 radeon_emit(cs
, WAIT_REG_MEM_EQUAL
); /* wait until the register is equal to the reference value */
254 radeon_emit(cs
, reg_strmout_cntl
>> 2); /* register */
256 radeon_emit(cs
, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
257 radeon_emit(cs
, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
258 radeon_emit(cs
, 4); /* poll interval */
261 static void si_emit_streamout_begin(struct r600_common_context
*rctx
, struct r600_atom
*atom
)
263 struct si_context
*sctx
= (struct si_context
*)rctx
;
264 struct radeon_winsys_cs
*cs
= sctx
->b
.gfx
.cs
;
265 struct si_streamout_target
**t
= sctx
->streamout
.targets
;
266 uint16_t *stride_in_dw
= sctx
->streamout
.stride_in_dw
;
269 si_flush_vgt_streamout(sctx
);
271 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++) {
275 t
[i
]->stride_in_dw
= stride_in_dw
[i
];
277 /* SI binds streamout buffers as shader resources.
278 * VGT only counts primitives and tells the shader
279 * through SGPRs what to do. */
280 radeon_set_context_reg_seq(cs
, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0
+ 16*i
, 2);
281 radeon_emit(cs
, (t
[i
]->b
.buffer_offset
+
282 t
[i
]->b
.buffer_size
) >> 2); /* BUFFER_SIZE (in DW) */
283 radeon_emit(cs
, stride_in_dw
[i
]); /* VTX_STRIDE (in DW) */
285 if (sctx
->streamout
.append_bitmask
& (1 << i
) && t
[i
]->buf_filled_size_valid
) {
286 uint64_t va
= t
[i
]->buf_filled_size
->gpu_address
+
287 t
[i
]->buf_filled_size_offset
;
290 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
291 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
292 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM
)); /* control */
293 radeon_emit(cs
, 0); /* unused */
294 radeon_emit(cs
, 0); /* unused */
295 radeon_emit(cs
, va
); /* src address lo */
296 radeon_emit(cs
, va
>> 32); /* src address hi */
298 radeon_add_to_buffer_list(&sctx
->b
, &sctx
->b
.gfx
,
299 t
[i
]->buf_filled_size
,
301 RADEON_PRIO_SO_FILLED_SIZE
);
303 /* Start from the beginning. */
304 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
305 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
306 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET
)); /* control */
307 radeon_emit(cs
, 0); /* unused */
308 radeon_emit(cs
, 0); /* unused */
309 radeon_emit(cs
, t
[i
]->b
.buffer_offset
>> 2); /* buffer offset in DW */
310 radeon_emit(cs
, 0); /* unused */
314 sctx
->streamout
.begin_emitted
= true;
317 void si_emit_streamout_end(struct si_context
*sctx
)
319 struct radeon_winsys_cs
*cs
= sctx
->b
.gfx
.cs
;
320 struct si_streamout_target
**t
= sctx
->streamout
.targets
;
324 si_flush_vgt_streamout(sctx
);
326 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++) {
330 va
= t
[i
]->buf_filled_size
->gpu_address
+ t
[i
]->buf_filled_size_offset
;
331 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
332 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
333 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE
) |
334 STRMOUT_STORE_BUFFER_FILLED_SIZE
); /* control */
335 radeon_emit(cs
, va
); /* dst address lo */
336 radeon_emit(cs
, va
>> 32); /* dst address hi */
337 radeon_emit(cs
, 0); /* unused */
338 radeon_emit(cs
, 0); /* unused */
340 radeon_add_to_buffer_list(&sctx
->b
, &sctx
->b
.gfx
,
341 t
[i
]->buf_filled_size
,
343 RADEON_PRIO_SO_FILLED_SIZE
);
345 /* Zero the buffer size. The counters (primitives generated,
346 * primitives emitted) may be enabled even if there is not
347 * buffer bound. This ensures that the primitives-emitted query
348 * won't increment. */
349 radeon_set_context_reg(cs
, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0
+ 16*i
, 0);
351 t
[i
]->buf_filled_size_valid
= true;
354 sctx
->streamout
.begin_emitted
= false;
355 sctx
->b
.flags
|= R600_CONTEXT_STREAMOUT_FLUSH
;
358 /* STREAMOUT CONFIG DERIVED STATE
360 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
361 * The buffer mask is an independent state, so no writes occur if there
362 * are no buffers bound.
365 static void si_emit_streamout_enable(struct r600_common_context
*rctx
,
366 struct r600_atom
*atom
)
368 struct si_context
*sctx
= (struct si_context
*)rctx
;
370 radeon_set_context_reg_seq(sctx
->b
.gfx
.cs
, R_028B94_VGT_STRMOUT_CONFIG
, 2);
371 radeon_emit(sctx
->b
.gfx
.cs
,
372 S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx
)) |
373 S_028B94_RAST_STREAM(0) |
374 S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx
)) |
375 S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx
)) |
376 S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx
)));
377 radeon_emit(sctx
->b
.gfx
.cs
,
378 sctx
->streamout
.hw_enabled_mask
&
379 sctx
->streamout
.enabled_stream_buffers_mask
);
382 static void si_set_streamout_enable(struct si_context
*sctx
, bool enable
)
384 bool old_strmout_en
= si_get_strmout_en(sctx
);
385 unsigned old_hw_enabled_mask
= sctx
->streamout
.hw_enabled_mask
;
387 sctx
->streamout
.streamout_enabled
= enable
;
389 sctx
->streamout
.hw_enabled_mask
= sctx
->streamout
.enabled_mask
|
390 (sctx
->streamout
.enabled_mask
<< 4) |
391 (sctx
->streamout
.enabled_mask
<< 8) |
392 (sctx
->streamout
.enabled_mask
<< 12);
394 if ((old_strmout_en
!= si_get_strmout_en(sctx
)) ||
395 (old_hw_enabled_mask
!= sctx
->streamout
.hw_enabled_mask
))
396 si_mark_atom_dirty(sctx
, &sctx
->streamout
.enable_atom
);
399 void si_update_prims_generated_query_state(struct si_context
*sctx
,
400 unsigned type
, int diff
)
402 if (type
== PIPE_QUERY_PRIMITIVES_GENERATED
) {
403 bool old_strmout_en
= si_get_strmout_en(sctx
);
405 sctx
->streamout
.num_prims_gen_queries
+= diff
;
406 assert(sctx
->streamout
.num_prims_gen_queries
>= 0);
408 sctx
->streamout
.prims_gen_query_enabled
=
409 sctx
->streamout
.num_prims_gen_queries
!= 0;
411 if (old_strmout_en
!= si_get_strmout_en(sctx
))
412 si_mark_atom_dirty(sctx
, &sctx
->streamout
.enable_atom
);
416 void si_init_streamout_functions(struct si_context
*sctx
)
418 sctx
->b
.b
.create_stream_output_target
= si_create_so_target
;
419 sctx
->b
.b
.stream_output_target_destroy
= si_so_target_destroy
;
420 sctx
->b
.b
.set_stream_output_targets
= si_set_streamout_targets
;
421 sctx
->streamout
.begin_atom
.emit
= si_emit_streamout_begin
;
422 sctx
->streamout
.enable_atom
.emit
= si_emit_streamout_enable
;