2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
28 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
29 * and L2_STREAM for src.
31 static enum si_cache_policy
get_cache_policy(struct si_context
*sctx
,
32 enum si_coherency coher
,
35 if ((sctx
->chip_class
>= GFX9
&& (coher
== SI_COHERENCY_CB_META
||
36 coher
== SI_COHERENCY_CP
)) ||
37 (sctx
->chip_class
>= CIK
&& coher
== SI_COHERENCY_SHADER
))
38 return size
<= 256 * 1024 ? L2_LRU
: L2_STREAM
;
43 unsigned si_get_flush_flags(struct si_context
*sctx
, enum si_coherency coher
,
44 enum si_cache_policy cache_policy
)
48 case SI_COHERENCY_NONE
:
51 case SI_COHERENCY_SHADER
:
52 return SI_CONTEXT_INV_SMEM_L1
|
53 SI_CONTEXT_INV_VMEM_L1
|
54 (cache_policy
== L2_BYPASS
? SI_CONTEXT_INV_GLOBAL_L2
: 0);
55 case SI_COHERENCY_CB_META
:
56 return SI_CONTEXT_FLUSH_AND_INV_CB
;
60 static void si_compute_do_clear_or_copy(struct si_context
*sctx
,
61 struct pipe_resource
*dst
,
63 struct pipe_resource
*src
,
66 const uint32_t *clear_value
,
67 unsigned clear_value_size
,
68 enum si_coherency coher
)
70 struct pipe_context
*ctx
= &sctx
->b
;
72 assert(src_offset
% 4 == 0);
73 assert(dst_offset
% 4 == 0);
74 assert(size
% 4 == 0);
76 assert(dst
->target
!= PIPE_BUFFER
|| dst_offset
+ size
<= dst
->width0
);
77 assert(!src
|| src_offset
+ size
<= src
->width0
);
79 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
80 SI_CONTEXT_CS_PARTIAL_FLUSH
|
81 si_get_flush_flags(sctx
, coher
, SI_COMPUTE_DST_CACHE_POLICY
);
84 void *saved_cs
= sctx
->cs_shader_state
.program
;
85 struct pipe_shader_buffer saved_sb
[2] = {};
86 si_get_shader_buffers(sctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
88 /* The memory accesses are coalesced, meaning that the 1st instruction writes
89 * the 1st contiguous block of data for the whole wave, the 2nd instruction
90 * writes the 2nd contiguous block of data, etc.
92 unsigned dwords_per_thread
= src
? SI_COMPUTE_COPY_DW_PER_THREAD
:
93 SI_COMPUTE_CLEAR_DW_PER_THREAD
;
94 unsigned instructions_per_thread
= MAX2(1, dwords_per_thread
/ 4);
95 unsigned dwords_per_instruction
= dwords_per_thread
/ instructions_per_thread
;
96 unsigned dwords_per_wave
= dwords_per_thread
* 64;
98 unsigned num_dwords
= size
/ 4;
99 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
101 struct pipe_grid_info info
= {};
102 info
.block
[0] = MIN2(64, num_instructions
);
105 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
109 struct pipe_shader_buffer sb
[2] = {};
111 sb
[0].buffer_offset
= dst_offset
;
112 sb
[0].buffer_size
= size
;
114 bool shader_dst_stream_policy
= SI_COMPUTE_DST_CACHE_POLICY
!= L2_LRU
;
118 sb
[1].buffer_offset
= src_offset
;
119 sb
[1].buffer_size
= size
;
121 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, sb
);
123 if (!sctx
->cs_copy_buffer
) {
124 sctx
->cs_copy_buffer
= si_create_dma_compute_shader(&sctx
->b
,
125 SI_COMPUTE_COPY_DW_PER_THREAD
,
126 shader_dst_stream_policy
, true);
128 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_buffer
);
130 assert(clear_value_size
>= 4 &&
131 clear_value_size
<= 16 &&
132 util_is_power_of_two_or_zero(clear_value_size
));
134 for (unsigned i
= 0; i
< 4; i
++)
135 sctx
->cs_user_data
[i
] = clear_value
[i
% (clear_value_size
/ 4)];
137 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, sb
);
139 if (!sctx
->cs_clear_buffer
) {
140 sctx
->cs_clear_buffer
= si_create_dma_compute_shader(&sctx
->b
,
141 SI_COMPUTE_CLEAR_DW_PER_THREAD
,
142 shader_dst_stream_policy
, false);
144 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_buffer
);
147 ctx
->launch_grid(ctx
, &info
);
149 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
150 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
151 (cache_policy
== L2_BYPASS
? SI_CONTEXT_WRITEBACK_GLOBAL_L2
: 0);
153 if (cache_policy
!= L2_BYPASS
)
154 r600_resource(dst
)->TC_L2_dirty
= true;
156 /* Restore states. */
157 ctx
->bind_compute_state(ctx
, saved_cs
);
158 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
161 void si_clear_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
,
162 uint64_t offset
, uint64_t size
, uint32_t *clear_value
,
163 uint32_t clear_value_size
, enum si_coherency coher
)
168 unsigned clear_alignment
= MIN2(clear_value_size
, 4);
170 assert(clear_value_size
!= 3 && clear_value_size
!= 6); /* 12 is allowed. */
171 assert(offset
% clear_alignment
== 0);
172 assert(size
% clear_alignment
== 0);
173 assert(size
< (UINT_MAX
& ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
175 /* Reduce a large clear value size if possible. */
176 if (clear_value_size
> 4) {
177 bool clear_dword_duplicated
= true;
179 /* See if we can lower large fills to dword fills. */
180 for (unsigned i
= 1; i
< clear_value_size
/ 4; i
++) {
181 if (clear_value
[0] != clear_value
[i
]) {
182 clear_dword_duplicated
= false;
186 if (clear_dword_duplicated
)
187 clear_value_size
= 4;
190 /* Expand a small clear value size. */
191 uint32_t tmp_clear_value
;
192 if (clear_value_size
<= 2) {
193 if (clear_value_size
== 1) {
194 tmp_clear_value
= *(uint8_t*)clear_value
;
195 tmp_clear_value
|= (tmp_clear_value
<< 8) |
196 (tmp_clear_value
<< 16) |
197 (tmp_clear_value
<< 24);
199 tmp_clear_value
= *(uint16_t*)clear_value
;
200 tmp_clear_value
|= tmp_clear_value
<< 16;
202 clear_value
= &tmp_clear_value
;
203 clear_value_size
= 4;
206 /* Use transform feedback for 12-byte clears. */
207 /* TODO: Use compute. */
208 if (clear_value_size
== 12) {
209 union pipe_color_union streamout_clear_value
;
211 memcpy(&streamout_clear_value
, clear_value
, clear_value_size
);
212 si_blitter_begin(sctx
, SI_DISABLE_RENDER_COND
);
213 util_blitter_clear_buffer(sctx
->blitter
, dst
, offset
,
214 size
, clear_value_size
/ 4,
215 &streamout_clear_value
);
216 si_blitter_end(sctx
);
220 uint64_t aligned_size
= size
& ~3ull;
221 if (aligned_size
>= 4) {
222 /* Before GFX9, CP DMA was very slow when clearing GTT, so never
223 * use CP DMA clears on those chips, because we can't be certain
224 * about buffer placements.
226 if (clear_value_size
> 4 ||
227 (clear_value_size
== 4 &&
229 (size
> 32*1024 || sctx
->chip_class
<= VI
))) {
230 si_compute_do_clear_or_copy(sctx
, dst
, offset
, NULL
, 0,
231 aligned_size
, clear_value
,
232 clear_value_size
, coher
);
234 assert(clear_value_size
== 4);
235 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, offset
,
236 aligned_size
, *clear_value
, 0, coher
,
237 get_cache_policy(sctx
, coher
, size
));
240 offset
+= aligned_size
;
241 size
-= aligned_size
;
244 /* Handle non-dword alignment. */
247 assert(dst
->target
== PIPE_BUFFER
);
250 pipe_buffer_write(&sctx
->b
, dst
, offset
, size
, clear_value
);
254 static void si_pipe_clear_buffer(struct pipe_context
*ctx
,
255 struct pipe_resource
*dst
,
256 unsigned offset
, unsigned size
,
257 const void *clear_value
,
258 int clear_value_size
)
260 enum si_coherency coher
;
262 if (dst
->flags
& SI_RESOURCE_FLAG_SO_FILLED_SIZE
)
263 coher
= SI_COHERENCY_CP
;
265 coher
= SI_COHERENCY_SHADER
;
267 si_clear_buffer((struct si_context
*)ctx
, dst
, offset
, size
, (uint32_t*)clear_value
,
268 clear_value_size
, coher
);
271 void si_copy_buffer(struct si_context
*sctx
,
272 struct pipe_resource
*dst
, struct pipe_resource
*src
,
273 uint64_t dst_offset
, uint64_t src_offset
, unsigned size
)
278 enum si_coherency coher
= SI_COHERENCY_SHADER
;
279 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
281 /* Only use compute for VRAM copies on dGPUs. */
282 if (sctx
->screen
->info
.has_dedicated_vram
&&
283 r600_resource(dst
)->domains
& RADEON_DOMAIN_VRAM
&&
284 r600_resource(src
)->domains
& RADEON_DOMAIN_VRAM
&&
286 dst_offset
% 4 == 0 && src_offset
% 4 == 0 && size
% 4 == 0) {
287 si_compute_do_clear_or_copy(sctx
, dst
, dst_offset
, src
, src_offset
,
288 size
, NULL
, 0, coher
);
290 si_cp_dma_copy_buffer(sctx
, dst
, src
, dst_offset
, src_offset
, size
,
291 0, coher
, cache_policy
);
295 void si_init_compute_blit_functions(struct si_context
*sctx
)
297 sctx
->b
.clear_buffer
= si_pipe_clear_buffer
;