2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
28 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
29 * and L2_STREAM for src.
31 static enum si_cache_policy
get_cache_policy(struct si_context
*sctx
,
32 enum si_coherency coher
,
35 if ((sctx
->chip_class
>= GFX9
&& (coher
== SI_COHERENCY_CB_META
||
36 coher
== SI_COHERENCY_CP
)) ||
37 (sctx
->chip_class
>= CIK
&& coher
== SI_COHERENCY_SHADER
))
38 return size
<= 256 * 1024 ? L2_LRU
: L2_STREAM
;
43 unsigned si_get_flush_flags(struct si_context
*sctx
, enum si_coherency coher
,
44 enum si_cache_policy cache_policy
)
48 case SI_COHERENCY_NONE
:
51 case SI_COHERENCY_SHADER
:
52 return SI_CONTEXT_INV_SMEM_L1
|
53 SI_CONTEXT_INV_VMEM_L1
|
54 (cache_policy
== L2_BYPASS
? SI_CONTEXT_INV_GLOBAL_L2
: 0);
55 case SI_COHERENCY_CB_META
:
56 return SI_CONTEXT_FLUSH_AND_INV_CB
;
/* Clear or copy a buffer range with a compute shader.
 *
 * If src != NULL, copies 'size' bytes from src+src_offset to dst+dst_offset
 * using sctx->cs_copy_buffer.  If src == NULL, fills dst+dst_offset with
 * 'clear_value' (clear_value_size bytes; the assert below restricts it to
 * 4, 8, or 16), replicated across the 4 user-data dwords, using
 * sctx->cs_clear_buffer.  Offsets and size must be dword-aligned.
 *
 * The bound compute shader and shader-buffer slots are saved on entry and
 * restored on exit, so callers' compute state is preserved.
 */
static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

	/* Wait for prior work and flush caches; compute shaders always write
	 * dst with SI_COMPUTE_DST_CACHE_POLICY (see the note above
	 * get_cache_policy).
	 */
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
	 * writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	/* 64 threads per wave, each handling dwords_per_thread dwords. */
	unsigned dwords_per_wave = dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	/* Slot 0 is the destination; slot 1 (copies only) is the source. */
	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	if (src) {
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

		/* Replicate the clear pattern across all 4 user-data dwords
		 * so the shader can always store 16 bytes.
		 */
		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	/* Make the writes visible: wait for the CS, and write back L2 if the
	 * operation bypassed it.
	 */
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	/* The result may still sit in L2; later CP DMA users must flush it. */
	if (cache_policy != L2_BYPASS)
		r600_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
}
/* Fill 'size' bytes of 'dst' starting at 'offset' with a repeating
 * 'clear_value' pattern of 'clear_value_size' bytes (1, 2, 4, 8, 12, or 16;
 * 3 and 6 are rejected below).
 *
 * Strategy: collapse large patterns of identical dwords to one dword,
 * expand byte/short patterns to a dword, route 12-byte patterns through the
 * streamout blitter, then clear the dword-aligned bulk with either a
 * compute shader or CP DMA, and finish any 1-3 byte tail with a CPU write.
 */
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher)
{
	if (!size)
		return;

	/* Sub-dword patterns only require their own (1- or 2-byte) alignment. */
	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size. */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		if (clear_value_size == 1) {
			/* Broadcast the byte into all 4 byte lanes. */
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			/* Broadcast the 16-bit value into both halves. */
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		/* From here on, clear with the expanded dword. tmp_clear_value
		 * stays in scope for the rest of this function, so this is safe.
		 */
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

	/* Clear the dword-aligned bulk; any 1-3 byte tail is handled below. */
	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			/* Small dword clears on GFX9+: CP DMA is cheaper than a
			 * compute dispatch.
			 */
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
					       aligned_size, *clear_value, 0, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}
240 static void si_pipe_clear_buffer(struct pipe_context
*ctx
,
241 struct pipe_resource
*dst
,
242 unsigned offset
, unsigned size
,
243 const void *clear_value
,
244 int clear_value_size
)
246 enum si_coherency coher
;
248 if (dst
->flags
& SI_RESOURCE_FLAG_SO_FILLED_SIZE
)
249 coher
= SI_COHERENCY_CP
;
251 coher
= SI_COHERENCY_SHADER
;
253 si_clear_buffer((struct si_context
*)ctx
, dst
, offset
, size
, (uint32_t*)clear_value
,
254 clear_value_size
, coher
);
/* Copy 'size' bytes from src+src_offset to dst+dst_offset, using a compute
 * shader for large dword-aligned VRAM-to-VRAM copies on dGPUs and CP DMA
 * for everything else.
 */
void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
	if (!size)
		return;

	enum si_coherency coher = SI_COHERENCY_SHADER;
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

	/* Only use compute for VRAM copies on dGPUs. */
	if (sctx->screen->info.has_dedicated_vram &&
	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
	    /* NOTE(review): size threshold reconstructed from context —
	     * confirm against the surrounding tree. */
	    size > 32 * 1024 &&
	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
					    size, NULL, 0, coher);
	} else {
		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
				      0, coher, cache_policy);
	}
}
/* Install the compute-based blit entry points on the gallium context. */
void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}