2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
28 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
29 * and L2_STREAM for src.
 */
/* Choose the L2 cache policy for an operation of `size` bytes performed under
 * coherency mode `coher`.
 *
 * Visible behavior: on GFX9 and newer with SI_COHERENCY_CB_META or
 * SI_COHERENCY_CP, or on CIK and newer with SI_COHERENCY_SHADER, the result
 * is L2_LRU when size is at most 256 KiB and L2_STREAM when it is larger.
 *
 * NOTE(review): this listing appears to have lost lines. The declaration of
 * the `size` parameter, the function braces and the value returned when the
 * condition below is false are not shown - confirm against the full file.
 */
31 static enum si_cache_policy
get_cache_policy(struct si_context
*sctx
,
32 enum si_coherency coher
,
/* Chip generation / coherency combinations for which the policy is picked
 * by size (L2_LRU vs. L2_STREAM).
 */
35 if ((sctx
->chip_class
>= GFX9
&& (coher
== SI_COHERENCY_CB_META
||
36 coher
== SI_COHERENCY_CP
)) ||
37 (sctx
->chip_class
>= CIK
&& coher
== SI_COHERENCY_SHADER
))
/* Threshold is 256 * 1024 bytes, inclusive on the L2_LRU side. */
38 return size
<= 256 * 1024 ? L2_LRU
: L2_STREAM
;
/* Translate a coherency mode, together with the cache policy in use, into
 * SI_CONTEXT_* flag bits. Callers in this file OR the result into
 * sctx->flags (see si_compute_do_clear_or_copy).
 *
 * The visible code does not read `sctx`.
 *
 * NOTE(review): the `switch` line, the braces and the statements belonging
 * to some case labels are not visible in this listing - confirm against the
 * full file before relying on the per-case description below.
 */
43 unsigned si_get_flush_flags(struct si_context
*sctx
, enum si_coherency coher
,
44 enum si_cache_policy cache_policy
)
/* NOTE(review): the statement(s) under this label are not shown here. */
48 case SI_COHERENCY_NONE
:
/* Shader coherency: the SMEM L1 and VMEM L1 invalidate flags are always
 * returned; SI_CONTEXT_INV_GLOBAL_L2 is added only when cache_policy is
 * L2_BYPASS.
 */
51 case SI_COHERENCY_SHADER
:
52 return SI_CONTEXT_INV_SMEM_L1
|
53 SI_CONTEXT_INV_VMEM_L1
|
54 (cache_policy
== L2_BYPASS
? SI_CONTEXT_INV_GLOBAL_L2
: 0);
/* CB metadata coherency: only SI_CONTEXT_FLUSH_AND_INV_CB is returned. */
55 case SI_COHERENCY_CB_META
:
56 return SI_CONTEXT_FLUSH_AND_INV_CB
;
/* Clear or copy a buffer range by dispatching a compute shader.
 *
 * Callers in this file pass src == NULL (and src_offset 0) for a clear and a
 * non-NULL src (with clear_value NULL, clear_value_size 0) for a copy.
 *
 * - dst / dst_offset / size: destination resource and byte range.
 * - src / src_offset: source resource and byte offset for copies.
 * - clear_value / clear_value_size: clear pattern and its size in bytes;
 *   asserted to be 4..16 and a power of two on the clear path.
 * - coher: coherency mode used to derive the flush flags and cache policy.
 *
 * All offsets and the size are asserted to be multiples of 4. The currently
 * bound compute shader and the first one or two compute shader buffers are
 * saved before the dispatch and restored afterwards.
 *
 * NOTE(review): this listing appears to have lost lines: the declarations of
 * dst_offset, src_offset and size, the function braces, the if/else that
 * separates the copy path from the clear path, and some assignments (only
 * index [0] of info.block / info.grid and only the offset/size fields of
 * sb[] are visible). Confirm against the full file.
 */
60 static void si_compute_do_clear_or_copy(struct si_context
*sctx
,
61 struct pipe_resource
*dst
,
63 struct pipe_resource
*src
,
66 const uint32_t *clear_value
,
67 unsigned clear_value_size
,
68 enum si_coherency coher
)
70 struct pipe_context
*ctx
= &sctx
->b
;
/* The shaders work in whole dwords, so everything must be 4-byte aligned. */
72 assert(src_offset
% 4 == 0);
73 assert(dst_offset
% 4 == 0);
74 assert(size
% 4 == 0);
/* Range checks against width0; for dst this is only enforced when the
 * target is PIPE_BUFFER, for src only when a source is given.
 */
76 assert(dst
->target
!= PIPE_BUFFER
|| dst_offset
+ size
<= dst
->width0
);
77 assert(!src
|| src_offset
+ size
<= src
->width0
);
/* Before the dispatch: request PS and CS partial flushes plus whatever
 * si_get_flush_flags() asks for with SI_COMPUTE_DST_CACHE_POLICY, then emit
 * the flush immediately.
 */
79 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
80 SI_CONTEXT_CS_PARTIAL_FLUSH
|
81 si_get_flush_flags(sctx
, coher
, SI_COMPUTE_DST_CACHE_POLICY
);
82 si_emit_cache_flush(sctx
);
/* Save the bound compute program and the shader buffers that are about to
 * be overwritten (two for a copy, one for a clear).
 */
85 void *saved_cs
= sctx
->cs_shader_state
.program
;
86 struct pipe_shader_buffer saved_sb
[2] = {};
87 si_get_shader_buffers(sctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
89 /* The memory accesses are coalesced, meaning that the 1st instruction writes
90 * the 1st contiguous block of data for the whole wave, the 2nd instruction
91 * writes the 2nd contiguous block of data, etc.
// Work sizing: dwords per thread depend on copy vs. clear; an instruction
// handles dwords_per_thread / instructions_per_thread dwords, and a wave
// handles dwords_per_thread * 64 dwords.
93 unsigned dwords_per_thread
= src
? SI_COMPUTE_COPY_DW_PER_THREAD
:
94 SI_COMPUTE_CLEAR_DW_PER_THREAD
;
95 unsigned instructions_per_thread
= MAX2(1, dwords_per_thread
/ 4);
96 unsigned dwords_per_instruction
= dwords_per_thread
/ instructions_per_thread
;
97 unsigned dwords_per_wave
= dwords_per_thread
* 64;
99 unsigned num_dwords
= size
/ 4;
100 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
// Dispatch dimensions: block[0] is capped at 64, grid[0] is the number of
// waves needed to cover num_dwords (rounded up).
// NOTE(review): the other block and grid entries are not visible here.
102 struct pipe_grid_info info
= {};
103 info
.block
[0] = MIN2(64, num_instructions
);
106 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
// Shader buffer slot 0 covers the destination range, slot 1 the source range.
// NOTE(review): the assignments of the buffer pointers are not visible here.
110 struct pipe_shader_buffer sb
[2] = {};
112 sb
[0].buffer_offset
= dst_offset
;
113 sb
[0].buffer_size
= size
;
117 sb
[1].buffer_offset
= src_offset
;
118 sb
[1].buffer_size
= size
;
// Two buffers plus cs_copy_buffer: presumably the copy path (src != NULL);
// the branch itself is not visible in this listing.
120 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, sb
);
121 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_buffer
);
// Clear path: the clear value is repeated as needed to fill all four
// cs_user_data dwords, then one buffer plus cs_clear_buffer is bound.
123 assert(clear_value_size
>= 4 &&
124 clear_value_size
<= 16 &&
125 util_is_power_of_two_or_zero(clear_value_size
));
127 for (unsigned i
= 0; i
< 4; i
++)
128 sctx
->cs_user_data
[i
] = clear_value
[i
% (clear_value_size
/ 4)];
130 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, sb
);
131 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_buffer
);
134 ctx
->launch_grid(ctx
, &info
);
// After the dispatch: request a CS partial flush, plus a global L2 writeback
// when the policy for this coherency and size is L2_BYPASS. For any other
// policy the destination is flagged as TC_L2_dirty instead.
136 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
137 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
138 (cache_policy
== L2_BYPASS
? SI_CONTEXT_WRITEBACK_GLOBAL_L2
: 0);
140 if (cache_policy
!= L2_BYPASS
)
141 r600_resource(dst
)->TC_L2_dirty
= true;
143 /* Restore states. */
144 ctx
->bind_compute_state(ctx
, saved_cs
);
145 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
/* Fill `size` bytes of `dst`, starting at byte `offset`, with a repeating
 * pattern of `clear_value_size` bytes read from `clear_value`.
 *
 * Visible steps:
 *  1. Patterns larger than 4 bytes whose dwords are all equal are reduced to
 *     a 4-byte pattern.
 *  2. 1- and 2-byte patterns are expanded to a 4-byte pattern held in a
 *     local variable.
 *  3. 12-byte patterns go through util_blitter_clear_buffer().
 *  4. The dword-aligned part of the range is cleared either with the compute
 *     path (si_compute_do_clear_or_copy) or with si_cp_dma_clear_buffer().
 *  5. Whatever is left after the aligned part is written with
 *     pipe_buffer_write().
 *
 * Sizes 3 and 6 are rejected by assertion; offset and size must be aligned
 * to MIN2(clear_value_size, 4), and size must be below UINT_MAX & ~0xf.
 *
 * NOTE(review): this listing appears to have lost lines (function braces,
 * closing braces, `else`, `break`, any early `return`s, and at least one
 * line of the condition in step 4). Control flow between the steps therefore
 * cannot be fully confirmed from here - check the full file.
 */
148 void si_clear_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
,
149 uint64_t offset
, uint64_t size
, uint32_t *clear_value
,
150 uint32_t clear_value_size
, enum si_coherency coher
)
/* Required alignment is the pattern size, capped at one dword. */
155 unsigned clear_alignment
= MIN2(clear_value_size
, 4);
157 assert(clear_value_size
!= 3 && clear_value_size
!= 6); /* 12 is allowed. */
158 assert(offset
% clear_alignment
== 0);
159 assert(size
% clear_alignment
== 0);
160 assert(size
< (UINT_MAX
& ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
162 /* Reduce a large clear value size if possible. */
163 if (clear_value_size
> 4) {
164 bool clear_dword_duplicated
= true;
166 /* See if we can lower large fills to dword fills. */
167 for (unsigned i
= 1; i
< clear_value_size
/ 4; i
++) {
168 if (clear_value
[0] != clear_value
[i
]) {
169 clear_dword_duplicated
= false;
173 if (clear_dword_duplicated
)
174 clear_value_size
= 4;
177 /* Expand a small clear value size. */
178 uint32_t tmp_clear_value
;
/* A 1-byte pattern is replicated into all four bytes of the dword, a 2-byte
 * pattern into both halves; clear_value is then redirected to the local
 * copy and treated as a 4-byte pattern.
 */
179 if (clear_value_size
<= 2) {
180 if (clear_value_size
== 1) {
181 tmp_clear_value
= *(uint8_t*)clear_value
;
182 tmp_clear_value
|= (tmp_clear_value
<< 8) |
183 (tmp_clear_value
<< 16) |
184 (tmp_clear_value
<< 24);
186 tmp_clear_value
= *(uint16_t*)clear_value
;
187 tmp_clear_value
|= tmp_clear_value
<< 16;
189 clear_value
= &tmp_clear_value
;
190 clear_value_size
= 4;
193 /* Use transform feedback for 12-byte clears. */
194 /* TODO: Use compute. */
195 if (clear_value_size
== 12) {
196 union pipe_color_union streamout_clear_value
;
/* The pattern is copied into a pipe_color_union and handed to the blitter
 * with the component count clear_value_size / 4.
 */
198 memcpy(&streamout_clear_value
, clear_value
, clear_value_size
);
199 si_blitter_begin(sctx
, SI_DISABLE_RENDER_COND
);
200 util_blitter_clear_buffer(sctx
->blitter
, dst
, offset
,
201 size
, clear_value_size
/ 4,
202 &streamout_clear_value
);
203 si_blitter_end(sctx
);
/* Size rounded down to a multiple of 4 bytes. */
207 uint64_t aligned_size
= size
& ~3ull;
208 if (aligned_size
>= 4) {
209 /* Before GFX9, CP DMA was very slow when clearing GTT, so never
210 * use CP DMA clears on those chips, because we can't be certain
211 * about buffer placements.
// Compute path: patterns larger than a dword, or dword patterns when the
// clear is larger than 32 KiB or the chip is VI or older.
// NOTE(review): a condition line appears to be missing between the
// `== 4 &&` line and the size test - confirm against the full file.
213 if (clear_value_size
> 4 ||
214 (clear_value_size
== 4 &&
216 (size
> 32*1024 || sctx
->chip_class
<= VI
))) {
217 si_compute_do_clear_or_copy(sctx
, dst
, offset
, NULL
, 0,
218 aligned_size
, clear_value
,
219 clear_value_size
, coher
);
// CP DMA path (presumably the else branch; the keyword is not visible).
// Only a single dword pattern is accepted here.
221 assert(clear_value_size
== 4);
222 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, offset
,
223 aligned_size
, *clear_value
, 0, coher
,
224 get_cache_policy(sctx
, coher
, size
));
// Step past the dword-aligned part; `size` now holds only the remainder.
227 offset
+= aligned_size
;
228 size
-= aligned_size
;
231 /* Handle non-dword alignment. */
234 assert(dst
->target
== PIPE_BUFFER
);
/* The remaining bytes are written from the start of clear_value. */
237 pipe_buffer_write(&sctx
->b
, dst
, offset
, size
, clear_value
);
/* pipe_context-level clear_buffer entry point (installed as
 * sctx->b.clear_buffer by si_init_compute_blit_functions).
 *
 * Picks a coherency mode from the destination's flags and forwards all
 * arguments to si_clear_buffer(), casting ctx to struct si_context * and
 * clear_value to uint32_t *.
 *
 * NOTE(review): the function braces and the `else` between the two
 * assignments to `coher` are not visible in this listing; presumably
 * SI_COHERENCY_SHADER is the fallback when the flag is not set - confirm
 * against the full file.
 */
241 static void si_pipe_clear_buffer(struct pipe_context
*ctx
,
242 struct pipe_resource
*dst
,
243 unsigned offset
, unsigned size
,
244 const void *clear_value
,
245 int clear_value_size
)
247 enum si_coherency coher
;
/* Resources flagged SI_RESOURCE_FLAG_SO_FILLED_SIZE use CP coherency. */
249 if (dst
->flags
& SI_RESOURCE_FLAG_SO_FILLED_SIZE
)
250 coher
= SI_COHERENCY_CP
;
252 coher
= SI_COHERENCY_SHADER
;
254 si_clear_buffer((struct si_context
*)ctx
, dst
, offset
, size
, (uint32_t*)clear_value
,
255 clear_value_size
, coher
);
/* Copy `size` bytes from `src` at `src_offset` to `dst` at `dst_offset`.
 *
 * The copy always uses SI_COHERENCY_SHADER. The compute path
 * (si_compute_do_clear_or_copy) is taken when the device has dedicated VRAM,
 * both resources have RADEON_DOMAIN_VRAM in their domains, and both offsets
 * and the size are multiples of 4; si_cp_dma_copy_buffer() is the other
 * path and receives the cache policy computed up front.
 *
 * NOTE(review): the function braces, the `else` separating the two paths,
 * any early return, and at least one line of the condition are not visible
 * in this listing - confirm against the full file.
 */
258 void si_copy_buffer(struct si_context
*sctx
,
259 struct pipe_resource
*dst
, struct pipe_resource
*src
,
260 uint64_t dst_offset
, uint64_t src_offset
, unsigned size
)
265 enum si_coherency coher
= SI_COHERENCY_SHADER
;
/* Only consumed by the CP DMA call below. */
266 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
268 /* Only use compute for VRAM copies on dGPUs. */
269 if (sctx
->screen
->info
.has_dedicated_vram
&&
270 r600_resource(dst
)->domains
& RADEON_DOMAIN_VRAM
&&
271 r600_resource(src
)->domains
& RADEON_DOMAIN_VRAM
&&
273 dst_offset
% 4 == 0 && src_offset
% 4 == 0 && size
% 4 == 0) {
/* No clear value is passed on the copy path (NULL, size 0). */
274 si_compute_do_clear_or_copy(sctx
, dst
, dst_offset
, src
, src_offset
,
275 size
, NULL
, 0, coher
);
/* CP DMA path (presumably the else branch; the keyword is not visible). */
277 si_cp_dma_copy_buffer(sctx
, dst
, src
, dst_offset
, src_offset
, size
,
278 0, coher
, cache_policy
);
/* Install the buffer-clear entry point of this file on the context: the
 * visible code sets sctx->b.clear_buffer to si_pipe_clear_buffer.
 *
 * NOTE(review): the function braces are not visible and the listing ends
 * here, so any further assignments cannot be ruled out - confirm against the
 * full file.
 */
282 void si_init_compute_blit_functions(struct si_context
*sctx
)
284 sctx
->b
.clear_buffer
= si_pipe_clear_buffer
;