/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing:
 *      pipe = threaded_context_unwrap_sync(pipe);
 *    (see the unwrap sketch after this list)
 *
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero memory)
 *    - threaded_transfer for pipe_transfer (zero memory)
 *    (see the subclassing sketch after this list)
 *
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
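 *
 *    As a sketch of guideline 1 (hedged; my_screen_fence_finish and the
 *    surrounding driver types are hypothetical, not part of this interface):
 *
 *      static boolean my_screen_fence_finish(struct pipe_screen *screen,
 *                                            struct pipe_context *ctx,
 *                                            struct pipe_fence_handle *fence,
 *                                            uint64_t timeout)
 *      {
 *         // Sync the queue and unwrap; afterwards ctx is the driver context.
 *         ctx = threaded_context_unwrap_sync(ctx);
 *         // ... the driver's normal fence wait goes here ...
 *      }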
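 *
 *    And a sketch of guideline 2 (struct my_resource and my_buffer_create
 *    are hypothetical driver names):
 *
 *      struct my_resource {
 *         struct threaded_resource b;  // base; b.b is the pipe_resource
 *         // ... driver-private fields ...
 *      };
 *
 *      static struct pipe_resource *
 *      my_buffer_create(struct pipe_screen *screen,
 *                       const struct pipe_resource *templ)
 *      {
 *         struct my_resource *res = CALLOC_STRUCT(my_resource);
 *         res->b.b = *templ;  // plus reference count, screen pointer, etc.
 *         threaded_resource_init(&res->b.b);
 *         return &res->b.b;
 *      }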
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context stuff.
 *       (e.g. LLVM target machine)
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 *
 * Create calls causing a sync that can't be async due to driver limitations:
 * - create_stream_output_target
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
 *    PIPE_TRANSFER_UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring
 *    the flag will lead to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to map the
 *    latest version of the buffer. (Invalidations can be queued, while
 *    unsynchronized maps are not queued, so they must return the latest
 *    storage after invalidation.) The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
 *    (A driver-side flag-handling sketch follows these rules.)
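 *
 * A driver-side sketch of honoring these flags in transfer_map (hedged;
 * my_transfer_map and its control flow are illustrative only):
 *
 *    static void *my_transfer_map(struct pipe_context *ctx,
 *                                 struct pipe_resource *resource,
 *                                 unsigned level, unsigned usage,
 *                                 const struct pipe_box *box,
 *                                 struct pipe_transfer **ptransfer)
 *    {
 *       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) {
 *          // Rule 1: we are on the non-driver thread; use only
 *          // thread-safe paths of the driver context.
 *       }
 *       if (!(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) {
 *          // Rule 2: only here may the driver promote a map to
 *          // unsynchronized; tc always sets the flag, so it never does.
 *       }
 *       if (!(usage & TC_TRANSFER_MAP_NO_INVALIDATE)) {
 *          // Rule 3: only here may the driver reallocate the backing
 *          // storage; tc always sets the flag, so it never does.
 *       }
 *       // ... perform the actual mapping ...
 *    }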
 *
 *
 * Rules for fences
 * ----------------
 *
 * Flushes will be executed asynchronously in the driver thread if a
 * create_fence callback is provided. This affects fence semantics as follows.
 *
 * When the threaded context wants to perform an asynchronous flush, it will
 * use the create_fence callback to pre-create the fence from the calling
 * thread. This pre-created fence will be passed to pipe_context::flush
 * together with the TC_FLUSH_ASYNC flag.
 *
 * The callback receives the unwrapped context as a parameter, but must use it
 * in a thread-safe way because it is called from a non-driver thread.
 *
 * If the threaded_context does not immediately flush the current batch, the
 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
 * on the returned fence in the context that created the fence,
 * threaded_context_flush must be called.
 *
 * The driver must implement pipe_context::fence_server_sync properly, since
 * the threaded context handles PIPE_FLUSH_ASYNC. (A create_fence sketch
 * follows below.)
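 *
 * A minimal create_fence sketch under these rules (struct my_fence and
 * my_create_fence are hypothetical driver names):
 *
 *    static struct pipe_fence_handle *
 *    my_create_fence(struct pipe_context *ctx,
 *                    struct tc_unflushed_batch_token *token)
 *    {
 *       struct my_fence *fence = CALLOC_STRUCT(my_fence);
 *
 *       // Called from a non-driver thread: touch only thread-safe state.
 *       if (token) {
 *          // The batch isn't flushed yet; keep the token so that
 *          // fence_finish can call threaded_context_flush before waiting.
 *          tc_unflushed_batch_token_reference(&fence->token, token);
 *       }
 *       return (struct pipe_fence_handle *)fence;
 *    }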
 *
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way. (See the sketch after these
 *    requirements.)
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
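 *
 * A get_query_result sketch honoring threaded_query::flushed (hedged;
 * my_get_query_result is a hypothetical driver function):
 *
 *    static boolean my_get_query_result(struct pipe_context *ctx,
 *                                       struct pipe_query *query,
 *                                       boolean wait,
 *                                       union pipe_query_result *result)
 *    {
 *       if (threaded_query(query)->flushed) {
 *          // Called directly from the non-driver thread: don't touch
 *          // context state that isn't thread-safe, only read the result.
 *       }
 *       // ... read back the query result ...
 *    }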
 *
 *
 * Performance gotchas
 * -------------------
 *
 * Buffer invalidations are done unconditionally - they don't check whether
 * the buffer is busy. This can cause drivers to have more live allocations
 * and CPU mappings than necessary.
 *
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue of batches, and each batch consists of
 * call slots. Each call slot consists of an 8-byte header (call ID +
 * call size + constant 32-bit marker for integrity checking) and an 8-byte
 * body for per-call data. That is 16 bytes per call slot.
 *
 * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
 * calls occupy multiple call slots depending on the size needed by call
 * parameters, so calls can have a variable size in the batch.
 * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
 * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
 * Even though the first call slot can use only 8 bytes for data, additional
 * call slots used by the same call can use all 16 bytes for data.
 * For example, a call using 2 call slots has 24 bytes of space for data.
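 *
 * In general (a formula derived from the sizes above, not code from this
 * file): a call whose data needs S bytes occupies
 *    1 + DIV_ROUND_UP(MAX2(S, 8) - 8, 16)
 * call slots, because the first slot contributes 8 data bytes and every
 * additional slot contributes 16. E.g. S = 24 yields 2 call slots.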
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/slab.h"

struct threaded_context;
struct tc_unflushed_batch_token;

/* These are transfer flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE           (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC         (1u << 31)

/* Custom flush flags sent to drivers. */
/* fence is pre-populated with a fence created by the create_fence callback. */
#define TC_FLUSH_ASYNC        (1u << 31)

/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES        10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_CALLS_PER_BATCH    768

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES  512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, it will do instead:
 * - for buffers: DISCARD_RANGE is done by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES        320

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);

struct threaded_resource {
   struct pipe_resource b;
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for
    * unsynchronized mappings in the non-driver thread. Initially it's set
    * to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};

/* This is the second half of tc_call containing call data.
 * Most calls will typecast this to the type they need, typically larger
 * than 8 bytes.
 */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   struct pipe_fence_handle *fence;
   uint64_t handle;
};

#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif

/* Each call slot should be aligned to its own size for optimal cache usage. */
struct ALIGN16 tc_call {
   unsigned sentinel;
   ushort num_call_slots;
   ushort call_id;
   union tc_payload payload;
};

/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;
   struct threaded_context *tc;
};

struct tc_batch {
   struct pipe_context *pipe;
   unsigned sentinel;
   unsigned num_total_call_slots;
   struct tc_unflushed_batch_token *token;
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   tc_create_fence_func create_fence;
   unsigned map_buffer_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   struct util_queue queue;
   struct util_queue_fence *fence;
   unsigned last, next;

   struct tc_batch batch_slots[TC_MAX_BATCHES];
};

void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        tc_create_fence_func create_fence,
                        struct threaded_context **out);
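
/* A minimal sketch of driver-side usage (all my_* names are hypothetical).
 * The driver creates its real context first, then wraps it:
 *
 *    struct threaded_context *tc;
 *    struct pipe_context *ctx =
 *       threaded_context_create(my_driver_create_context(screen, priv),
 *                               &my_screen->transfer_pool,
 *                               my_replace_buffer_storage,
 *                               my_create_fence,  // NULL disables async flushes
 *                               &tc);
 *
 * All state tracker calls then go through the returned context, which
 * enqueues them for execution in the driver thread.
 */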

void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
      free(*dst);
   *dst = src;
}

#endif