gallium/u_threaded: don't map big VRAM buffers for the first upload directly
src/gallium/auxiliary/util/u_threaded_context.h
/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing (a sketch of such a screen
 *    function follows this list):
 *      pipe = threaded_context_unwrap_sync(pipe);
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero memory)
 *    - threaded_transfer for pipe_transfer (zero memory)
 *
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
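 *
 *    As a hedged illustration of guideline 1, a screen function such as
 *    fence_finish could unwrap the context roughly like this (the "mydrv_*"
 *    names are hypothetical, not a real driver API):
 *
 *      static bool mydrv_fence_finish(struct pipe_screen *screen,
 *                                     struct pipe_context *ctx,
 *                                     struct pipe_fence_handle *fence,
 *                                     uint64_t timeout)
 *      {
 *         // Unwrap and sync before touching the driver context.
 *         if (ctx)
 *            ctx = threaded_context_unwrap_sync(ctx);
 *
 *         return mydrv_fence_wait(screen, ctx, fence, timeout);
 *      }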
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context stuff.
 *       (e.g. LLVM target machine)
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 *
 * Create calls causing a sync that can't be async due to driver limitations:
 * - create_stream_output_target
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
 *    PIPE_TRANSFER_UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring
 *    the flag will lead to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to invalidate buffers on its own under any
 *    circumstances. This is necessary so that unsynchronized maps always map
 *    the latest version of the buffer: invalidations can be queued, while
 *    unsynchronized maps are not, so they must return the latest storage
 *    after invalidation. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
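 *
 *    As a hedged sketch (hypothetical "mydrv" driver; "rbuf", "box" and
 *    "usage" stand for the threaded_resource, the mapped range and the
 *    transfer flags), a driver's buffer transfer_map might honor rules 2
 *    and 3 along these lines:
 *
 *      // Rule 2: only infer an unsynchronized map when the flag forbidding
 *      // it isn't set (the threaded context always sets it).
 *      if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
 *                     TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
 *          !util_ranges_intersect(&rbuf->valid_buffer_range,
 *                                 box->x, box->x + box->width))
 *         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 *
 *      // Rule 3: never invalidate the buffer when the flag forbids it
 *      // (the threaded context always sets it).
 *      if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
 *          !(usage & TC_TRANSFER_MAP_NO_INVALIDATE))
 *         mydrv_invalidate_buffer(ctx, rbuf);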
 *
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way.
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
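 *
 *    A minimal sketch of such a callback (hypothetical "mydrv" driver with a
 *    pb_buffer-backed resource; a real implementation also has to rebind the
 *    buffer wherever it is currently bound):
 *
 *      static void mydrv_replace_buffer_storage(struct pipe_context *ctx,
 *                                               struct pipe_resource *dst,
 *                                               struct pipe_resource *src)
 *      {
 *         struct mydrv_resource *mdst = mydrv_resource(dst);
 *         struct mydrv_resource *msrc = mydrv_resource(src);
 *
 *         // Point dst at src's backing storage, then rebind dst everywhere.
 *         pb_reference(&mdst->buf, msrc->buf);
 *         mydrv_rebind_buffer(ctx, dst);
 *      }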
 *
 *
 * Performance gotchas
 * -------------------
 *
 * Buffer invalidations are done unconditionally - they don't check whether
 * the buffer is busy. This can cause drivers to have more live allocations
 * and CPU mappings than necessary.
 *
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue consisting of batches, each batch consisting
 * of call slots. Each call slot consists of an 8-byte header (call ID +
 * call size + constant 32-bit marker for integrity checking) and an 8-byte
 * body for per-call data. That is 16 bytes per call slot.
 *
 * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
 * calls occupy multiple call slots depending on the size needed by call
 * parameters. That means that calls can have a variable size in the batch.
 * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
 * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
 * Even though the first call slot can use only 8 bytes for data, additional
 * call slots used by the same call can use all 16 bytes for data.
 * For example, a call using 2 call slots has 24 bytes of space for data.
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 *
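 * Put differently, a call occupying N slots has 8 + (N - 1) * 16 bytes of
 * space for data, so the number of slots needed for a payload of "size"
 * bytes can be computed along these lines (a sketch of the idea, not
 * necessarily the exact expression used internally):
 *
 *    num_call_slots = 1 + DIV_ROUND_UP(MAX2(size, 8) - 8, 16);
 *
 * e.g. 8 bytes of data need 1 slot, 24 bytes need 2 slots, 40 bytes need 3.
 *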
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/slab.h"

/* These are transfer flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)

/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES 10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_CALLS_PER_BATCH 192

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES 512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, the threaded context does the
 * following instead:
 * - for buffers: DISCARD_RANGE is handled by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES 320

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);

struct threaded_resource {
   struct pipe_resource b;
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for unsynchro-
    * nized mappings in the non-driver thread. Initially it's set to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};
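
/* Example of how a driver is expected to subclass threaded_resource (a hedged
 * sketch; "mydrv_resource" and its extra fields are hypothetical, and "templ"
 * and "screen" are the usual resource_create parameters):
 *
 *    struct mydrv_resource {
 *       struct threaded_resource b;
 *       struct pb_buffer *buf;   // driver-specific backing storage
 *    };
 *
 *    // in resource_create:
 *    struct mydrv_resource *res = CALLOC_STRUCT(mydrv_resource);
 *    res->b.b = *templ;
 *    pipe_reference_init(&res->b.b.reference, 1);
 *    res->b.b.screen = screen;
 *    threaded_resource_init(&res->b.b);
 *
 *    // in resource_destroy:
 *    threaded_resource_deinit(&res->b.b);
 */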

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called after end_query. */
   bool flushed;
};

/* This is the second half of tc_call containing call data.
 * Most calls will cast this to the type they need, which is typically larger
 * than 8 bytes.
 */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   uint64_t handle;
};

#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif

/* Each call slot should be aligned to its own size for optimal cache usage. */
struct ALIGN16 tc_call {
   unsigned sentinel;
   ushort num_call_slots;
   ushort call_id;
   union tc_payload payload;
};

struct tc_batch {
   struct pipe_context *pipe;
   unsigned sentinel;
   unsigned num_total_call_slots;
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   unsigned map_buffer_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   struct util_queue queue;
   struct util_queue_fence *fence;

   unsigned last, next;
   struct tc_batch batch_slots[TC_MAX_BATCHES];
};

void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        struct threaded_context **out);
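
/* Typical usage when a driver creates its context (a hedged sketch; the
 * "mydrv_*" names and the transfer_pool field are hypothetical):
 *
 *    struct threaded_context *tc = NULL;
 *    struct pipe_context *ctx =
 *       threaded_context_create(mydrv_ctx, &mydrv_screen->transfer_pool,
 *                               mydrv_replace_buffer_storage, &tc);
 *
 * The returned context is the one handed to the state tracker; from then on
 * the wrapped driver context should only be used through it.
 */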

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

#endif