src/gallium/auxiliary/util/u_vbuf.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2011 Marek Olšák <maraeo@gmail.com>
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * This module uploads user buffers and translates the vertex buffers which
  30  * contain incompatible vertices (i.e. not supported by the driver/hardware)
  31  * into compatible ones, based on the Gallium CAPs.
  32  *
  33  * It does not upload index buffers.
  34  *
  35  * The module heavily uses bitmasks to represent per-buffer and
  36  * per-vertex-element flags to avoid looping over the list of buffers just
  37  * to see if there's a non-zero stride, or user buffer, or unsupported format,
  38  * etc.
  39  *
  40  * There are 3 categories of vertex elements, which are processed separately:
  41  * - per-vertex attribs (stride != 0, instance_divisor == 0)
  42  * - instanced attribs (stride != 0, instance_divisor > 0)
  43  * - constant attribs (stride == 0)
  44  *
  45  * All needed uploads and translations are performed every draw command, but
  46  * only the subset of vertices needed for that draw command is uploaded or
  47  * translated. (the module never translates whole buffers)
  48  *
  49  *
  50  * The module consists of two main parts:
  51  *
  52  *
  53  * 1) Translate (u_vbuf_translate_begin/end)
  54  *
  55  * This is pretty much a vertex fetch fallback. It translates vertices from
  56  * one vertex buffer to another in an unused vertex buffer slot. It does
  57  * whatever is needed to make the vertices readable by the hardware (changes
  58  * vertex formats and aligns offsets and strides). The translate module is
  59  * used here.
  60  *
  61  * Each of the 3 categories is translated to a separate buffer.
  62  * Only the [min_index, max_index] range is translated. For instanced attribs,
  63  * the range is [start_instance, start_instance+instance_count]. For constant
  64  * attribs, the range is [0, 1].
  65  *
  66  *
  67  * 2) User buffer uploading (u_vbuf_upload_buffers)
  68  *
  69  * Only the [min_index, max_index] range is uploaded (just like Translate)
  70  * with a single memcpy.
  71  *
 * This method works best for non-indexed draw operations, or for indexed
 * draw operations where the [min_index, max_index] range is not much larger
 * than the vertex count.
  75  *
  76  * If the range is too big (e.g. one triangle with indices {0, 1, 10000}),
  77  * the per-vertex attribs are uploaded via the translate module, all packed
  78  * into one vertex buffer, and the indexed draw call is turned into
  79  * a non-indexed one in the process. This adds additional complexity
  80  * to the translate part, but it prevents bad apps from bringing your frame
  81  * rate down.
  82  *
  83  *
  84  * If there is nothing to do, it forwards every command to the driver.
  85  * The module also has its own CSO cache of vertex element states.
  86  */
  87
  88 #include "util/u_vbuf.h"
  89
  90 #include "util/u_dump.h"
  91 #include "util/format/u_format.h"
  92 #include "util/u_inlines.h"
  93 #include "util/u_memory.h"
  94 #include "util/u_screen.h"
  95 #include "util/u_upload_mgr.h"
  96 #include "translate/translate.h"
  97 #include "translate/translate_cache.h"
  98 #include "cso_cache/cso_cache.h"
  99 #include "cso_cache/cso_hash.h"
 100
 101 struct u_vbuf_elements {
 102    unsigned count;
 103    struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
 104
 105    unsigned src_format_size[PIPE_MAX_ATTRIBS];
 106
 107    /* If (velem[i].src_format != native_format[i]), the vertex buffer
 108     * referenced by the vertex element cannot be used for rendering and
 109     * its vertex data must be translated to native_format[i]. */
 110    enum pipe_format native_format[PIPE_MAX_ATTRIBS];
 111    unsigned native_format_size[PIPE_MAX_ATTRIBS];
 112
 113    /* Which buffers are used by the vertex element state. */
 114    uint32_t used_vb_mask;
 115    /* This might mean two things:
 116     * - src_format != native_format, as discussed above.
 117     * - src_offset % 4 != 0 (if the caps don't allow such an offset). */
 118    uint32_t incompatible_elem_mask; /* each bit describes a corresp. attrib  */
 119    /* Which buffer has at least one vertex element referencing it
 120     * incompatible. */
 121    uint32_t incompatible_vb_mask_any;
 122    /* Which buffer has all vertex elements referencing it incompatible. */
 123    uint32_t incompatible_vb_mask_all;
 124    /* Which buffer has at least one vertex element referencing it
 125     * compatible. */
 126    uint32_t compatible_vb_mask_any;
 127    /* Which buffer has all vertex elements referencing it compatible. */
 128    uint32_t compatible_vb_mask_all;
 129
 130    /* Which buffer has at least one vertex element referencing it
 131     * non-instanced. */
 132    uint32_t noninstance_vb_mask_any;
 133
 134    void *driver_cso;
 135 };
 136
 137 enum {
 138    VB_VERTEX = 0,
 139    VB_INSTANCE = 1,
 140    VB_CONST = 2,
 141    VB_NUM = 3
 142 };
 143
 144 struct u_vbuf {
 145    struct u_vbuf_caps caps;
 146    bool has_signed_vb_offset;
 147
 148    struct pipe_context *pipe;
 149    struct translate_cache *translate_cache;
 150    struct cso_cache *cso_cache;
 151
 152    /* This is what was set in set_vertex_buffers.
 153     * May contain user buffers. */
 154    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
 155    uint32_t enabled_vb_mask;
 156
 157    /* Saved vertex buffer. */
 158    struct pipe_vertex_buffer vertex_buffer0_saved;
 159
 160    /* Vertex buffers for the driver.
 161     * There are usually no user buffers. */
 162    struct pipe_vertex_buffer real_vertex_buffer[PIPE_MAX_ATTRIBS];
 163    uint32_t dirty_real_vb_mask; /* which buffers are dirty since the last
 164                                    call of set_vertex_buffers */
 165
 166    /* Vertex elements. */
 167    struct u_vbuf_elements *ve, *ve_saved;
 168
 169    /* Vertex elements used for the translate fallback. */
 170    struct pipe_vertex_element fallback_velems[PIPE_MAX_ATTRIBS];
 171    /* If non-NULL, this is a vertex element state used for the translate
 172     * fallback and therefore used for rendering too. */
 173    boolean using_translate;
 174    /* The vertex buffer slot index where translated vertices have been
 175     * stored in. */
 176    unsigned fallback_vbs[VB_NUM];
 177
 178    /* Which buffer is a user buffer. */
 179    uint32_t user_vb_mask; /* each bit describes a corresp. buffer */
 180    /* Which buffer is incompatible (unaligned). */
 181    uint32_t incompatible_vb_mask; /* each bit describes a corresp. buffer */
 182    /* Which buffer has a non-zero stride. */
 183    uint32_t nonzero_stride_vb_mask; /* each bit describes a corresp. buffer */
 184    /* Which buffers are allowed (supported by hardware). */
 185    uint32_t allowed_vb_mask;
 186 };
 187
 188 static void *
 189 u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 190                               const struct pipe_vertex_element *attribs);
 191 static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso);
 192
 193 static const struct {
 194    enum pipe_format from, to;
 195 } vbuf_format_fallbacks[] = {
 196    { PIPE_FORMAT_R32_FIXED,            PIPE_FORMAT_R32_FLOAT },
 197    { PIPE_FORMAT_R32G32_FIXED,         PIPE_FORMAT_R32G32_FLOAT },
 198    { PIPE_FORMAT_R32G32B32_FIXED,      PIPE_FORMAT_R32G32B32_FLOAT },
 199    { PIPE_FORMAT_R32G32B32A32_FIXED,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 200    { PIPE_FORMAT_R16_FLOAT,            PIPE_FORMAT_R32_FLOAT },
 201    { PIPE_FORMAT_R16G16_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
 202    { PIPE_FORMAT_R16G16B16_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
 203    { PIPE_FORMAT_R16G16B16A16_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 204    { PIPE_FORMAT_R64_FLOAT,            PIPE_FORMAT_R32_FLOAT },
 205    { PIPE_FORMAT_R64G64_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
 206    { PIPE_FORMAT_R64G64B64_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
 207    { PIPE_FORMAT_R64G64B64A64_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 208    { PIPE_FORMAT_R32_UNORM,            PIPE_FORMAT_R32_FLOAT },
 209    { PIPE_FORMAT_R32G32_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
 210    { PIPE_FORMAT_R32G32B32_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 211    { PIPE_FORMAT_R32G32B32A32_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 212    { PIPE_FORMAT_R32_SNORM,            PIPE_FORMAT_R32_FLOAT },
 213    { PIPE_FORMAT_R32G32_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
 214    { PIPE_FORMAT_R32G32B32_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 215    { PIPE_FORMAT_R32G32B32A32_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 216    { PIPE_FORMAT_R32_USCALED,          PIPE_FORMAT_R32_FLOAT },
 217    { PIPE_FORMAT_R32G32_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
 218    { PIPE_FORMAT_R32G32B32_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 219    { PIPE_FORMAT_R32G32B32A32_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 220    { PIPE_FORMAT_R32_SSCALED,          PIPE_FORMAT_R32_FLOAT },
 221    { PIPE_FORMAT_R32G32_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
 222    { PIPE_FORMAT_R32G32B32_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 223    { PIPE_FORMAT_R32G32B32A32_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 224    { PIPE_FORMAT_R16_UNORM,            PIPE_FORMAT_R32_FLOAT },
 225    { PIPE_FORMAT_R16G16_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
 226    { PIPE_FORMAT_R16G16B16_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 227    { PIPE_FORMAT_R16G16B16A16_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 228    { PIPE_FORMAT_R16_SNORM,            PIPE_FORMAT_R32_FLOAT },
 229    { PIPE_FORMAT_R16G16_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
 230    { PIPE_FORMAT_R16G16B16_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 231    { PIPE_FORMAT_R16G16B16A16_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 232    { PIPE_FORMAT_R16_USCALED,          PIPE_FORMAT_R32_FLOAT },
 233    { PIPE_FORMAT_R16G16_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
 234    { PIPE_FORMAT_R16G16B16_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 235    { PIPE_FORMAT_R16G16B16A16_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 236    { PIPE_FORMAT_R16_SSCALED,          PIPE_FORMAT_R32_FLOAT },
 237    { PIPE_FORMAT_R16G16_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
 238    { PIPE_FORMAT_R16G16B16_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 239    { PIPE_FORMAT_R16G16B16A16_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 240    { PIPE_FORMAT_R8_UNORM,             PIPE_FORMAT_R32_FLOAT },
 241    { PIPE_FORMAT_R8G8_UNORM,           PIPE_FORMAT_R32G32_FLOAT },
 242    { PIPE_FORMAT_R8G8B8_UNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
 243    { PIPE_FORMAT_R8G8B8A8_UNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
 244    { PIPE_FORMAT_R8_SNORM,             PIPE_FORMAT_R32_FLOAT },
 245    { PIPE_FORMAT_R8G8_SNORM,           PIPE_FORMAT_R32G32_FLOAT },
 246    { PIPE_FORMAT_R8G8B8_SNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
 247    { PIPE_FORMAT_R8G8B8A8_SNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
 248    { PIPE_FORMAT_R8_USCALED,           PIPE_FORMAT_R32_FLOAT },
 249    { PIPE_FORMAT_R8G8_USCALED,         PIPE_FORMAT_R32G32_FLOAT },
 250    { PIPE_FORMAT_R8G8B8_USCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
 251    { PIPE_FORMAT_R8G8B8A8_USCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
 252    { PIPE_FORMAT_R8_SSCALED,           PIPE_FORMAT_R32_FLOAT },
 253    { PIPE_FORMAT_R8G8_SSCALED,         PIPE_FORMAT_R32G32_FLOAT },
 254    { PIPE_FORMAT_R8G8B8_SSCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
 255    { PIPE_FORMAT_R8G8B8A8_SSCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
 256 };
 257
 258 boolean u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps,
 259                         unsigned flags)
 260 {
 261    unsigned i;
 262    boolean fallback = FALSE;
 263
 264    /* I'd rather have a bitfield of which formats are supported and a static
 265     * table of the translations indexed by format, but since we don't have C99
 266     * we can't easily make a sparsely-populated table indexed by format.  So,
 267     * we construct the sparse table here.
 268     */
 269    for (i = 0; i < PIPE_FORMAT_COUNT; i++)
 270       caps->format_translation[i] = i;
 271
 272    for (i = 0; i < ARRAY_SIZE(vbuf_format_fallbacks); i++) {
 273       enum pipe_format format = vbuf_format_fallbacks[i].from;
 274
 275       if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0, 0,
 276                                        PIPE_BIND_VERTEX_BUFFER)) {
 277          caps->format_translation[format] = vbuf_format_fallbacks[i].to;
 278          fallback = TRUE;
 279       }
 280    }
 281
 282    caps->buffer_offset_unaligned =
 283       !screen->get_param(screen,
 284                          PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY);
 285    caps->buffer_stride_unaligned =
 286      !screen->get_param(screen,
 287                         PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY);
 288    caps->velem_src_offset_unaligned =
 289       !screen->get_param(screen,
 290                          PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY);
 291    caps->user_vertex_buffers =
 292       screen->get_param(screen, PIPE_CAP_USER_VERTEX_BUFFERS);
 293    caps->max_vertex_buffers =
 294       screen->get_param(screen, PIPE_CAP_MAX_VERTEX_BUFFERS);
 295
 296    /* OpenGL 2.0 requires a minimum of 16 vertex buffers */
 297    if (caps->max_vertex_buffers < 16)
 298       fallback = TRUE;
 299
 300    if (!caps->buffer_offset_unaligned ||
 301        !caps->buffer_stride_unaligned ||
 302        !caps->velem_src_offset_unaligned ||
 303        (!(flags & U_VBUF_FLAG_NO_USER_VBOS) && !caps->user_vertex_buffers)) {
 304       fallback = TRUE;
 305    }
 306
 307    return fallback;
 308 }
 309
 310 struct u_vbuf *
 311 u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps)
 312 {
 313    struct u_vbuf *mgr = CALLOC_STRUCT(u_vbuf);
 314
 315    mgr->caps = *caps;
 316    mgr->pipe = pipe;
 317    mgr->cso_cache = cso_cache_create();
 318    mgr->translate_cache = translate_cache_create();
 319    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 320    mgr->allowed_vb_mask = u_bit_consecutive(0, mgr->caps.max_vertex_buffers);
 321
 322    mgr->has_signed_vb_offset =
 323       pipe->screen->get_param(pipe->screen,
 324                               PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET);
 325
 326    return mgr;
 327 }
 328
 329 /* u_vbuf uses its own caching for vertex elements, because it needs to keep
 330  * its own preprocessed state per vertex element CSO. */
 331 static struct u_vbuf_elements *
 332 u_vbuf_set_vertex_elements_internal(struct u_vbuf *mgr, unsigned count,
 333                                     const struct pipe_vertex_element *states)
 334 {
 335    struct pipe_context *pipe = mgr->pipe;
 336    unsigned key_size, hash_key;
 337    struct cso_hash_iter iter;
 338    struct u_vbuf_elements *ve;
 339    struct cso_velems_state velems_state;
 340
 341    /* need to include the count into the stored state data too. */
 342    key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned);
 343    velems_state.count = count;
 344    memcpy(velems_state.velems, states,
 345           sizeof(struct pipe_vertex_element) * count);
 346    hash_key = cso_construct_key((void*)&velems_state, key_size);
 347    iter = cso_find_state_template(mgr->cso_cache, hash_key, CSO_VELEMENTS,
 348                                   (void*)&velems_state, key_size);
 349
 350    if (cso_hash_iter_is_null(iter)) {
 351       struct cso_velements *cso = MALLOC_STRUCT(cso_velements);
 352       memcpy(&cso->state, &velems_state, key_size);
 353       cso->data = u_vbuf_create_vertex_elements(mgr, count, states);
 354       cso->delete_state = (cso_state_callback)u_vbuf_delete_vertex_elements;
 355       cso->context = (void*)mgr;
 356
 357       iter = cso_insert_state(mgr->cso_cache, hash_key, CSO_VELEMENTS, cso);
 358       ve = cso->data;
 359    } else {
 360       ve = ((struct cso_velements *)cso_hash_iter_data(iter))->data;
 361    }
 362
 363    assert(ve);
 364
 365    if (ve != mgr->ve)
 366       pipe->bind_vertex_elements_state(pipe, ve->driver_cso);
 367
 368    return ve;
 369 }
 370
 371 void u_vbuf_set_vertex_elements(struct u_vbuf *mgr, unsigned count,
 372                                const struct pipe_vertex_element *states)
 373 {
 374    mgr->ve = u_vbuf_set_vertex_elements_internal(mgr, count, states);
 375 }
 376
 377 void u_vbuf_destroy(struct u_vbuf *mgr)
 378 {
 379    struct pipe_screen *screen = mgr->pipe->screen;
 380    unsigned i;
 381    const unsigned num_vb = screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
 382                                                     PIPE_SHADER_CAP_MAX_INPUTS);
 383
 384    mgr->pipe->set_vertex_buffers(mgr->pipe, 0, num_vb, NULL);
 385
 386    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 387       pipe_vertex_buffer_unreference(&mgr->vertex_buffer[i]);
 388    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 389       pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[i]);
 390
 391    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
 392
 393    translate_cache_destroy(mgr->translate_cache);
 394    cso_cache_delete(mgr->cso_cache);
 395    FREE(mgr);
 396 }
 397
 398 static enum pipe_error
 399 u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 400                          const struct pipe_draw_info *info,
 401                          unsigned vb_mask, unsigned out_vb,
 402                          int start_vertex, unsigned num_vertices,
 403                          int min_index, boolean unroll_indices)
 404 {
 405    struct translate *tr;
 406    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
 407    struct pipe_resource *out_buffer = NULL;
 408    uint8_t *out_map;
 409    unsigned out_offset, mask;
 410
 411    /* Get a translate object. */
 412    tr = translate_cache_find(mgr->translate_cache, key);
 413
 414    /* Map buffers we want to translate. */
 415    mask = vb_mask;
 416    while (mask) {
 417       struct pipe_vertex_buffer *vb;
 418       unsigned offset;
 419       uint8_t *map;
 420       unsigned i = u_bit_scan(&mask);
 421
 422       vb = &mgr->vertex_buffer[i];
 423       offset = vb->buffer_offset + vb->stride * start_vertex;
 424
 425       if (vb->is_user_buffer) {
 426          map = (uint8_t*)vb->buffer.user + offset;
 427       } else {
 428          unsigned size = vb->stride ? num_vertices * vb->stride
 429                                     : sizeof(double)*4;
 430
 431          if (!vb->buffer.resource)
 432             continue;
 433
 434          if (offset + size > vb->buffer.resource->width0) {
 435             /* Don't try to map past end of buffer.  This often happens when
 436              * we're translating an attribute that's at offset > 0 from the
 437              * start of the vertex.  If we'd subtract attrib's offset from
 438              * the size, this probably wouldn't happen.
 439              */
 440             size = vb->buffer.resource->width0 - offset;
 441
 442             /* Also adjust num_vertices.  A common user error is to call
 443              * glDrawRangeElements() with incorrect 'end' argument.  The 'end
 444              * value should be the max index value, but people often
 445              * accidentally add one to this value.  This adjustment avoids
 446              * crashing (by reading past the end of a hardware buffer mapping)
 447              * when people do that.
 448              */
 449             num_vertices = (size + vb->stride - 1) / vb->stride;
 450          }
 451
 452          map = pipe_buffer_map_range(mgr->pipe, vb->buffer.resource, offset, size,
 453                                      PIPE_TRANSFER_READ, &vb_transfer[i]);
 454       }
 455
 456       /* Subtract min_index so that indexing with the index buffer works. */
 457       if (unroll_indices) {
 458          map -= (ptrdiff_t)vb->stride * min_index;
 459       }
 460
 461       tr->set_buffer(tr, i, map, vb->stride, info->max_index);
 462    }
 463
 464    /* Translate. */
 465    if (unroll_indices) {
 466       struct pipe_transfer *transfer = NULL;
 467       const unsigned offset = info->start * info->index_size;
 468       uint8_t *map;
 469
 470       /* Create and map the output buffer. */
 471       u_upload_alloc(mgr->pipe->stream_uploader, 0,
 472                      key->output_stride * info->count, 4,
 473                      &out_offset, &out_buffer,
 474                      (void**)&out_map);
 475       if (!out_buffer)
 476          return PIPE_ERROR_OUT_OF_MEMORY;
 477
 478       if (info->has_user_indices) {
 479          map = (uint8_t*)info->index.user + offset;
 480       } else {
 481          map = pipe_buffer_map_range(mgr->pipe, info->index.resource, offset,
 482                                      info->count * info->index_size,
 483                                      PIPE_TRANSFER_READ, &transfer);
 484       }
 485
 486       switch (info->index_size) {
 487       case 4:
 488          tr->run_elts(tr, (unsigned*)map, info->count, 0, 0, out_map);
 489          break;
 490       case 2:
 491          tr->run_elts16(tr, (uint16_t*)map, info->count, 0, 0, out_map);
 492          break;
 493       case 1:
 494          tr->run_elts8(tr, map, info->count, 0, 0, out_map);
 495          break;
 496       }
 497
 498       if (transfer) {
 499          pipe_buffer_unmap(mgr->pipe, transfer);
 500       }
 501    } else {
 502       /* Create and map the output buffer. */
 503       u_upload_alloc(mgr->pipe->stream_uploader,
 504                      mgr->has_signed_vb_offset ?
 505                         0 : key->output_stride * start_vertex,
 506                      key->output_stride * num_vertices, 4,
 507                      &out_offset, &out_buffer,
 508                      (void**)&out_map);
 509       if (!out_buffer)
 510          return PIPE_ERROR_OUT_OF_MEMORY;
 511
 512       out_offset -= key->output_stride * start_vertex;
 513
 514       tr->run(tr, 0, num_vertices, 0, 0, out_map);
 515    }
 516
 517    /* Unmap all buffers. */
 518    mask = vb_mask;
 519    while (mask) {
 520       unsigned i = u_bit_scan(&mask);
 521
 522       if (vb_transfer[i]) {
 523          pipe_buffer_unmap(mgr->pipe, vb_transfer[i]);
 524       }
 525    }
 526
 527    /* Setup the new vertex buffer. */
 528    mgr->real_vertex_buffer[out_vb].buffer_offset = out_offset;
 529    mgr->real_vertex_buffer[out_vb].stride = key->output_stride;
 530
 531    /* Move the buffer reference. */
 532    pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[out_vb]);
 533    mgr->real_vertex_buffer[out_vb].buffer.resource = out_buffer;
 534    mgr->real_vertex_buffer[out_vb].is_user_buffer = false;
 535
 536    return PIPE_OK;
 537 }
 538
 539 static boolean
 540 u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,
 541                                     unsigned mask[VB_NUM])
 542 {
 543    unsigned type;
 544    unsigned fallback_vbs[VB_NUM];
 545    /* Set the bit for each buffer which is incompatible, or isn't set. */
 546    uint32_t unused_vb_mask =
 547       mgr->ve->incompatible_vb_mask_all | mgr->incompatible_vb_mask |
 548       ~mgr->enabled_vb_mask;
 549    uint32_t unused_vb_mask_orig;
 550    boolean insufficient_buffers = false;
 551
 552    /* No vertex buffers available at all */
 553    if (!unused_vb_mask)
 554       return FALSE;
 555
 556    memset(fallback_vbs, ~0, sizeof(fallback_vbs));
 557
 558    /* Find free slots for each type if needed. */
 559    unused_vb_mask_orig = unused_vb_mask;
 560    for (type = 0; type < VB_NUM; type++) {
 561       if (mask[type]) {
 562          uint32_t index;
 563
 564          if (!unused_vb_mask) {
 565             insufficient_buffers = true;
 566             break;
 567          }
 568
 569          index = ffs(unused_vb_mask) - 1;
 570          fallback_vbs[type] = index;
 571          unused_vb_mask &= ~(1 << index);
 572          /*printf("found slot=%i for type=%i\n", index, type);*/
 573       }
 574    }
 575
 576    if (insufficient_buffers) {
 577       /* not enough vbs for all types supported by the hardware, they will have to share one
 578        * buffer */
 579       uint32_t index = ffs(unused_vb_mask_orig) - 1;
 580       /* When sharing one vertex buffer use per-vertex frequency for everything. */
 581       fallback_vbs[VB_VERTEX] = index;
 582       mask[VB_VERTEX] = mask[VB_VERTEX] | mask[VB_CONST] | mask[VB_INSTANCE];
 583       mask[VB_CONST] = 0;
 584       mask[VB_INSTANCE] = 0;
 585    }
 586
 587    for (type = 0; type < VB_NUM; type++) {
 588       if (mask[type]) {
 589          mgr->dirty_real_vb_mask |= 1 << fallback_vbs[type];
 590       }
 591    }
 592
 593    memcpy(mgr->fallback_vbs, fallback_vbs, sizeof(fallback_vbs));
 594    return TRUE;
 595 }
 596
 597 static boolean
 598 u_vbuf_translate_begin(struct u_vbuf *mgr,
 599                        const struct pipe_draw_info *info,
 600                        int start_vertex, unsigned num_vertices,
 601                        int min_index, boolean unroll_indices)
 602 {
 603    unsigned mask[VB_NUM] = {0};
 604    struct translate_key key[VB_NUM];
 605    unsigned elem_index[VB_NUM][PIPE_MAX_ATTRIBS]; /* ... into key.elements */
 606    unsigned i, type;
 607    const unsigned incompatible_vb_mask = mgr->incompatible_vb_mask &
 608                                          mgr->ve->used_vb_mask;
 609
 610    const int start[VB_NUM] = {
 611       start_vertex,           /* VERTEX */
 612       info->start_instance,   /* INSTANCE */
 613       0                       /* CONST */
 614    };
 615
 616    const unsigned num[VB_NUM] = {
 617       num_vertices,           /* VERTEX */
 618       info->instance_count,   /* INSTANCE */
 619       1                       /* CONST */
 620    };
 621
 622    memset(key, 0, sizeof(key));
 623    memset(elem_index, ~0, sizeof(elem_index));
 624
 625    /* See if there are vertex attribs of each type to translate and
 626     * which ones. */
 627    for (i = 0; i < mgr->ve->count; i++) {
 628       unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;
 629
 630       if (!mgr->vertex_buffer[vb_index].stride) {
 631          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 632              !(incompatible_vb_mask & (1 << vb_index))) {
 633             continue;
 634          }
 635          mask[VB_CONST] |= 1 << vb_index;
 636       } else if (mgr->ve->ve[i].instance_divisor) {
 637          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 638              !(incompatible_vb_mask & (1 << vb_index))) {
 639             continue;
 640          }
 641          mask[VB_INSTANCE] |= 1 << vb_index;
 642       } else {
 643          if (!unroll_indices &&
 644              !(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 645              !(incompatible_vb_mask & (1 << vb_index))) {
 646             continue;
 647          }
 648          mask[VB_VERTEX] |= 1 << vb_index;
 649       }
 650    }
 651
 652    assert(mask[VB_VERTEX] || mask[VB_INSTANCE] || mask[VB_CONST]);
 653
 654    /* Find free vertex buffer slots. */
 655    if (!u_vbuf_translate_find_free_vb_slots(mgr, mask)) {
 656       return FALSE;
 657    }
 658
 659    /* Initialize the translate keys. */
 660    for (i = 0; i < mgr->ve->count; i++) {
 661       struct translate_key *k;
 662       struct translate_element *te;
 663       enum pipe_format output_format = mgr->ve->native_format[i];
 664       unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
 665       bit = 1 << vb_index;
 666
 667       if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 668           !(incompatible_vb_mask & (1 << vb_index)) &&
 669           (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
 670          continue;
 671       }
 672
 673       /* Set type to what we will translate.
 674        * Whether vertex, instance, or constant attribs. */
 675       for (type = 0; type < VB_NUM; type++) {
 676          if (mask[type] & bit) {
 677             break;
 678          }
 679       }
 680       assert(type < VB_NUM);
 681       if (mgr->ve->ve[i].src_format != output_format)
 682          assert(translate_is_output_format_supported(output_format));
 683       /*printf("velem=%i type=%i\n", i, type);*/
 684
 685       /* Add the vertex element. */
 686       k = &key[type];
 687       elem_index[type][i] = k->nr_elements;
 688
 689       te = &k->element[k->nr_elements];
 690       te->type = TRANSLATE_ELEMENT_NORMAL;
 691       te->instance_divisor = 0;
 692       te->input_buffer = vb_index;
 693       te->input_format = mgr->ve->ve[i].src_format;
 694       te->input_offset = mgr->ve->ve[i].src_offset;
 695       te->output_format = output_format;
 696       te->output_offset = k->output_stride;
 697
 698       k->output_stride += mgr->ve->native_format_size[i];
 699       k->nr_elements++;
 700    }
 701
 702    /* Translate buffers. */
 703    for (type = 0; type < VB_NUM; type++) {
 704       if (key[type].nr_elements) {
 705          enum pipe_error err;
 706          err = u_vbuf_translate_buffers(mgr, &key[type], info, mask[type],
 707                                         mgr->fallback_vbs[type],
 708                                         start[type], num[type], min_index,
 709                                         unroll_indices && type == VB_VERTEX);
 710          if (err != PIPE_OK)
 711             return FALSE;
 712
 713          /* Fixup the stride for constant attribs. */
 714          if (type == VB_CONST) {
 715             mgr->real_vertex_buffer[mgr->fallback_vbs[VB_CONST]].stride = 0;
 716          }
 717       }
 718    }
 719
 720    /* Setup new vertex elements. */
 721    for (i = 0; i < mgr->ve->count; i++) {
 722       for (type = 0; type < VB_NUM; type++) {
 723          if (elem_index[type][i] < key[type].nr_elements) {
 724             struct translate_element *te = &key[type].element[elem_index[type][i]];
 725             mgr->fallback_velems[i].instance_divisor = mgr->ve->ve[i].instance_divisor;
 726             mgr->fallback_velems[i].src_format = te->output_format;
 727             mgr->fallback_velems[i].src_offset = te->output_offset;
 728             mgr->fallback_velems[i].vertex_buffer_index = mgr->fallback_vbs[type];
 729
 730             /* elem_index[type][i] can only be set for one type. */
 731             assert(type > VB_INSTANCE || elem_index[type+1][i] == ~0u);
 732             assert(type > VB_VERTEX   || elem_index[type+2][i] == ~0u);
 733             break;
 734          }
 735       }
 736       /* No translating, just copy the original vertex element over. */
 737       if (type == VB_NUM) {
 738          memcpy(&mgr->fallback_velems[i], &mgr->ve->ve[i],
 739                 sizeof(struct pipe_vertex_element));
 740       }
 741    }
 742
 743    u_vbuf_set_vertex_elements_internal(mgr, mgr->ve->count,
 744                                        mgr->fallback_velems);
 745    mgr->using_translate = TRUE;
 746    return TRUE;
 747 }
 748
 749 static void u_vbuf_translate_end(struct u_vbuf *mgr)
 750 {
 751    unsigned i;
 752
 753    /* Restore vertex elements. */
 754    mgr->pipe->bind_vertex_elements_state(mgr->pipe, mgr->ve->driver_cso);
 755    mgr->using_translate = FALSE;
 756
 757    /* Unreference the now-unused VBOs. */
 758    for (i = 0; i < VB_NUM; i++) {
 759       unsigned vb = mgr->fallback_vbs[i];
 760       if (vb != ~0u) {
 761          pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer.resource, NULL);
 762          mgr->fallback_vbs[i] = ~0;
 763
 764          /* This will cause the buffer to be unbound in the driver later. */
 765          mgr->dirty_real_vb_mask |= 1 << vb;
 766       }
 767    }
 768 }
 769
 770 static void *
 771 u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 772                               const struct pipe_vertex_element *attribs)
 773 {
 774    struct pipe_context *pipe = mgr->pipe;
 775    unsigned i;
 776    struct pipe_vertex_element driver_attribs[PIPE_MAX_ATTRIBS];
 777    struct u_vbuf_elements *ve = CALLOC_STRUCT(u_vbuf_elements);
 778    uint32_t used_buffers = 0;
 779
 780    ve->count = count;
 781
 782    memcpy(ve->ve, attribs, sizeof(struct pipe_vertex_element) * count);
 783    memcpy(driver_attribs, attribs, sizeof(struct pipe_vertex_element) * count);
 784
 785    /* Set the best native format in case the original format is not
 786     * supported. */
 787    for (i = 0; i < count; i++) {
 788       enum pipe_format format = ve->ve[i].src_format;
 789
 790       ve->src_format_size[i] = util_format_get_blocksize(format);
 791
 792       used_buffers |= 1 << ve->ve[i].vertex_buffer_index;
 793
 794       if (!ve->ve[i].instance_divisor) {
 795          ve->noninstance_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 796       }
 797
 798       format = mgr->caps.format_translation[format];
 799
 800       driver_attribs[i].src_format = format;
 801       ve->native_format[i] = format;
 802       ve->native_format_size[i] =
 803             util_format_get_blocksize(ve->native_format[i]);
 804
 805       if (ve->ve[i].src_format != format ||
 806           (!mgr->caps.velem_src_offset_unaligned &&
 807            ve->ve[i].src_offset % 4 != 0)) {
 808          ve->incompatible_elem_mask |= 1 << i;
 809          ve->incompatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 810       } else {
 811          ve->compatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 812       }
 813    }
 814
 815    if (used_buffers & ~mgr->allowed_vb_mask) {
 816       /* More vertex buffers are used than the hardware supports.  In
 817        * principle, we only need to make sure that less vertex buffers are
 818        * used, and mark some of the latter vertex buffers as incompatible.
 819        * For now, mark all vertex buffers as incompatible.
 820        */
 821       ve->incompatible_vb_mask_any = used_buffers;
 822       ve->compatible_vb_mask_any = 0;
 823       ve->incompatible_elem_mask = u_bit_consecutive(0, count);
 824    }
 825
 826    ve->used_vb_mask = used_buffers;
 827    ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers;
 828    ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers;
 829
 830    /* Align the formats and offsets to the size of DWORD if needed. */
 831    if (!mgr->caps.velem_src_offset_unaligned) {
 832       for (i = 0; i < count; i++) {
 833          ve->native_format_size[i] = align(ve->native_format_size[i], 4);
 834          driver_attribs[i].src_offset = align(ve->ve[i].src_offset, 4);
 835       }
 836    }
 837
 838    /* Only create driver CSO if no incompatible elements */
 839    if (!ve->incompatible_elem_mask) {
 840       ve->driver_cso =
 841          pipe->create_vertex_elements_state(pipe, count, driver_attribs);
 842    }
 843
 844    return ve;
 845 }
 846
 847 static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso)
 848 {
 849    struct pipe_context *pipe = mgr->pipe;
 850    struct u_vbuf_elements *ve = cso;
 851
 852    pipe->delete_vertex_elements_state(pipe, ve->driver_cso);
 853    FREE(ve);
 854 }
 855
 856 void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
 857                                unsigned start_slot, unsigned count,
 858                                const struct pipe_vertex_buffer *bufs)
 859 {
 860    unsigned i;
 861    /* which buffers are enabled */
 862    uint32_t enabled_vb_mask = 0;
 863    /* which buffers are in user memory */
 864    uint32_t user_vb_mask = 0;
 865    /* which buffers are incompatible with the driver */
 866    uint32_t incompatible_vb_mask = 0;
 867    /* which buffers have a non-zero stride */
 868    uint32_t nonzero_stride_vb_mask = 0;
 869    const uint32_t mask = ~(((1ull << count) - 1) << start_slot);
 870
 871    /* Zero out the bits we are going to rewrite completely. */
 872    mgr->user_vb_mask &= mask;
 873    mgr->incompatible_vb_mask &= mask;
 874    mgr->nonzero_stride_vb_mask &= mask;
 875    mgr->enabled_vb_mask &= mask;
 876
 877    if (!bufs) {
 878       struct pipe_context *pipe = mgr->pipe;
 879       /* Unbind. */
 880       mgr->dirty_real_vb_mask &= mask;
 881
 882       for (i = 0; i < count; i++) {
 883          unsigned dst_index = start_slot + i;
 884
 885          pipe_vertex_buffer_unreference(&mgr->vertex_buffer[dst_index]);
 886          pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[dst_index]);
 887       }
 888
 889       pipe->set_vertex_buffers(pipe, start_slot, count, NULL);
 890       return;
 891    }
 892
 893    for (i = 0; i < count; i++) {
 894       unsigned dst_index = start_slot + i;
 895       const struct pipe_vertex_buffer *vb = &bufs[i];
 896       struct pipe_vertex_buffer *orig_vb = &mgr->vertex_buffer[dst_index];
 897       struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[dst_index];
 898
 899       if (!vb->buffer.resource) {
 900          pipe_vertex_buffer_unreference(orig_vb);
 901          pipe_vertex_buffer_unreference(real_vb);
 902          continue;
 903       }
 904
 905       pipe_vertex_buffer_reference(orig_vb, vb);
 906
 907       if (vb->stride) {
 908          nonzero_stride_vb_mask |= 1 << dst_index;
 909       }
 910       enabled_vb_mask |= 1 << dst_index;
 911
 912       if ((!mgr->caps.buffer_offset_unaligned && vb->buffer_offset % 4 != 0) ||
 913           (!mgr->caps.buffer_stride_unaligned && vb->stride % 4 != 0)) {
 914          incompatible_vb_mask |= 1 << dst_index;
 915          real_vb->buffer_offset = vb->buffer_offset;
 916          real_vb->stride = vb->stride;
 917          pipe_vertex_buffer_unreference(real_vb);
 918          real_vb->is_user_buffer = false;
 919          continue;
 920       }
 921
 922       if (!mgr->caps.user_vertex_buffers && vb->is_user_buffer) {
 923          user_vb_mask |= 1 << dst_index;
 924          real_vb->buffer_offset = vb->buffer_offset;
 925          real_vb->stride = vb->stride;
 926          pipe_vertex_buffer_unreference(real_vb);
 927          real_vb->is_user_buffer = false;
 928          continue;
 929       }
 930
 931       pipe_vertex_buffer_reference(real_vb, vb);
 932    }
 933
 934    mgr->user_vb_mask |= user_vb_mask;
 935    mgr->incompatible_vb_mask |= incompatible_vb_mask;
 936    mgr->nonzero_stride_vb_mask |= nonzero_stride_vb_mask;
 937    mgr->enabled_vb_mask |= enabled_vb_mask;
 938
 939    /* All changed buffers are marked as dirty, even the NULL ones,
 940     * which will cause the NULL buffers to be unbound in the driver later. */
 941    mgr->dirty_real_vb_mask |= ~mask;
 942 }
 943
 944 static enum pipe_error
 945 u_vbuf_upload_buffers(struct u_vbuf *mgr,
 946                       int start_vertex, unsigned num_vertices,
 947                       int start_instance, unsigned num_instances)
 948 {
 949    unsigned i;
 950    unsigned nr_velems = mgr->ve->count;
 951    const struct pipe_vertex_element *velems =
 952          mgr->using_translate ? mgr->fallback_velems : mgr->ve->ve;
 953    unsigned start_offset[PIPE_MAX_ATTRIBS];
 954    unsigned end_offset[PIPE_MAX_ATTRIBS];
 955    uint32_t buffer_mask = 0;
 956
 957    /* Determine how much data needs to be uploaded. */
 958    for (i = 0; i < nr_velems; i++) {
 959       const struct pipe_vertex_element *velem = &velems[i];
 960       unsigned index = velem->vertex_buffer_index;
 961       struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
 962       unsigned instance_div, first, size, index_bit;
 963
 964       /* Skip the buffers generated by translate. */
 965       if (index == mgr->fallback_vbs[VB_VERTEX] ||
 966           index == mgr->fallback_vbs[VB_INSTANCE] ||
 967           index == mgr->fallback_vbs[VB_CONST]) {
 968          continue;
 969       }
 970
 971       if (!vb->is_user_buffer) {
 972          continue;
 973       }
 974
 975       instance_div = velem->instance_divisor;
 976       first = vb->buffer_offset + velem->src_offset;
 977
 978       if (!vb->stride) {
 979          /* Constant attrib. */
 980          size = mgr->ve->src_format_size[i];
 981       } else if (instance_div) {
 982          /* Per-instance attrib. */
 983
 984          /* Figure out how many instances we'll render given instance_div.  We
 985           * can't use the typical div_round_up() pattern because the CTS uses
 986           * instance_div = ~0 for a test, which overflows div_round_up()'s
 987           * addition.
 988           */
 989          unsigned count = num_instances / instance_div;
 990          if (count * instance_div != num_instances)
 991             count++;
 992
 993          first += vb->stride * start_instance;
 994          size = vb->stride * (count - 1) + mgr->ve->src_format_size[i];
 995       } else {
 996          /* Per-vertex attrib. */
 997          first += vb->stride * start_vertex;
 998          size = vb->stride * (num_vertices - 1) + mgr->ve->src_format_size[i];
 999       }
1000
1001       index_bit = 1 << index;
1002
1003       /* Update offsets. */
1004       if (!(buffer_mask & index_bit)) {
1005          start_offset[index] = first;
1006          end_offset[index] = first + size;
1007       } else {
1008          if (first < start_offset[index])
1009             start_offset[index] = first;
1010          if (first + size > end_offset[index])
1011             end_offset[index] = first + size;
1012       }
1013
1014       buffer_mask |= index_bit;
1015    }
1016
1017    /* Upload buffers. */
1018    while (buffer_mask) {
1019       unsigned start, end;
1020       struct pipe_vertex_buffer *real_vb;
1021       const uint8_t *ptr;
1022
1023       i = u_bit_scan(&buffer_mask);
1024
1025       start = start_offset[i];
1026       end = end_offset[i];
1027       assert(start < end);
1028
1029       real_vb = &mgr->real_vertex_buffer[i];
1030       ptr = mgr->vertex_buffer[i].buffer.user;
1031
1032       u_upload_data(mgr->pipe->stream_uploader,
1033                     mgr->has_signed_vb_offset ? 0 : start,
1034                     end - start, 4,
1035                     ptr + start, &real_vb->buffer_offset, &real_vb->buffer.resource);
1036       if (!real_vb->buffer.resource)
1037          return PIPE_ERROR_OUT_OF_MEMORY;
1038
1039       real_vb->buffer_offset -= start;
1040    }
1041
1042    return PIPE_OK;
1043 }
1044
1045 static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
1046 {
1047    /* See if there are any per-vertex attribs which will be uploaded or
1048     * translated. Use bitmasks to get the info instead of looping over vertex
1049     * elements. */
1050    return (mgr->ve->used_vb_mask &
1051            ((mgr->user_vb_mask |
1052              mgr->incompatible_vb_mask |
1053              mgr->ve->incompatible_vb_mask_any) &
1054             mgr->ve->noninstance_vb_mask_any &
1055             mgr->nonzero_stride_vb_mask)) != 0;
1056 }
1057
1058 static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
1059 {
1060    /* Return true if there are hw buffers which don't need to be translated.
1061     *
1062     * We could query whether each buffer is busy, but that would
1063     * be way more costly than this. */
1064    return (mgr->ve->used_vb_mask &
1065            (~mgr->user_vb_mask &
1066             ~mgr->incompatible_vb_mask &
1067             mgr->ve->compatible_vb_mask_all &
1068             mgr->ve->noninstance_vb_mask_any &
1069             mgr->nonzero_stride_vb_mask)) != 0;
1070 }
1071
1072 static void
1073 u_vbuf_get_minmax_index_mapped(const struct pipe_draw_info *info,
1074                                const void *indices, unsigned *out_min_index,
1075                                unsigned *out_max_index)
1076 {
1077    if (!info->count) {
1078       *out_min_index = 0;
1079       *out_max_index = 0;
1080       return;
1081    }
1082
1083    switch (info->index_size) {
1084    case 4: {
1085       const unsigned *ui_indices = (const unsigned*)indices;
1086       unsigned max = 0;
1087       unsigned min = ~0u;
1088       if (info->primitive_restart) {
1089          for (unsigned i = 0; i < info->count; i++) {
1090             if (ui_indices[i] != info->restart_index) {
1091                if (ui_indices[i] > max) max = ui_indices[i];
1092                if (ui_indices[i] < min) min = ui_indices[i];
1093             }
1094          }
1095       }
1096       else {
1097          for (unsigned i = 0; i < info->count; i++) {
1098             if (ui_indices[i] > max) max = ui_indices[i];
1099             if (ui_indices[i] < min) min = ui_indices[i];
1100          }
1101       }
1102       *out_min_index = min;
1103       *out_max_index = max;
1104       break;
1105    }
1106    case 2: {
1107       const unsigned short *us_indices = (const unsigned short*)indices;
1108       unsigned short max = 0;
1109       unsigned short min = ~((unsigned short)0);
1110       if (info->primitive_restart) {
1111          for (unsigned i = 0; i < info->count; i++) {
1112             if (us_indices[i] != info->restart_index) {
1113                if (us_indices[i] > max) max = us_indices[i];
1114                if (us_indices[i] < min) min = us_indices[i];
1115             }
1116          }
1117       }
1118       else {
1119          for (unsigned i = 0; i < info->count; i++) {
1120             if (us_indices[i] > max) max = us_indices[i];
1121             if (us_indices[i] < min) min = us_indices[i];
1122          }
1123       }
1124       *out_min_index = min;
1125       *out_max_index = max;
1126       break;
1127    }
1128    case 1: {
1129       const unsigned char *ub_indices = (const unsigned char*)indices;
1130       unsigned char max = 0;
1131       unsigned char min = ~((unsigned char)0);
1132       if (info->primitive_restart) {
1133          for (unsigned i = 0; i < info->count; i++) {
1134             if (ub_indices[i] != info->restart_index) {
1135                if (ub_indices[i] > max) max = ub_indices[i];
1136                if (ub_indices[i] < min) min = ub_indices[i];
1137             }
1138          }
1139       }
1140       else {
1141          for (unsigned i = 0; i < info->count; i++) {
1142             if (ub_indices[i] > max) max = ub_indices[i];
1143             if (ub_indices[i] < min) min = ub_indices[i];
1144          }
1145       }
1146       *out_min_index = min;
1147       *out_max_index = max;
1148       break;
1149    }
1150    default:
1151       assert(0);
1152    }
1153 }
1154
1155 void u_vbuf_get_minmax_index(struct pipe_context *pipe,
1156                              const struct pipe_draw_info *info,
1157                              unsigned *out_min_index, unsigned *out_max_index)
1158 {
1159    struct pipe_transfer *transfer = NULL;
1160    const void *indices;
1161
1162    if (info->has_user_indices) {
1163       indices = (uint8_t*)info->index.user +
1164                 info->start * info->index_size;
1165    } else {
1166       indices = pipe_buffer_map_range(pipe, info->index.resource,
1167                                       info->start * info->index_size,
1168                                       info->count * info->index_size,
1169                                       PIPE_TRANSFER_READ, &transfer);
1170    }
1171
1172    u_vbuf_get_minmax_index_mapped(info, indices, out_min_index, out_max_index);
1173
1174    if (transfer) {
1175       pipe_buffer_unmap(pipe, transfer);
1176    }
1177 }
1178
1179 static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
1180 {
1181    struct pipe_context *pipe = mgr->pipe;
1182    unsigned start_slot, count;
1183
1184    start_slot = ffs(mgr->dirty_real_vb_mask) - 1;
1185    count = util_last_bit(mgr->dirty_real_vb_mask >> start_slot);
1186
1187    pipe->set_vertex_buffers(pipe, start_slot, count,
1188                             mgr->real_vertex_buffer + start_slot);
1189    mgr->dirty_real_vb_mask = 0;
1190 }
1191
1192 static void
1193 u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
1194                                unsigned *indirect_data, unsigned stride,
1195                                unsigned draw_count)
1196 {
1197    assert(info->index_size);
1198    info->indirect = NULL;
1199
1200    for (unsigned i = 0; i < draw_count; i++) {
1201       unsigned offset = i * stride / 4;
1202
1203       info->count = indirect_data[offset + 0];
1204       info->instance_count = indirect_data[offset + 1];
1205
1206       if (!info->count || !info->instance_count)
1207          continue;
1208
1209       info->start = indirect_data[offset + 2];
1210       info->index_bias = indirect_data[offset + 3];
1211       info->start_instance = indirect_data[offset + 4];
1212
1213       u_vbuf_draw_vbo(mgr, info);
1214    }
1215 }
1216
1217 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
1218 {
1219    struct pipe_context *pipe = mgr->pipe;
1220    int start_vertex;
1221    unsigned min_index;
1222    unsigned num_vertices;
1223    boolean unroll_indices = FALSE;
1224    const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
1225    uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask;
1226    const uint32_t incompatible_vb_mask =
1227       mgr->incompatible_vb_mask & used_vb_mask;
1228    struct pipe_draw_info new_info;
1229
1230    /* Normal draw. No fallback and no user buffers. */
1231    if (!incompatible_vb_mask &&
1232        !mgr->ve->incompatible_elem_mask &&
1233        !user_vb_mask) {
1234
1235       /* Set vertex buffers if needed. */
1236       if (mgr->dirty_real_vb_mask & used_vb_mask) {
1237          u_vbuf_set_driver_vertex_buffers(mgr);
1238       }
1239
1240       pipe->draw_vbo(pipe, info);
1241       return;
1242    }
1243
1244    new_info = *info;
1245
1246    /* Handle indirect (multi)draws. */
1247    if (new_info.indirect) {
1248       const struct pipe_draw_indirect_info *indirect = new_info.indirect;
1249       unsigned draw_count = 0;
1250
1251       /* Get the number of draws. */
1252       if (indirect->indirect_draw_count) {
1253          pipe_buffer_read(pipe, indirect->indirect_draw_count,
1254                           indirect->indirect_draw_count_offset,
1255                           4, &draw_count);
1256       } else {
1257          draw_count = indirect->draw_count;
1258       }
1259
1260       if (!draw_count)
1261          return;
1262
1263       unsigned data_size = (draw_count - 1) * indirect->stride +
1264                            (new_info.index_size ? 20 : 16);
1265       unsigned *data = malloc(data_size);
1266       if (!data)
1267          return; /* report an error? */
1268
1269       /* Read the used buffer range only once, because the read can be
1270        * uncached.
1271        */
1272       pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
1273                        data);
1274
1275       if (info->index_size) {
1276          /* Indexed multidraw. */
1277          unsigned index_bias0 = data[3];
1278          bool index_bias_same = true;
1279
1280          /* If we invoke the translate path, we have to split the multidraw. */
1281          if (incompatible_vb_mask ||
1282              mgr->ve->incompatible_elem_mask) {
1283             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1284                                            indirect->stride, draw_count);
1285             free(data);
1286             return;
1287          }
1288
1289          /* See if index_bias is the same for all draws. */
1290          for (unsigned i = 1; i < draw_count; i++) {
1291             if (data[i * indirect->stride / 4 + 3] != index_bias0) {
1292                index_bias_same = false;
1293                break;
1294             }
1295          }
1296
1297          /* Split the multidraw if index_bias is different. */
1298          if (!index_bias_same) {
1299             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1300                                            indirect->stride, draw_count);
1301             free(data);
1302             return;
1303          }
1304
1305          /* If we don't need to use the translate path and index_bias is
1306           * the same, we can process the multidraw with the time complexity
1307           * equal to 1 draw call (except for the index range computation).
1308           * We only need to compute the index range covering all draw calls
1309           * of the multidraw.
1310           *
1311           * The driver will not look at these values because indirect != NULL.
1312           * These values determine the user buffer bounds to upload.
1313           */
1314          new_info.index_bias = index_bias0;
1315          new_info.min_index = ~0u;
1316          new_info.max_index = 0;
1317          new_info.start_instance = ~0u;
1318          unsigned end_instance = 0;
1319
1320          struct pipe_transfer *transfer = NULL;
1321          const uint8_t *indices;
1322
1323          if (info->has_user_indices) {
1324             indices = (uint8_t*)info->index.user;
1325          } else {
1326             indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
1327                                                 PIPE_TRANSFER_READ, &transfer);
1328          }
1329
1330          for (unsigned i = 0; i < draw_count; i++) {
1331             unsigned offset = i * indirect->stride / 4;
1332             unsigned start = data[offset + 2];
1333             unsigned count = data[offset + 0];
1334             unsigned start_instance = data[offset + 4];
1335             unsigned instance_count = data[offset + 1];
1336
1337             if (!count || !instance_count)
1338                continue;
1339
1340             /* Update the ranges of instances. */
1341             new_info.start_instance = MIN2(new_info.start_instance,
1342                                            start_instance);
1343             end_instance = MAX2(end_instance, start_instance + instance_count);
1344
1345             /* Update the index range. */
1346             unsigned min, max;
1347             new_info.count = count; /* only used by get_minmax_index */
1348             u_vbuf_get_minmax_index_mapped(&new_info,
1349                                            indices +
1350                                            new_info.index_size * start,
1351                                            &min, &max);
1352
1353             new_info.min_index = MIN2(new_info.min_index, min);
1354             new_info.max_index = MAX2(new_info.max_index, max);
1355          }
1356          free(data);
1357
1358          if (transfer)
1359             pipe_buffer_unmap(pipe, transfer);
1360
1361          /* Set the final instance count. */
1362          new_info.instance_count = end_instance - new_info.start_instance;
1363
1364          if (new_info.start_instance == ~0u || !new_info.instance_count)
1365             return;
1366       } else {
1367          /* Non-indexed multidraw.
1368           *
1369           * Keep the draw call indirect and compute minimums & maximums,
1370           * which will determine the user buffer bounds to upload, but
1371           * the driver will not look at these values because indirect != NULL.
1372           *
1373           * This efficiently processes the multidraw with the time complexity
1374           * equal to 1 draw call.
1375           */
1376          new_info.start = ~0u;
1377          new_info.start_instance = ~0u;
1378          unsigned end_vertex = 0;
1379          unsigned end_instance = 0;
1380
1381          for (unsigned i = 0; i < draw_count; i++) {
1382             unsigned offset = i * indirect->stride / 4;
1383             unsigned start = data[offset + 2];
1384             unsigned count = data[offset + 0];
1385             unsigned start_instance = data[offset + 3];
1386             unsigned instance_count = data[offset + 1];
1387
1388             new_info.start = MIN2(new_info.start, start);
1389             new_info.start_instance = MIN2(new_info.start_instance,
1390                                            start_instance);
1391
1392             end_vertex = MAX2(end_vertex, start + count);
1393             end_instance = MAX2(end_instance, start_instance + instance_count);
1394          }
1395          free(data);
1396
1397          /* Set the final counts. */
1398          new_info.count = end_vertex - new_info.start;
1399          new_info.instance_count = end_instance - new_info.start_instance;
1400
1401          if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
1402             return;
1403       }
1404    }
1405
1406    if (new_info.index_size) {
1407       /* See if anything needs to be done for per-vertex attribs. */
1408       if (u_vbuf_need_minmax_index(mgr)) {
1409          unsigned max_index;
1410
1411          if (new_info.max_index != ~0u) {
1412             min_index = new_info.min_index;
1413             max_index = new_info.max_index;
1414          } else {
1415             u_vbuf_get_minmax_index(mgr->pipe, &new_info,
1416                                     &min_index, &max_index);
1417          }
1418
1419          assert(min_index <= max_index);
1420
1421          start_vertex = min_index + new_info.index_bias;
1422          num_vertices = max_index + 1 - min_index;
1423
1424          /* Primitive restart doesn't work when unrolling indices.
1425           * We would have to break this drawing operation into several ones. */
1426          /* Use some heuristic to see if unrolling indices improves
1427           * performance. */
1428          if (!info->indirect &&
1429              !new_info.primitive_restart &&
1430              num_vertices > new_info.count*2 &&
1431              num_vertices - new_info.count > 32 &&
1432              !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
1433             unroll_indices = TRUE;
1434             user_vb_mask &= ~(mgr->nonzero_stride_vb_mask &
1435                               mgr->ve->noninstance_vb_mask_any);
1436          }
1437       } else {
1438          /* Nothing to do for per-vertex attribs. */
1439          start_vertex = 0;
1440          num_vertices = 0;
1441          min_index = 0;
1442       }
1443    } else {
1444       start_vertex = new_info.start;
1445       num_vertices = new_info.count;
1446       min_index = 0;
1447    }
1448
1449    /* Translate vertices with non-native layouts or formats. */
1450    if (unroll_indices ||
1451        incompatible_vb_mask ||
1452        mgr->ve->incompatible_elem_mask) {
1453       if (!u_vbuf_translate_begin(mgr, &new_info, start_vertex, num_vertices,
1454                                   min_index, unroll_indices)) {
1455          debug_warn_once("u_vbuf_translate_begin() failed");
1456          return;
1457       }
1458
1459       if (unroll_indices) {
1460          new_info.index_size = 0;
1461          new_info.index_bias = 0;
1462          new_info.min_index = 0;
1463          new_info.max_index = new_info.count - 1;
1464          new_info.start = 0;
1465       }
1466
1467       user_vb_mask &= ~(incompatible_vb_mask |
1468                         mgr->ve->incompatible_vb_mask_all);
1469    }
1470
1471    /* Upload user buffers. */
1472    if (user_vb_mask) {
1473       if (u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
1474                                 new_info.start_instance,
1475                                 new_info.instance_count) != PIPE_OK) {
1476          debug_warn_once("u_vbuf_upload_buffers() failed");
1477          return;
1478       }
1479
1480       mgr->dirty_real_vb_mask |= user_vb_mask;
1481    }
1482
1483    /*
1484    if (unroll_indices) {
1485       printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
1486              start_vertex, num_vertices);
1487       util_dump_draw_info(stdout, info);
1488       printf("\n");
1489    }
1490
1491    unsigned i;
1492    for (i = 0; i < mgr->nr_vertex_buffers; i++) {
1493       printf("input %i: ", i);
1494       util_dump_vertex_buffer(stdout, mgr->vertex_buffer+i);
1495       printf("\n");
1496    }
1497    for (i = 0; i < mgr->nr_real_vertex_buffers; i++) {
1498       printf("real %i: ", i);
1499       util_dump_vertex_buffer(stdout, mgr->real_vertex_buffer+i);
1500       printf("\n");
1501    }
1502    */
1503
1504    u_upload_unmap(pipe->stream_uploader);
1505    u_vbuf_set_driver_vertex_buffers(mgr);
1506
1507    pipe->draw_vbo(pipe, &new_info);
1508
1509    if (mgr->using_translate) {
1510       u_vbuf_translate_end(mgr);
1511    }
1512 }
1513
1514 void u_vbuf_save_vertex_elements(struct u_vbuf *mgr)
1515 {
1516    assert(!mgr->ve_saved);
1517    mgr->ve_saved = mgr->ve;
1518 }
1519
1520 void u_vbuf_restore_vertex_elements(struct u_vbuf *mgr)
1521 {
1522    if (mgr->ve != mgr->ve_saved) {
1523       struct pipe_context *pipe = mgr->pipe;
1524
1525       mgr->ve = mgr->ve_saved;
1526       pipe->bind_vertex_elements_state(pipe,
1527                                        mgr->ve ? mgr->ve->driver_cso : NULL);
1528    }
1529    mgr->ve_saved = NULL;
1530 }
1531
1532 void u_vbuf_save_vertex_buffer0(struct u_vbuf *mgr)
1533 {
1534    pipe_vertex_buffer_reference(&mgr->vertex_buffer0_saved,
1535                                 &mgr->vertex_buffer[0]);
1536 }
1537
1538 void u_vbuf_restore_vertex_buffer0(struct u_vbuf *mgr)
1539 {
1540    u_vbuf_set_vertex_buffers(mgr, 0, 1, &mgr->vertex_buffer0_saved);
1541    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
1542 }