src/gallium/auxiliary/util/u_vbuf.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2011 Marek Olšák <maraeo@gmail.com>
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * This module uploads user buffers and translates the vertex buffers which
  30  * contain incompatible vertices (i.e. not supported by the driver/hardware)
  31  * into compatible ones, based on the Gallium CAPs.
  32  *
  33  * It does not upload index buffers.
  34  *
  35  * The module heavily uses bitmasks to represent per-buffer and
  36  * per-vertex-element flags to avoid looping over the list of buffers just
  37  * to see if there's a non-zero stride, or user buffer, or unsupported format,
  38  * etc.
  39  *
  40  * There are 3 categories of vertex elements, which are processed separately:
  41  * - per-vertex attribs (stride != 0, instance_divisor == 0)
  42  * - instanced attribs (stride != 0, instance_divisor > 0)
  43  * - constant attribs (stride == 0)
  44  *
  45  * All needed uploads and translations are performed every draw command, but
  46  * only the subset of vertices needed for that draw command is uploaded or
  47  * translated. (the module never translates whole buffers)
  48  *
  49  *
  50  * The module consists of two main parts:
  51  *
  52  *
  53  * 1) Translate (u_vbuf_translate_begin/end)
  54  *
  55  * This is pretty much a vertex fetch fallback. It translates vertices from
  56  * one vertex buffer to another in an unused vertex buffer slot. It does
  57  * whatever is needed to make the vertices readable by the hardware (changes
  58  * vertex formats and aligns offsets and strides). The translate module is
  59  * used here.
  60  *
  61  * Each of the 3 categories is translated to a separate buffer.
  62  * Only the [min_index, max_index] range is translated. For instanced attribs,
  63  * the range is [start_instance, start_instance+instance_count]. For constant
  64  * attribs, the range is [0, 1].
  65  *
  66  *
  67  * 2) User buffer uploading (u_vbuf_upload_buffers)
  68  *
  69  * Only the [min_index, max_index] range is uploaded (just like Translate)
  70  * with a single memcpy.
  71  *
 * This method works best for non-indexed draw operations, or for indexed
 * draw operations where the [min_index, max_index] range is not much bigger
 * than the vertex count.
  75  *
  76  * If the range is too big (e.g. one triangle with indices {0, 1, 10000}),
  77  * the per-vertex attribs are uploaded via the translate module, all packed
  78  * into one vertex buffer, and the indexed draw call is turned into
  79  * a non-indexed one in the process. This adds additional complexity
  80  * to the translate part, but it prevents bad apps from bringing your frame
  81  * rate down.
  82  *
  83  *
  84  * If there is nothing to do, it forwards every command to the driver.
  85  * The module also has its own CSO cache of vertex element states.
  86  */
  87
  88 #include "util/u_vbuf.h"
  89
  90 #include "util/u_dump.h"
  91 #include "util/format/u_format.h"
  92 #include "util/u_inlines.h"
  93 #include "util/u_memory.h"
  94 #include "util/u_screen.h"
  95 #include "util/u_upload_mgr.h"
  96 #include "translate/translate.h"
  97 #include "translate/translate_cache.h"
  98 #include "cso_cache/cso_cache.h"
  99 #include "cso_cache/cso_hash.h"
 100
 101 struct u_vbuf_elements {
 102    unsigned count;
 103    struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
 104
 105    unsigned src_format_size[PIPE_MAX_ATTRIBS];
 106
 107    /* If (velem[i].src_format != native_format[i]), the vertex buffer
 108     * referenced by the vertex element cannot be used for rendering and
 109     * its vertex data must be translated to native_format[i]. */
 110    enum pipe_format native_format[PIPE_MAX_ATTRIBS];
 111    unsigned native_format_size[PIPE_MAX_ATTRIBS];
 112
 113    /* Which buffers are used by the vertex element state. */
 114    uint32_t used_vb_mask;
 115    /* This might mean two things:
 116     * - src_format != native_format, as discussed above.
 117     * - src_offset % 4 != 0 (if the caps don't allow such an offset). */
 118    uint32_t incompatible_elem_mask; /* each bit describes a corresp. attrib  */
 119    /* Which buffer has at least one vertex element referencing it
 120     * incompatible. */
 121    uint32_t incompatible_vb_mask_any;
 122    /* Which buffer has all vertex elements referencing it incompatible. */
 123    uint32_t incompatible_vb_mask_all;
 124    /* Which buffer has at least one vertex element referencing it
 125     * compatible. */
 126    uint32_t compatible_vb_mask_any;
 127    /* Which buffer has all vertex elements referencing it compatible. */
 128    uint32_t compatible_vb_mask_all;
 129
 130    /* Which buffer has at least one vertex element referencing it
 131     * non-instanced. */
 132    uint32_t noninstance_vb_mask_any;
 133
 134    void *driver_cso;
 135 };
 136
 137 enum {
 138    VB_VERTEX = 0,
 139    VB_INSTANCE = 1,
 140    VB_CONST = 2,
 141    VB_NUM = 3
 142 };
 143
 144 struct u_vbuf {
 145    struct u_vbuf_caps caps;
 146    bool has_signed_vb_offset;
 147
 148    struct pipe_context *pipe;
 149    struct translate_cache *translate_cache;
 150    struct cso_cache *cso_cache;
 151
 152    /* This is what was set in set_vertex_buffers.
 153     * May contain user buffers. */
 154    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
 155    uint32_t enabled_vb_mask;
 156
 157    /* Saved vertex buffer. */
 158    struct pipe_vertex_buffer vertex_buffer0_saved;
 159
 160    /* Vertex buffers for the driver.
 161     * There are usually no user buffers. */
 162    struct pipe_vertex_buffer real_vertex_buffer[PIPE_MAX_ATTRIBS];
 163    uint32_t dirty_real_vb_mask; /* which buffers are dirty since the last
 164                                    call of set_vertex_buffers */
 165
 166    /* Vertex elements. */
 167    struct u_vbuf_elements *ve, *ve_saved;
 168
 169    /* Vertex elements used for the translate fallback. */
 170    struct pipe_vertex_element fallback_velems[PIPE_MAX_ATTRIBS];
 171    /* If non-NULL, this is a vertex element state used for the translate
 172     * fallback and therefore used for rendering too. */
 173    boolean using_translate;
 174    /* The vertex buffer slot index where translated vertices have been
 175     * stored in. */
 176    unsigned fallback_vbs[VB_NUM];
 177
 178    /* Which buffer is a user buffer. */
 179    uint32_t user_vb_mask; /* each bit describes a corresp. buffer */
 180    /* Which buffer is incompatible (unaligned). */
 181    uint32_t incompatible_vb_mask; /* each bit describes a corresp. buffer */
 182    /* Which buffer has a non-zero stride. */
 183    uint32_t nonzero_stride_vb_mask; /* each bit describes a corresp. buffer */
 184    /* Which buffers are allowed (supported by hardware). */
 185    uint32_t allowed_vb_mask;
 186 };
 187
 188 static void *
 189 u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 190                               const struct pipe_vertex_element *attribs);
 191 static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso);
 192
 193 static const struct {
 194    enum pipe_format from, to;
 195 } vbuf_format_fallbacks[] = {
 196    { PIPE_FORMAT_R32_FIXED,            PIPE_FORMAT_R32_FLOAT },
 197    { PIPE_FORMAT_R32G32_FIXED,         PIPE_FORMAT_R32G32_FLOAT },
 198    { PIPE_FORMAT_R32G32B32_FIXED,      PIPE_FORMAT_R32G32B32_FLOAT },
 199    { PIPE_FORMAT_R32G32B32A32_FIXED,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 200    { PIPE_FORMAT_R16_FLOAT,            PIPE_FORMAT_R32_FLOAT },
 201    { PIPE_FORMAT_R16G16_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
 202    { PIPE_FORMAT_R16G16B16_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
 203    { PIPE_FORMAT_R16G16B16A16_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 204    { PIPE_FORMAT_R64_FLOAT,            PIPE_FORMAT_R32_FLOAT },
 205    { PIPE_FORMAT_R64G64_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
 206    { PIPE_FORMAT_R64G64B64_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
 207    { PIPE_FORMAT_R64G64B64A64_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 208    { PIPE_FORMAT_R32_UNORM,            PIPE_FORMAT_R32_FLOAT },
 209    { PIPE_FORMAT_R32G32_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
 210    { PIPE_FORMAT_R32G32B32_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 211    { PIPE_FORMAT_R32G32B32A32_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 212    { PIPE_FORMAT_R32_SNORM,            PIPE_FORMAT_R32_FLOAT },
 213    { PIPE_FORMAT_R32G32_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
 214    { PIPE_FORMAT_R32G32B32_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 215    { PIPE_FORMAT_R32G32B32A32_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 216    { PIPE_FORMAT_R32_USCALED,          PIPE_FORMAT_R32_FLOAT },
 217    { PIPE_FORMAT_R32G32_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
 218    { PIPE_FORMAT_R32G32B32_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 219    { PIPE_FORMAT_R32G32B32A32_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 220    { PIPE_FORMAT_R32_SSCALED,          PIPE_FORMAT_R32_FLOAT },
 221    { PIPE_FORMAT_R32G32_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
 222    { PIPE_FORMAT_R32G32B32_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 223    { PIPE_FORMAT_R32G32B32A32_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 224    { PIPE_FORMAT_R16_UNORM,            PIPE_FORMAT_R32_FLOAT },
 225    { PIPE_FORMAT_R16G16_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
 226    { PIPE_FORMAT_R16G16B16_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 227    { PIPE_FORMAT_R16G16B16A16_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 228    { PIPE_FORMAT_R16_SNORM,            PIPE_FORMAT_R32_FLOAT },
 229    { PIPE_FORMAT_R16G16_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
 230    { PIPE_FORMAT_R16G16B16_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
 231    { PIPE_FORMAT_R16G16B16A16_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
 232    { PIPE_FORMAT_R16_USCALED,          PIPE_FORMAT_R32_FLOAT },
 233    { PIPE_FORMAT_R16G16_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
 234    { PIPE_FORMAT_R16G16B16_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 235    { PIPE_FORMAT_R16G16B16A16_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 236    { PIPE_FORMAT_R16_SSCALED,          PIPE_FORMAT_R32_FLOAT },
 237    { PIPE_FORMAT_R16G16_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
 238    { PIPE_FORMAT_R16G16B16_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
 239    { PIPE_FORMAT_R16G16B16A16_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
 240    { PIPE_FORMAT_R8_UNORM,             PIPE_FORMAT_R32_FLOAT },
 241    { PIPE_FORMAT_R8G8_UNORM,           PIPE_FORMAT_R32G32_FLOAT },
 242    { PIPE_FORMAT_R8G8B8_UNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
 243    { PIPE_FORMAT_R8G8B8A8_UNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
 244    { PIPE_FORMAT_R8_SNORM,             PIPE_FORMAT_R32_FLOAT },
 245    { PIPE_FORMAT_R8G8_SNORM,           PIPE_FORMAT_R32G32_FLOAT },
 246    { PIPE_FORMAT_R8G8B8_SNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
 247    { PIPE_FORMAT_R8G8B8A8_SNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
 248    { PIPE_FORMAT_R8_USCALED,           PIPE_FORMAT_R32_FLOAT },
 249    { PIPE_FORMAT_R8G8_USCALED,         PIPE_FORMAT_R32G32_FLOAT },
 250    { PIPE_FORMAT_R8G8B8_USCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
 251    { PIPE_FORMAT_R8G8B8A8_USCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
 252    { PIPE_FORMAT_R8_SSCALED,           PIPE_FORMAT_R32_FLOAT },
 253    { PIPE_FORMAT_R8G8_SSCALED,         PIPE_FORMAT_R32G32_FLOAT },
 254    { PIPE_FORMAT_R8G8B8_SSCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
 255    { PIPE_FORMAT_R8G8B8A8_SSCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
 256 };
 257
 258 boolean u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps,
 259                         unsigned flags)
 260 {
 261    unsigned i;
 262    boolean fallback = FALSE;
 263
 264    /* I'd rather have a bitfield of which formats are supported and a static
 265     * table of the translations indexed by format, but since we don't have C99
 266     * we can't easily make a sparsely-populated table indexed by format.  So,
 267     * we construct the sparse table here.
 268     */
 269    for (i = 0; i < PIPE_FORMAT_COUNT; i++)
 270       caps->format_translation[i] = i;
 271
 272    for (i = 0; i < ARRAY_SIZE(vbuf_format_fallbacks); i++) {
 273       enum pipe_format format = vbuf_format_fallbacks[i].from;
 274
 275       if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0, 0,
 276                                        PIPE_BIND_VERTEX_BUFFER)) {
 277          caps->format_translation[format] = vbuf_format_fallbacks[i].to;
 278          fallback = TRUE;
 279       }
 280    }
 281
 282    caps->buffer_offset_unaligned =
 283       !screen->get_param(screen,
 284                          PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY);
 285    caps->buffer_stride_unaligned =
 286      !screen->get_param(screen,
 287                         PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY);
 288    caps->velem_src_offset_unaligned =
 289       !screen->get_param(screen,
 290                          PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY);
 291    caps->user_vertex_buffers =
 292       screen->get_param(screen, PIPE_CAP_USER_VERTEX_BUFFERS);
 293    caps->max_vertex_buffers =
 294       screen->get_param(screen, PIPE_CAP_MAX_VERTEX_BUFFERS);
 295
 296    /* OpenGL 2.0 requires a minimum of 16 vertex buffers */
 297    if (caps->max_vertex_buffers < 16)
 298       fallback = TRUE;
 299
 300    if (!caps->buffer_offset_unaligned ||
 301        !caps->buffer_stride_unaligned ||
 302        !caps->velem_src_offset_unaligned ||
 303        (!(flags & U_VBUF_FLAG_NO_USER_VBOS) && !caps->user_vertex_buffers)) {
 304       fallback = TRUE;
 305    }
 306
 307    return fallback;
 308 }
 309
 310 struct u_vbuf *
 311 u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps)
 312 {
 313    struct u_vbuf *mgr = CALLOC_STRUCT(u_vbuf);
 314
 315    mgr->caps = *caps;
 316    mgr->pipe = pipe;
 317    mgr->cso_cache = cso_cache_create();
 318    mgr->translate_cache = translate_cache_create();
 319    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 320    mgr->allowed_vb_mask = u_bit_consecutive(0, mgr->caps.max_vertex_buffers);
 321
 322    mgr->has_signed_vb_offset =
 323       pipe->screen->get_param(pipe->screen,
 324                               PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET);
 325
 326    return mgr;
 327 }
 328
 329 /* u_vbuf uses its own caching for vertex elements, because it needs to keep
 330  * its own preprocessed state per vertex element CSO. */
 331 static struct u_vbuf_elements *
 332 u_vbuf_set_vertex_elements_internal(struct u_vbuf *mgr, unsigned count,
 333                                     const struct pipe_vertex_element *states)
 334 {
 335    struct pipe_context *pipe = mgr->pipe;
 336    unsigned key_size, hash_key;
 337    struct cso_hash_iter iter;
 338    struct u_vbuf_elements *ve;
 339    struct cso_velems_state velems_state;
 340
 341    /* need to include the count into the stored state data too. */
 342    key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned);
 343    velems_state.count = count;
 344    memcpy(velems_state.velems, states,
 345           sizeof(struct pipe_vertex_element) * count);
 346    hash_key = cso_construct_key((void*)&velems_state, key_size);
 347    iter = cso_find_state_template(mgr->cso_cache, hash_key, CSO_VELEMENTS,
 348                                   (void*)&velems_state, key_size);
 349
 350    if (cso_hash_iter_is_null(iter)) {
 351       struct cso_velements *cso = MALLOC_STRUCT(cso_velements);
 352       memcpy(&cso->state, &velems_state, key_size);
 353       cso->data = u_vbuf_create_vertex_elements(mgr, count, states);
 354       cso->delete_state = (cso_state_callback)u_vbuf_delete_vertex_elements;
 355       cso->context = (void*)mgr;
 356
 357       iter = cso_insert_state(mgr->cso_cache, hash_key, CSO_VELEMENTS, cso);
 358       ve = cso->data;
 359    } else {
 360       ve = ((struct cso_velements *)cso_hash_iter_data(iter))->data;
 361    }
 362
 363    assert(ve);
 364
 365    if (ve != mgr->ve)
 366       pipe->bind_vertex_elements_state(pipe, ve->driver_cso);
 367
 368    return ve;
 369 }
 370
 371 void u_vbuf_set_vertex_elements(struct u_vbuf *mgr, unsigned count,
 372                                const struct pipe_vertex_element *states)
 373 {
 374    mgr->ve = u_vbuf_set_vertex_elements_internal(mgr, count, states);
 375 }
 376
 377 void u_vbuf_destroy(struct u_vbuf *mgr)
 378 {
 379    struct pipe_screen *screen = mgr->pipe->screen;
 380    unsigned i;
 381    const unsigned num_vb = screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
 382                                                     PIPE_SHADER_CAP_MAX_INPUTS);
 383
 384    mgr->pipe->set_vertex_buffers(mgr->pipe, 0, num_vb, NULL);
 385
 386    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 387       pipe_vertex_buffer_unreference(&mgr->vertex_buffer[i]);
 388    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 389       pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[i]);
 390
 391    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
 392
 393    translate_cache_destroy(mgr->translate_cache);
 394    cso_cache_delete(mgr->cso_cache);
 395    FREE(mgr);
 396 }
 397
 398 static enum pipe_error
 399 u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 400                          const struct pipe_draw_info *info,
 401                          unsigned vb_mask, unsigned out_vb,
 402                          int start_vertex, unsigned num_vertices,
 403                          int min_index, boolean unroll_indices)
 404 {
 405    struct translate *tr;
 406    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
 407    struct pipe_resource *out_buffer = NULL;
 408    uint8_t *out_map;
 409    unsigned out_offset, mask;
 410
 411    /* Get a translate object. */
 412    tr = translate_cache_find(mgr->translate_cache, key);
 413
 414    /* Map buffers we want to translate. */
 415    mask = vb_mask;
 416    while (mask) {
 417       struct pipe_vertex_buffer *vb;
 418       unsigned offset;
 419       uint8_t *map;
 420       unsigned i = u_bit_scan(&mask);
 421
 422       vb = &mgr->vertex_buffer[i];
 423       offset = vb->buffer_offset + vb->stride * start_vertex;
 424
 425       if (vb->is_user_buffer) {
 426          map = (uint8_t*)vb->buffer.user + offset;
 427       } else {
 428          unsigned size = vb->stride ? num_vertices * vb->stride
 429                                     : sizeof(double)*4;
 430
 431          if (!vb->buffer.resource)
 432             continue;
 433
 434          if (offset + size > vb->buffer.resource->width0) {
 435             /* Don't try to map past end of buffer.  This often happens when
 436              * we're translating an attribute that's at offset > 0 from the
 437              * start of the vertex.  If we'd subtract attrib's offset from
 438              * the size, this probably wouldn't happen.
 439              */
 440             size = vb->buffer.resource->width0 - offset;
 441
 442             /* Also adjust num_vertices.  A common user error is to call
 443              * glDrawRangeElements() with incorrect 'end' argument.  The 'end
 444              * value should be the max index value, but people often
 445              * accidentally add one to this value.  This adjustment avoids
 446              * crashing (by reading past the end of a hardware buffer mapping)
 447              * when people do that.
 448              */
 449             num_vertices = (size + vb->stride - 1) / vb->stride;
 450          }
 451
 452          map = pipe_buffer_map_range(mgr->pipe, vb->buffer.resource, offset, size,
 453                                      PIPE_TRANSFER_READ, &vb_transfer[i]);
 454       }
 455
 456       /* Subtract min_index so that indexing with the index buffer works. */
 457       if (unroll_indices) {
 458          map -= (ptrdiff_t)vb->stride * min_index;
 459       }
 460
 461       tr->set_buffer(tr, i, map, vb->stride, info->max_index);
 462    }
 463
 464    /* Translate. */
 465    if (unroll_indices) {
 466       struct pipe_transfer *transfer = NULL;
 467       const unsigned offset = info->start * info->index_size;
 468       uint8_t *map;
 469
 470       /* Create and map the output buffer. */
 471       u_upload_alloc(mgr->pipe->stream_uploader, 0,
 472                      key->output_stride * info->count, 4,
 473                      &out_offset, &out_buffer,
 474                      (void**)&out_map);
 475       if (!out_buffer)
 476          return PIPE_ERROR_OUT_OF_MEMORY;
 477
 478       if (info->has_user_indices) {
 479          map = (uint8_t*)info->index.user + offset;
 480       } else {
 481          map = pipe_buffer_map_range(mgr->pipe, info->index.resource, offset,
 482                                      info->count * info->index_size,
 483                                      PIPE_TRANSFER_READ, &transfer);
 484       }
 485
 486       switch (info->index_size) {
 487       case 4:
 488          tr->run_elts(tr, (unsigned*)map, info->count, 0, 0, out_map);
 489          break;
 490       case 2:
 491          tr->run_elts16(tr, (uint16_t*)map, info->count, 0, 0, out_map);
 492          break;
 493       case 1:
 494          tr->run_elts8(tr, map, info->count, 0, 0, out_map);
 495          break;
 496       }
 497
 498       if (transfer) {
 499          pipe_buffer_unmap(mgr->pipe, transfer);
 500       }
 501    } else {
 502       /* Create and map the output buffer. */
 503       u_upload_alloc(mgr->pipe->stream_uploader,
 504                      mgr->has_signed_vb_offset ?
 505                         0 : key->output_stride * start_vertex,
 506                      key->output_stride * num_vertices, 4,
 507                      &out_offset, &out_buffer,
 508                      (void**)&out_map);
 509       if (!out_buffer)
 510          return PIPE_ERROR_OUT_OF_MEMORY;
 511
 512       out_offset -= key->output_stride * start_vertex;
 513
 514       tr->run(tr, 0, num_vertices, 0, 0, out_map);
 515    }
 516
 517    /* Unmap all buffers. */
 518    mask = vb_mask;
 519    while (mask) {
 520       unsigned i = u_bit_scan(&mask);
 521
 522       if (vb_transfer[i]) {
 523          pipe_buffer_unmap(mgr->pipe, vb_transfer[i]);
 524       }
 525    }
 526
 527    /* Setup the new vertex buffer. */
 528    mgr->real_vertex_buffer[out_vb].buffer_offset = out_offset;
 529    mgr->real_vertex_buffer[out_vb].stride = key->output_stride;
 530
 531    /* Move the buffer reference. */
 532    pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[out_vb]);
 533    mgr->real_vertex_buffer[out_vb].buffer.resource = out_buffer;
 534    mgr->real_vertex_buffer[out_vb].is_user_buffer = false;
 535
 536    return PIPE_OK;
 537 }
 538
 539 static boolean
 540 u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,
 541                                     unsigned mask[VB_NUM],
 542                                     unsigned extra_free_vb_mask)
 543 {
 544    unsigned type;
 545    unsigned fallback_vbs[VB_NUM];
 546    /* Set the bit for each buffer which is incompatible, or isn't set. */
 547    uint32_t unused_vb_mask =
 548       (mgr->ve->incompatible_vb_mask_all | mgr->incompatible_vb_mask |
 549       ~mgr->enabled_vb_mask | extra_free_vb_mask) & mgr->allowed_vb_mask;
 550    uint32_t unused_vb_mask_orig;
 551    boolean insufficient_buffers = false;
 552
 553    /* No vertex buffers available at all */
 554    if (!unused_vb_mask)
 555       return FALSE;
 556
 557    memset(fallback_vbs, ~0, sizeof(fallback_vbs));
 558
 559    /* Find free slots for each type if needed. */
 560    unused_vb_mask_orig = unused_vb_mask;
 561    for (type = 0; type < VB_NUM; type++) {
 562       if (mask[type]) {
 563          uint32_t index;
 564
 565          if (!unused_vb_mask) {
 566             insufficient_buffers = true;
 567             break;
 568          }
 569
 570          index = ffs(unused_vb_mask) - 1;
 571          fallback_vbs[type] = index;
 572          unused_vb_mask &= ~(1 << index);
 573          /*printf("found slot=%i for type=%i\n", index, type);*/
 574       }
 575    }
 576
 577    if (insufficient_buffers) {
 578       /* not enough vbs for all types supported by the hardware, they will have to share one
 579        * buffer */
 580       uint32_t index = ffs(unused_vb_mask_orig) - 1;
 581       /* When sharing one vertex buffer use per-vertex frequency for everything. */
 582       fallback_vbs[VB_VERTEX] = index;
 583       mask[VB_VERTEX] = mask[VB_VERTEX] | mask[VB_CONST] | mask[VB_INSTANCE];
 584       mask[VB_CONST] = 0;
 585       mask[VB_INSTANCE] = 0;
 586    }
 587
 588    for (type = 0; type < VB_NUM; type++) {
 589       if (mask[type]) {
 590          mgr->dirty_real_vb_mask |= 1 << fallback_vbs[type];
 591       }
 592    }
 593
 594    memcpy(mgr->fallback_vbs, fallback_vbs, sizeof(fallback_vbs));
 595    return TRUE;
 596 }
 597
 598 static boolean
 599 u_vbuf_translate_begin(struct u_vbuf *mgr,
 600                        const struct pipe_draw_info *info,
 601                        int start_vertex, unsigned num_vertices,
 602                        int min_index, boolean unroll_indices)
 603 {
 604    unsigned mask[VB_NUM] = {0};
 605    struct translate_key key[VB_NUM];
 606    unsigned elem_index[VB_NUM][PIPE_MAX_ATTRIBS]; /* ... into key.elements */
 607    unsigned i, type;
 608    const unsigned incompatible_vb_mask = mgr->incompatible_vb_mask &
 609                                          mgr->ve->used_vb_mask;
 610    unsigned extra_free_vb_mask = 0;
 611
 612    const int start[VB_NUM] = {
 613       start_vertex,           /* VERTEX */
 614       info->start_instance,   /* INSTANCE */
 615       0                       /* CONST */
 616    };
 617
 618    const unsigned num[VB_NUM] = {
 619       num_vertices,           /* VERTEX */
 620       info->instance_count,   /* INSTANCE */
 621       1                       /* CONST */
 622    };
 623
 624    memset(key, 0, sizeof(key));
 625    memset(elem_index, ~0, sizeof(elem_index));
 626
 627    /* See if there are vertex attribs of each type to translate and
 628     * which ones. */
 629    for (i = 0; i < mgr->ve->count; i++) {
 630       unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;
 631
 632       if (!mgr->vertex_buffer[vb_index].stride) {
 633          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 634              !(incompatible_vb_mask & (1 << vb_index))) {
 635             continue;
 636          }
 637          mask[VB_CONST] |= 1 << vb_index;
 638       } else if (mgr->ve->ve[i].instance_divisor) {
 639          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 640              !(incompatible_vb_mask & (1 << vb_index))) {
 641             continue;
 642          }
 643          mask[VB_INSTANCE] |= 1 << vb_index;
 644       } else {
 645          if (!unroll_indices &&
 646              !(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 647              !(incompatible_vb_mask & (1 << vb_index))) {
 648             continue;
 649          }
 650          mask[VB_VERTEX] |= 1 << vb_index;
 651       }
 652    }
 653
 654    assert(mask[VB_VERTEX] || mask[VB_INSTANCE] || mask[VB_CONST]);
 655
 656    /* In the case of unroll_indices, we can regard all non-constant
 657     * vertex buffers with only non-instance vertex elements as incompatible
 658     * and thus free.
 659     */
 660    if (unroll_indices)
 661        extra_free_vb_mask = mask[VB_VERTEX] & ~mask[VB_INSTANCE];
 662
 663    /* Find free vertex buffer slots. */
 664    if (!u_vbuf_translate_find_free_vb_slots(mgr, mask, extra_free_vb_mask)) {
 665       return FALSE;
 666    }
 667
 668    /* Initialize the translate keys. */
 669    for (i = 0; i < mgr->ve->count; i++) {
 670       struct translate_key *k;
 671       struct translate_element *te;
 672       enum pipe_format output_format = mgr->ve->native_format[i];
 673       unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
 674       bit = 1 << vb_index;
 675
 676       if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 677           !(incompatible_vb_mask & (1 << vb_index)) &&
 678           (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
 679          continue;
 680       }
 681
 682       /* Set type to what we will translate.
 683        * Whether vertex, instance, or constant attribs. */
 684       for (type = 0; type < VB_NUM; type++) {
 685          if (mask[type] & bit) {
 686             break;
 687          }
 688       }
 689       assert(type < VB_NUM);
 690       if (mgr->ve->ve[i].src_format != output_format)
 691          assert(translate_is_output_format_supported(output_format));
 692       /*printf("velem=%i type=%i\n", i, type);*/
 693
 694       /* Add the vertex element. */
 695       k = &key[type];
 696       elem_index[type][i] = k->nr_elements;
 697
 698       te = &k->element[k->nr_elements];
 699       te->type = TRANSLATE_ELEMENT_NORMAL;
 700       te->instance_divisor = 0;
 701       te->input_buffer = vb_index;
 702       te->input_format = mgr->ve->ve[i].src_format;
 703       te->input_offset = mgr->ve->ve[i].src_offset;
 704       te->output_format = output_format;
 705       te->output_offset = k->output_stride;
 706
 707       k->output_stride += mgr->ve->native_format_size[i];
 708       k->nr_elements++;
 709    }
 710
 711    /* Translate buffers. */
 712    for (type = 0; type < VB_NUM; type++) {
 713       if (key[type].nr_elements) {
 714          enum pipe_error err;
 715          err = u_vbuf_translate_buffers(mgr, &key[type], info, mask[type],
 716                                         mgr->fallback_vbs[type],
 717                                         start[type], num[type], min_index,
 718                                         unroll_indices && type == VB_VERTEX);
 719          if (err != PIPE_OK)
 720             return FALSE;
 721
 722          /* Fixup the stride for constant attribs. */
 723          if (type == VB_CONST) {
 724             mgr->real_vertex_buffer[mgr->fallback_vbs[VB_CONST]].stride = 0;
 725          }
 726       }
 727    }
 728
 729    /* Setup new vertex elements. */
 730    for (i = 0; i < mgr->ve->count; i++) {
 731       for (type = 0; type < VB_NUM; type++) {
 732          if (elem_index[type][i] < key[type].nr_elements) {
 733             struct translate_element *te = &key[type].element[elem_index[type][i]];
 734             mgr->fallback_velems[i].instance_divisor = mgr->ve->ve[i].instance_divisor;
 735             mgr->fallback_velems[i].src_format = te->output_format;
 736             mgr->fallback_velems[i].src_offset = te->output_offset;
 737             mgr->fallback_velems[i].vertex_buffer_index = mgr->fallback_vbs[type];
 738
 739             /* elem_index[type][i] can only be set for one type. */
 740             assert(type > VB_INSTANCE || elem_index[type+1][i] == ~0u);
 741             assert(type > VB_VERTEX   || elem_index[type+2][i] == ~0u);
 742             break;
 743          }
 744       }
 745       /* No translating, just copy the original vertex element over. */
 746       if (type == VB_NUM) {
 747          memcpy(&mgr->fallback_velems[i], &mgr->ve->ve[i],
 748                 sizeof(struct pipe_vertex_element));
 749       }
 750    }
 751
 752    u_vbuf_set_vertex_elements_internal(mgr, mgr->ve->count,
 753                                        mgr->fallback_velems);
 754    mgr->using_translate = TRUE;
 755    return TRUE;
 756 }
 757
 758 static void u_vbuf_translate_end(struct u_vbuf *mgr)
 759 {
 760    unsigned i;
 761
 762    /* Restore vertex elements. */
 763    mgr->pipe->bind_vertex_elements_state(mgr->pipe, mgr->ve->driver_cso);
 764    mgr->using_translate = FALSE;
 765
 766    /* Unreference the now-unused VBOs. */
 767    for (i = 0; i < VB_NUM; i++) {
 768       unsigned vb = mgr->fallback_vbs[i];
 769       if (vb != ~0u) {
 770          pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer.resource, NULL);
 771          mgr->fallback_vbs[i] = ~0;
 772
 773          /* This will cause the buffer to be unbound in the driver later. */
 774          mgr->dirty_real_vb_mask |= 1 << vb;
 775       }
 776    }
 777 }
 778
 779 static void *
 780 u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 781                               const struct pipe_vertex_element *attribs)
 782 {
 783    struct pipe_context *pipe = mgr->pipe;
 784    unsigned i;
 785    struct pipe_vertex_element driver_attribs[PIPE_MAX_ATTRIBS];
 786    struct u_vbuf_elements *ve = CALLOC_STRUCT(u_vbuf_elements);
 787    uint32_t used_buffers = 0;
 788
 789    ve->count = count;
 790
 791    memcpy(ve->ve, attribs, sizeof(struct pipe_vertex_element) * count);
 792    memcpy(driver_attribs, attribs, sizeof(struct pipe_vertex_element) * count);
 793
 794    /* Set the best native format in case the original format is not
 795     * supported. */
 796    for (i = 0; i < count; i++) {
 797       enum pipe_format format = ve->ve[i].src_format;
 798
 799       ve->src_format_size[i] = util_format_get_blocksize(format);
 800
 801       used_buffers |= 1 << ve->ve[i].vertex_buffer_index;
 802
 803       if (!ve->ve[i].instance_divisor) {
 804          ve->noninstance_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 805       }
 806
 807       format = mgr->caps.format_translation[format];
 808
 809       driver_attribs[i].src_format = format;
 810       ve->native_format[i] = format;
 811       ve->native_format_size[i] =
 812             util_format_get_blocksize(ve->native_format[i]);
 813
 814       if (ve->ve[i].src_format != format ||
 815           (!mgr->caps.velem_src_offset_unaligned &&
 816            ve->ve[i].src_offset % 4 != 0)) {
 817          ve->incompatible_elem_mask |= 1 << i;
 818          ve->incompatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 819       } else {
 820          ve->compatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 821       }
 822    }
 823
 824    if (used_buffers & ~mgr->allowed_vb_mask) {
 825       /* More vertex buffers are used than the hardware supports.  In
 826        * principle, we only need to make sure that less vertex buffers are
 827        * used, and mark some of the latter vertex buffers as incompatible.
 828        * For now, mark all vertex buffers as incompatible.
 829        */
 830       ve->incompatible_vb_mask_any = used_buffers;
 831       ve->compatible_vb_mask_any = 0;
 832       ve->incompatible_elem_mask = u_bit_consecutive(0, count);
 833    }
 834
 835    ve->used_vb_mask = used_buffers;
 836    ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers;
 837    ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers;
 838
 839    /* Align the formats and offsets to the size of DWORD if needed. */
 840    if (!mgr->caps.velem_src_offset_unaligned) {
 841       for (i = 0; i < count; i++) {
 842          ve->native_format_size[i] = align(ve->native_format_size[i], 4);
 843          driver_attribs[i].src_offset = align(ve->ve[i].src_offset, 4);
 844       }
 845    }
 846
 847    /* Only create driver CSO if no incompatible elements */
 848    if (!ve->incompatible_elem_mask) {
 849       ve->driver_cso =
 850          pipe->create_vertex_elements_state(pipe, count, driver_attribs);
 851    }
 852
 853    return ve;
 854 }
 855
 856 static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso)
 857 {
 858    struct pipe_context *pipe = mgr->pipe;
 859    struct u_vbuf_elements *ve = cso;
 860
 861    pipe->delete_vertex_elements_state(pipe, ve->driver_cso);
 862    FREE(ve);
 863 }
 864
 865 void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
 866                                unsigned start_slot, unsigned count,
 867                                const struct pipe_vertex_buffer *bufs)
 868 {
 869    unsigned i;
 870    /* which buffers are enabled */
 871    uint32_t enabled_vb_mask = 0;
 872    /* which buffers are in user memory */
 873    uint32_t user_vb_mask = 0;
 874    /* which buffers are incompatible with the driver */
 875    uint32_t incompatible_vb_mask = 0;
 876    /* which buffers have a non-zero stride */
 877    uint32_t nonzero_stride_vb_mask = 0;
 878    const uint32_t mask = ~(((1ull << count) - 1) << start_slot);
 879
 880    /* Zero out the bits we are going to rewrite completely. */
 881    mgr->user_vb_mask &= mask;
 882    mgr->incompatible_vb_mask &= mask;
 883    mgr->nonzero_stride_vb_mask &= mask;
 884    mgr->enabled_vb_mask &= mask;
 885
 886    if (!bufs) {
 887       struct pipe_context *pipe = mgr->pipe;
 888       /* Unbind. */
 889       mgr->dirty_real_vb_mask &= mask;
 890
 891       for (i = 0; i < count; i++) {
 892          unsigned dst_index = start_slot + i;
 893
 894          pipe_vertex_buffer_unreference(&mgr->vertex_buffer[dst_index]);
 895          pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[dst_index]);
 896       }
 897
 898       pipe->set_vertex_buffers(pipe, start_slot, count, NULL);
 899       return;
 900    }
 901
 902    for (i = 0; i < count; i++) {
 903       unsigned dst_index = start_slot + i;
 904       const struct pipe_vertex_buffer *vb = &bufs[i];
 905       struct pipe_vertex_buffer *orig_vb = &mgr->vertex_buffer[dst_index];
 906       struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[dst_index];
 907
 908       if (!vb->buffer.resource) {
 909          pipe_vertex_buffer_unreference(orig_vb);
 910          pipe_vertex_buffer_unreference(real_vb);
 911          continue;
 912       }
 913
 914       pipe_vertex_buffer_reference(orig_vb, vb);
 915
 916       if (vb->stride) {
 917          nonzero_stride_vb_mask |= 1 << dst_index;
 918       }
 919       enabled_vb_mask |= 1 << dst_index;
 920
 921       if ((!mgr->caps.buffer_offset_unaligned && vb->buffer_offset % 4 != 0) ||
 922           (!mgr->caps.buffer_stride_unaligned && vb->stride % 4 != 0)) {
 923          incompatible_vb_mask |= 1 << dst_index;
 924          real_vb->buffer_offset = vb->buffer_offset;
 925          real_vb->stride = vb->stride;
 926          pipe_vertex_buffer_unreference(real_vb);
 927          real_vb->is_user_buffer = false;
 928          continue;
 929       }
 930
 931       if (!mgr->caps.user_vertex_buffers && vb->is_user_buffer) {
 932          user_vb_mask |= 1 << dst_index;
 933          real_vb->buffer_offset = vb->buffer_offset;
 934          real_vb->stride = vb->stride;
 935          pipe_vertex_buffer_unreference(real_vb);
 936          real_vb->is_user_buffer = false;
 937          continue;
 938       }
 939
 940       pipe_vertex_buffer_reference(real_vb, vb);
 941    }
 942
 943    mgr->user_vb_mask |= user_vb_mask;
 944    mgr->incompatible_vb_mask |= incompatible_vb_mask;
 945    mgr->nonzero_stride_vb_mask |= nonzero_stride_vb_mask;
 946    mgr->enabled_vb_mask |= enabled_vb_mask;
 947
 948    /* All changed buffers are marked as dirty, even the NULL ones,
 949     * which will cause the NULL buffers to be unbound in the driver later. */
 950    mgr->dirty_real_vb_mask |= ~mask;
 951 }
 952
 953 static enum pipe_error
 954 u_vbuf_upload_buffers(struct u_vbuf *mgr,
 955                       int start_vertex, unsigned num_vertices,
 956                       int start_instance, unsigned num_instances)
 957 {
 958    unsigned i;
 959    unsigned nr_velems = mgr->ve->count;
 960    const struct pipe_vertex_element *velems =
 961          mgr->using_translate ? mgr->fallback_velems : mgr->ve->ve;
 962    unsigned start_offset[PIPE_MAX_ATTRIBS];
 963    unsigned end_offset[PIPE_MAX_ATTRIBS];
 964    uint32_t buffer_mask = 0;
 965
 966    /* Determine how much data needs to be uploaded. */
 967    for (i = 0; i < nr_velems; i++) {
 968       const struct pipe_vertex_element *velem = &velems[i];
 969       unsigned index = velem->vertex_buffer_index;
 970       struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
 971       unsigned instance_div, first, size, index_bit;
 972
 973       /* Skip the buffers generated by translate. */
 974       if (index == mgr->fallback_vbs[VB_VERTEX] ||
 975           index == mgr->fallback_vbs[VB_INSTANCE] ||
 976           index == mgr->fallback_vbs[VB_CONST]) {
 977          continue;
 978       }
 979
 980       if (!vb->is_user_buffer) {
 981          continue;
 982       }
 983
 984       instance_div = velem->instance_divisor;
 985       first = vb->buffer_offset + velem->src_offset;
 986
 987       if (!vb->stride) {
 988          /* Constant attrib. */
 989          size = mgr->ve->src_format_size[i];
 990       } else if (instance_div) {
 991          /* Per-instance attrib. */
 992
 993          /* Figure out how many instances we'll render given instance_div.  We
 994           * can't use the typical div_round_up() pattern because the CTS uses
 995           * instance_div = ~0 for a test, which overflows div_round_up()'s
 996           * addition.
 997           */
 998          unsigned count = num_instances / instance_div;
 999          if (count * instance_div != num_instances)
1000             count++;
1001
1002          first += vb->stride * start_instance;
1003          size = vb->stride * (count - 1) + mgr->ve->src_format_size[i];
1004       } else {
1005          /* Per-vertex attrib. */
1006          first += vb->stride * start_vertex;
1007          size = vb->stride * (num_vertices - 1) + mgr->ve->src_format_size[i];
1008       }
1009
1010       index_bit = 1 << index;
1011
1012       /* Update offsets. */
1013       if (!(buffer_mask & index_bit)) {
1014          start_offset[index] = first;
1015          end_offset[index] = first + size;
1016       } else {
1017          if (first < start_offset[index])
1018             start_offset[index] = first;
1019          if (first + size > end_offset[index])
1020             end_offset[index] = first + size;
1021       }
1022
1023       buffer_mask |= index_bit;
1024    }
1025
1026    /* Upload buffers. */
1027    while (buffer_mask) {
1028       unsigned start, end;
1029       struct pipe_vertex_buffer *real_vb;
1030       const uint8_t *ptr;
1031
1032       i = u_bit_scan(&buffer_mask);
1033
1034       start = start_offset[i];
1035       end = end_offset[i];
1036       assert(start < end);
1037
1038       real_vb = &mgr->real_vertex_buffer[i];
1039       ptr = mgr->vertex_buffer[i].buffer.user;
1040
1041       u_upload_data(mgr->pipe->stream_uploader,
1042                     mgr->has_signed_vb_offset ? 0 : start,
1043                     end - start, 4,
1044                     ptr + start, &real_vb->buffer_offset, &real_vb->buffer.resource);
1045       if (!real_vb->buffer.resource)
1046          return PIPE_ERROR_OUT_OF_MEMORY;
1047
1048       real_vb->buffer_offset -= start;
1049    }
1050
1051    return PIPE_OK;
1052 }
1053
1054 static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
1055 {
1056    /* See if there are any per-vertex attribs which will be uploaded or
1057     * translated. Use bitmasks to get the info instead of looping over vertex
1058     * elements. */
1059    return (mgr->ve->used_vb_mask &
1060            ((mgr->user_vb_mask |
1061              mgr->incompatible_vb_mask |
1062              mgr->ve->incompatible_vb_mask_any) &
1063             mgr->ve->noninstance_vb_mask_any &
1064             mgr->nonzero_stride_vb_mask)) != 0;
1065 }
1066
1067 static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
1068 {
1069    /* Return true if there are hw buffers which don't need to be translated.
1070     *
1071     * We could query whether each buffer is busy, but that would
1072     * be way more costly than this. */
1073    return (mgr->ve->used_vb_mask &
1074            (~mgr->user_vb_mask &
1075             ~mgr->incompatible_vb_mask &
1076             mgr->ve->compatible_vb_mask_all &
1077             mgr->ve->noninstance_vb_mask_any &
1078             mgr->nonzero_stride_vb_mask)) != 0;
1079 }
1080
1081 static void
1082 u_vbuf_get_minmax_index_mapped(const struct pipe_draw_info *info,
1083                                const void *indices, unsigned *out_min_index,
1084                                unsigned *out_max_index)
1085 {
1086    if (!info->count) {
1087       *out_min_index = 0;
1088       *out_max_index = 0;
1089       return;
1090    }
1091
1092    switch (info->index_size) {
1093    case 4: {
1094       const unsigned *ui_indices = (const unsigned*)indices;
1095       unsigned max = 0;
1096       unsigned min = ~0u;
1097       if (info->primitive_restart) {
1098          for (unsigned i = 0; i < info->count; i++) {
1099             if (ui_indices[i] != info->restart_index) {
1100                if (ui_indices[i] > max) max = ui_indices[i];
1101                if (ui_indices[i] < min) min = ui_indices[i];
1102             }
1103          }
1104       }
1105       else {
1106          for (unsigned i = 0; i < info->count; i++) {
1107             if (ui_indices[i] > max) max = ui_indices[i];
1108             if (ui_indices[i] < min) min = ui_indices[i];
1109          }
1110       }
1111       *out_min_index = min;
1112       *out_max_index = max;
1113       break;
1114    }
1115    case 2: {
1116       const unsigned short *us_indices = (const unsigned short*)indices;
1117       unsigned short max = 0;
1118       unsigned short min = ~((unsigned short)0);
1119       if (info->primitive_restart) {
1120          for (unsigned i = 0; i < info->count; i++) {
1121             if (us_indices[i] != info->restart_index) {
1122                if (us_indices[i] > max) max = us_indices[i];
1123                if (us_indices[i] < min) min = us_indices[i];
1124             }
1125          }
1126       }
1127       else {
1128          for (unsigned i = 0; i < info->count; i++) {
1129             if (us_indices[i] > max) max = us_indices[i];
1130             if (us_indices[i] < min) min = us_indices[i];
1131          }
1132       }
1133       *out_min_index = min;
1134       *out_max_index = max;
1135       break;
1136    }
1137    case 1: {
1138       const unsigned char *ub_indices = (const unsigned char*)indices;
1139       unsigned char max = 0;
1140       unsigned char min = ~((unsigned char)0);
1141       if (info->primitive_restart) {
1142          for (unsigned i = 0; i < info->count; i++) {
1143             if (ub_indices[i] != info->restart_index) {
1144                if (ub_indices[i] > max) max = ub_indices[i];
1145                if (ub_indices[i] < min) min = ub_indices[i];
1146             }
1147          }
1148       }
1149       else {
1150          for (unsigned i = 0; i < info->count; i++) {
1151             if (ub_indices[i] > max) max = ub_indices[i];
1152             if (ub_indices[i] < min) min = ub_indices[i];
1153          }
1154       }
1155       *out_min_index = min;
1156       *out_max_index = max;
1157       break;
1158    }
1159    default:
1160       assert(0);
1161    }
1162 }
1163
1164 void u_vbuf_get_minmax_index(struct pipe_context *pipe,
1165                              const struct pipe_draw_info *info,
1166                              unsigned *out_min_index, unsigned *out_max_index)
1167 {
1168    struct pipe_transfer *transfer = NULL;
1169    const void *indices;
1170
1171    if (info->has_user_indices) {
1172       indices = (uint8_t*)info->index.user +
1173                 info->start * info->index_size;
1174    } else {
1175       indices = pipe_buffer_map_range(pipe, info->index.resource,
1176                                       info->start * info->index_size,
1177                                       info->count * info->index_size,
1178                                       PIPE_TRANSFER_READ, &transfer);
1179    }
1180
1181    u_vbuf_get_minmax_index_mapped(info, indices, out_min_index, out_max_index);
1182
1183    if (transfer) {
1184       pipe_buffer_unmap(pipe, transfer);
1185    }
1186 }
1187
1188 static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
1189 {
1190    struct pipe_context *pipe = mgr->pipe;
1191    unsigned start_slot, count;
1192
1193    start_slot = ffs(mgr->dirty_real_vb_mask) - 1;
1194    count = util_last_bit(mgr->dirty_real_vb_mask >> start_slot);
1195
1196    pipe->set_vertex_buffers(pipe, start_slot, count,
1197                             mgr->real_vertex_buffer + start_slot);
1198    mgr->dirty_real_vb_mask = 0;
1199 }
1200
1201 static void
1202 u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
1203                                unsigned *indirect_data, unsigned stride,
1204                                unsigned draw_count)
1205 {
1206    assert(info->index_size);
1207    info->indirect = NULL;
1208
1209    for (unsigned i = 0; i < draw_count; i++) {
1210       unsigned offset = i * stride / 4;
1211
1212       info->count = indirect_data[offset + 0];
1213       info->instance_count = indirect_data[offset + 1];
1214
1215       if (!info->count || !info->instance_count)
1216          continue;
1217
1218       info->start = indirect_data[offset + 2];
1219       info->index_bias = indirect_data[offset + 3];
1220       info->start_instance = indirect_data[offset + 4];
1221
1222       u_vbuf_draw_vbo(mgr, info);
1223    }
1224 }
1225
1226 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
1227 {
1228    struct pipe_context *pipe = mgr->pipe;
1229    int start_vertex;
1230    unsigned min_index;
1231    unsigned num_vertices;
1232    boolean unroll_indices = FALSE;
1233    const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
1234    uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask;
1235    const uint32_t incompatible_vb_mask =
1236       mgr->incompatible_vb_mask & used_vb_mask;
1237    struct pipe_draw_info new_info;
1238
1239    /* Normal draw. No fallback and no user buffers. */
1240    if (!incompatible_vb_mask &&
1241        !mgr->ve->incompatible_elem_mask &&
1242        !user_vb_mask) {
1243
1244       /* Set vertex buffers if needed. */
1245       if (mgr->dirty_real_vb_mask & used_vb_mask) {
1246          u_vbuf_set_driver_vertex_buffers(mgr);
1247       }
1248
1249       pipe->draw_vbo(pipe, info);
1250       return;
1251    }
1252
1253    new_info = *info;
1254
1255    /* Handle indirect (multi)draws. */
1256    if (new_info.indirect) {
1257       const struct pipe_draw_indirect_info *indirect = new_info.indirect;
1258       unsigned draw_count = 0;
1259
1260       /* Get the number of draws. */
1261       if (indirect->indirect_draw_count) {
1262          pipe_buffer_read(pipe, indirect->indirect_draw_count,
1263                           indirect->indirect_draw_count_offset,
1264                           4, &draw_count);
1265       } else {
1266          draw_count = indirect->draw_count;
1267       }
1268
1269       if (!draw_count)
1270          return;
1271
1272       unsigned data_size = (draw_count - 1) * indirect->stride +
1273                            (new_info.index_size ? 20 : 16);
1274       unsigned *data = malloc(data_size);
1275       if (!data)
1276          return; /* report an error? */
1277
1278       /* Read the used buffer range only once, because the read can be
1279        * uncached.
1280        */
1281       pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
1282                        data);
1283
1284       if (info->index_size) {
1285          /* Indexed multidraw. */
1286          unsigned index_bias0 = data[3];
1287          bool index_bias_same = true;
1288
1289          /* If we invoke the translate path, we have to split the multidraw. */
1290          if (incompatible_vb_mask ||
1291              mgr->ve->incompatible_elem_mask) {
1292             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1293                                            indirect->stride, draw_count);
1294             free(data);
1295             return;
1296          }
1297
1298          /* See if index_bias is the same for all draws. */
1299          for (unsigned i = 1; i < draw_count; i++) {
1300             if (data[i * indirect->stride / 4 + 3] != index_bias0) {
1301                index_bias_same = false;
1302                break;
1303             }
1304          }
1305
1306          /* Split the multidraw if index_bias is different. */
1307          if (!index_bias_same) {
1308             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1309                                            indirect->stride, draw_count);
1310             free(data);
1311             return;
1312          }
1313
1314          /* If we don't need to use the translate path and index_bias is
1315           * the same, we can process the multidraw with the time complexity
1316           * equal to 1 draw call (except for the index range computation).
1317           * We only need to compute the index range covering all draw calls
1318           * of the multidraw.
1319           *
1320           * The driver will not look at these values because indirect != NULL.
1321           * These values determine the user buffer bounds to upload.
1322           */
1323          new_info.index_bias = index_bias0;
1324          new_info.min_index = ~0u;
1325          new_info.max_index = 0;
1326          new_info.start_instance = ~0u;
1327          unsigned end_instance = 0;
1328
1329          struct pipe_transfer *transfer = NULL;
1330          const uint8_t *indices;
1331
1332          if (info->has_user_indices) {
1333             indices = (uint8_t*)info->index.user;
1334          } else {
1335             indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
1336                                                 PIPE_TRANSFER_READ, &transfer);
1337          }
1338
1339          for (unsigned i = 0; i < draw_count; i++) {
1340             unsigned offset = i * indirect->stride / 4;
1341             unsigned start = data[offset + 2];
1342             unsigned count = data[offset + 0];
1343             unsigned start_instance = data[offset + 4];
1344             unsigned instance_count = data[offset + 1];
1345
1346             if (!count || !instance_count)
1347                continue;
1348
1349             /* Update the ranges of instances. */
1350             new_info.start_instance = MIN2(new_info.start_instance,
1351                                            start_instance);
1352             end_instance = MAX2(end_instance, start_instance + instance_count);
1353
1354             /* Update the index range. */
1355             unsigned min, max;
1356             new_info.count = count; /* only used by get_minmax_index */
1357             u_vbuf_get_minmax_index_mapped(&new_info,
1358                                            indices +
1359                                            new_info.index_size * start,
1360                                            &min, &max);
1361
1362             new_info.min_index = MIN2(new_info.min_index, min);
1363             new_info.max_index = MAX2(new_info.max_index, max);
1364          }
1365          free(data);
1366
1367          if (transfer)
1368             pipe_buffer_unmap(pipe, transfer);
1369
1370          /* Set the final instance count. */
1371          new_info.instance_count = end_instance - new_info.start_instance;
1372
1373          if (new_info.start_instance == ~0u || !new_info.instance_count)
1374             return;
1375       } else {
1376          /* Non-indexed multidraw.
1377           *
1378           * Keep the draw call indirect and compute minimums & maximums,
1379           * which will determine the user buffer bounds to upload, but
1380           * the driver will not look at these values because indirect != NULL.
1381           *
1382           * This efficiently processes the multidraw with the time complexity
1383           * equal to 1 draw call.
1384           */
1385          new_info.start = ~0u;
1386          new_info.start_instance = ~0u;
1387          unsigned end_vertex = 0;
1388          unsigned end_instance = 0;
1389
1390          for (unsigned i = 0; i < draw_count; i++) {
1391             unsigned offset = i * indirect->stride / 4;
1392             unsigned start = data[offset + 2];
1393             unsigned count = data[offset + 0];
1394             unsigned start_instance = data[offset + 3];
1395             unsigned instance_count = data[offset + 1];
1396
1397             new_info.start = MIN2(new_info.start, start);
1398             new_info.start_instance = MIN2(new_info.start_instance,
1399                                            start_instance);
1400
1401             end_vertex = MAX2(end_vertex, start + count);
1402             end_instance = MAX2(end_instance, start_instance + instance_count);
1403          }
1404          free(data);
1405
1406          /* Set the final counts. */
1407          new_info.count = end_vertex - new_info.start;
1408          new_info.instance_count = end_instance - new_info.start_instance;
1409
1410          if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
1411             return;
1412       }
1413    }
1414
1415    if (new_info.index_size) {
1416       /* See if anything needs to be done for per-vertex attribs. */
1417       if (u_vbuf_need_minmax_index(mgr)) {
1418          unsigned max_index;
1419
1420          if (new_info.max_index != ~0u) {
1421             min_index = new_info.min_index;
1422             max_index = new_info.max_index;
1423          } else {
1424             u_vbuf_get_minmax_index(mgr->pipe, &new_info,
1425                                     &min_index, &max_index);
1426          }
1427
1428          assert(min_index <= max_index);
1429
1430          start_vertex = min_index + new_info.index_bias;
1431          num_vertices = max_index + 1 - min_index;
1432
1433          /* Primitive restart doesn't work when unrolling indices.
1434           * We would have to break this drawing operation into several ones. */
1435          /* Use some heuristic to see if unrolling indices improves
1436           * performance. */
1437          if (!info->indirect &&
1438              !new_info.primitive_restart &&
1439              num_vertices > new_info.count*2 &&
1440              num_vertices - new_info.count > 32 &&
1441              !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
1442             unroll_indices = TRUE;
1443             user_vb_mask &= ~(mgr->nonzero_stride_vb_mask &
1444                               mgr->ve->noninstance_vb_mask_any);
1445          }
1446       } else {
1447          /* Nothing to do for per-vertex attribs. */
1448          start_vertex = 0;
1449          num_vertices = 0;
1450          min_index = 0;
1451       }
1452    } else {
1453       start_vertex = new_info.start;
1454       num_vertices = new_info.count;
1455       min_index = 0;
1456    }
1457
1458    /* Translate vertices with non-native layouts or formats. */
1459    if (unroll_indices ||
1460        incompatible_vb_mask ||
1461        mgr->ve->incompatible_elem_mask) {
1462       if (!u_vbuf_translate_begin(mgr, &new_info, start_vertex, num_vertices,
1463                                   min_index, unroll_indices)) {
1464          debug_warn_once("u_vbuf_translate_begin() failed");
1465          return;
1466       }
1467
1468       if (unroll_indices) {
1469          new_info.index_size = 0;
1470          new_info.index_bias = 0;
1471          new_info.min_index = 0;
1472          new_info.max_index = new_info.count - 1;
1473          new_info.start = 0;
1474       }
1475
1476       user_vb_mask &= ~(incompatible_vb_mask |
1477                         mgr->ve->incompatible_vb_mask_all);
1478    }
1479
1480    /* Upload user buffers. */
1481    if (user_vb_mask) {
1482       if (u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
1483                                 new_info.start_instance,
1484                                 new_info.instance_count) != PIPE_OK) {
1485          debug_warn_once("u_vbuf_upload_buffers() failed");
1486          return;
1487       }
1488
1489       mgr->dirty_real_vb_mask |= user_vb_mask;
1490    }
1491
1492    /*
1493    if (unroll_indices) {
1494       printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
1495              start_vertex, num_vertices);
1496       util_dump_draw_info(stdout, info);
1497       printf("\n");
1498    }
1499
1500    unsigned i;
1501    for (i = 0; i < mgr->nr_vertex_buffers; i++) {
1502       printf("input %i: ", i);
1503       util_dump_vertex_buffer(stdout, mgr->vertex_buffer+i);
1504       printf("\n");
1505    }
1506    for (i = 0; i < mgr->nr_real_vertex_buffers; i++) {
1507       printf("real %i: ", i);
1508       util_dump_vertex_buffer(stdout, mgr->real_vertex_buffer+i);
1509       printf("\n");
1510    }
1511    */
1512
1513    u_upload_unmap(pipe->stream_uploader);
1514    u_vbuf_set_driver_vertex_buffers(mgr);
1515
1516    pipe->draw_vbo(pipe, &new_info);
1517
1518    if (mgr->using_translate) {
1519       u_vbuf_translate_end(mgr);
1520    }
1521 }
1522
1523 void u_vbuf_save_vertex_elements(struct u_vbuf *mgr)
1524 {
1525    assert(!mgr->ve_saved);
1526    mgr->ve_saved = mgr->ve;
1527 }
1528
1529 void u_vbuf_restore_vertex_elements(struct u_vbuf *mgr)
1530 {
1531    if (mgr->ve != mgr->ve_saved) {
1532       struct pipe_context *pipe = mgr->pipe;
1533
1534       mgr->ve = mgr->ve_saved;
1535       pipe->bind_vertex_elements_state(pipe,
1536                                        mgr->ve ? mgr->ve->driver_cso : NULL);
1537    }
1538    mgr->ve_saved = NULL;
1539 }
1540
1541 void u_vbuf_save_vertex_buffer0(struct u_vbuf *mgr)
1542 {
1543    pipe_vertex_buffer_reference(&mgr->vertex_buffer0_saved,
1544                                 &mgr->vertex_buffer[0]);
1545 }
1546
1547 void u_vbuf_restore_vertex_buffer0(struct u_vbuf *mgr)
1548 {
1549    u_vbuf_set_vertex_buffers(mgr, 0, 1, &mgr->vertex_buffer0_saved);
1550    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
1551 }