src/gallium/auxiliary/util/u_vbuf.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2011 Marek Olšák <maraeo@gmail.com>
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * This module uploads user buffers and translates the vertex buffers which
  30  * contain incompatible vertices (i.e. not supported by the driver/hardware)
  31  * into compatible ones, based on the Gallium CAPs.
  32  *
  33  * It does not upload index buffers.
  34  *
  35  * The module heavily uses bitmasks to represent per-buffer and
  36  * per-vertex-element flags to avoid looping over the list of buffers just
  37  * to see if there's a non-zero stride, or user buffer, or unsupported format,
  38  * etc.
  39  *
  40  * There are 3 categories of vertex elements, which are processed separately:
  41  * - per-vertex attribs (stride != 0, instance_divisor == 0)
  42  * - instanced attribs (stride != 0, instance_divisor > 0)
  43  * - constant attribs (stride == 0)
  44  *
  45  * All needed uploads and translations are performed every draw command, but
  46  * only the subset of vertices needed for that draw command is uploaded or
  47  * translated. (the module never translates whole buffers)
  48  *
  49  *
  50  * The module consists of two main parts:
  51  *
  52  *
  53  * 1) Translate (u_vbuf_translate_begin/end)
  54  *
  55  * This is pretty much a vertex fetch fallback. It translates vertices from
  56  * one vertex buffer to another in an unused vertex buffer slot. It does
  57  * whatever is needed to make the vertices readable by the hardware (changes
  58  * vertex formats and aligns offsets and strides). The translate module is
  59  * used here.
  60  *
  61  * Each of the 3 categories is translated to a separate buffer.
  62  * Only the [min_index, max_index] range is translated. For instanced attribs,
  63  * the range is [start_instance, start_instance+instance_count]. For constant
  64  * attribs, the range is [0, 1].
  65  *
  66  *
  67  * 2) User buffer uploading (u_vbuf_upload_buffers)
  68  *
  69  * Only the [min_index, max_index] range is uploaded (just like Translate)
  70  * with a single memcpy.
  71  *
 * This method works best for non-indexed draw operations, or for indexed
 * draw operations where the [min_index, max_index] range is not much larger
 * than the vertex count.
  75  *
  76  * If the range is too big (e.g. one triangle with indices {0, 1, 10000}),
  77  * the per-vertex attribs are uploaded via the translate module, all packed
  78  * into one vertex buffer, and the indexed draw call is turned into
  79  * a non-indexed one in the process. This adds additional complexity
  80  * to the translate part, but it prevents bad apps from bringing your frame
  81  * rate down.
  82  *
  83  *
  84  * If there is nothing to do, it forwards every command to the driver.
  85  * The module also has its own CSO cache of vertex element states.
  86  */
  87
  88 #include "util/u_vbuf.h"
  89
  90 #include "util/u_dump.h"
  91 #include "util/format/u_format.h"
  92 #include "util/u_inlines.h"
  93 #include "util/u_memory.h"
  94 #include "util/u_screen.h"
  95 #include "util/u_upload_mgr.h"
  96 #include "translate/translate.h"
  97 #include "translate/translate_cache.h"
  98 #include "cso_cache/cso_cache.h"
  99 #include "cso_cache/cso_hash.h"
 100
/* Preprocessed vertex element state that u_vbuf keeps for each vertex
 * element CSO: the elements themselves, per-element format information and
 * bitmasks summarizing which elements/buffers need the translate fallback. */
struct u_vbuf_elements {
   /* Number of valid entries in ve[] and in the per-element arrays below. */
   unsigned count;
   struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];

   /* Per-element size of ve[i].src_format.
    * NOTE(review): presumably in bytes; it is filled in where this state
    * is created, which is not visible here -- confirm. */
   unsigned src_format_size[PIPE_MAX_ATTRIBS];

   /* If (ve[i].src_format != native_format[i]), the vertex buffer
    * referenced by the vertex element cannot be used for rendering and
    * its vertex data must be translated to native_format[i]. */
   enum pipe_format native_format[PIPE_MAX_ATTRIBS];
   /* Per-element size of native_format[i]; added to the output stride of
    * the translate key for every translated element. */
   unsigned native_format_size[PIPE_MAX_ATTRIBS];

   /* Which buffers are used by the vertex element state. */
   uint32_t used_vb_mask;
   /* This might mean two things:
    * - src_format != native_format, as discussed above.
    * - src_offset % 4 != 0 (if the caps don't allow such an offset). */
   uint32_t incompatible_elem_mask; /* each bit describes a corresp. attrib  */
   /* Which buffer has at least one vertex element referencing it
    * incompatible. */
   uint32_t incompatible_vb_mask_any;
   /* Which buffer has all vertex elements referencing it incompatible. */
   uint32_t incompatible_vb_mask_all;
   /* Which buffer has at least one vertex element referencing it
    * compatible. */
   uint32_t compatible_vb_mask_any;
   /* Which buffer has all vertex elements referencing it compatible. */
   uint32_t compatible_vb_mask_all;

   /* Which buffer has at least one vertex element referencing it
    * non-instanced. */
   uint32_t noninstance_vb_mask_any;

   /* Driver-side vertex elements object; this is what gets passed to
    * pipe->bind_vertex_elements_state(). */
   void *driver_cso;
};
 136
/* The categories of vertex elements that are translated separately. The
 * values are also used as indices into per-category arrays such as
 * u_vbuf::fallback_vbs. */
enum {
   VB_VERTEX = 0,    /* per-vertex attribs */
   VB_INSTANCE = 1,  /* instanced attribs */
   VB_CONST = 2,     /* constant attribs (zero stride) */
   VB_NUM = 3        /* number of categories */
};
 143
/* The vertex buffer manager: tracks the vertex buffers and vertex elements
 * set by the state tracker and the (possibly translated/uploaded) versions
 * handed to the driver. */
struct u_vbuf {
   /* Copy of the caps passed to u_vbuf_create(). */
   struct u_vbuf_caps caps;
   /* Value of PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET, queried at creation. */
   bool has_signed_vb_offset;

   struct pipe_context *pipe;
   /* Cache of translate objects, looked up by translate_key. */
   struct translate_cache *translate_cache;
   /* u_vbuf's own cache of vertex element states. */
   struct cso_cache *cso_cache;

   /* This is what was set in set_vertex_buffers.
    * May contain user buffers. */
   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
   uint32_t enabled_vb_mask;

   /* Saved vertex buffer. */
   struct pipe_vertex_buffer vertex_buffer0_saved;

   /* Vertex buffers for the driver.
    * There are usually no user buffers. */
   struct pipe_vertex_buffer real_vertex_buffer[PIPE_MAX_ATTRIBS];
   uint32_t dirty_real_vb_mask; /* which buffers are dirty since the last
                                   call of set_vertex_buffers */

   /* Vertex elements.
    * NOTE(review): ve_saved is presumably used by a save/restore pair that
    * is not visible in this part of the file -- confirm. */
   struct u_vbuf_elements *ve, *ve_saved;

   /* Vertex elements used for the translate fallback. */
   struct cso_velems_state fallback_velems;
   /* TRUE while the translate fallback is active, i.e. between
    * u_vbuf_translate_begin() succeeding and u_vbuf_translate_end(). */
   boolean using_translate;
   /* The vertex buffer slot index where translated vertices have been
    * stored in, one per VB_* category. ~0 means no slot is assigned. */
   unsigned fallback_vbs[VB_NUM];
   /* Bitmask of the slots currently assigned in fallback_vbs. */
   unsigned fallback_vbs_mask;

   /* Which buffer is a user buffer. */
   uint32_t user_vb_mask; /* each bit describes a corresp. buffer */
   /* Which buffer is incompatible (unaligned). */
   uint32_t incompatible_vb_mask; /* each bit describes a corresp. buffer */
   /* Which buffer has a non-zero stride. */
   uint32_t nonzero_stride_vb_mask; /* each bit describes a corresp. buffer */
   /* Which buffers are allowed (supported by hardware); the low
    * caps.max_vertex_buffers bits are set at creation. */
   uint32_t allowed_vb_mask;
};
 188
/* Forward declarations: both functions are referenced by
 * u_vbuf_set_vertex_elements_internal() before their definitions. */
static void *
u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
                              const struct pipe_vertex_element *attribs);
static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso);
 193
/* Candidate format fallbacks. For every entry, u_vbuf_get_caps() asks the
 * screen whether 'from' is supported as a vertex buffer format; if it is
 * not, vertices in that format are translated to 'to'. All fallbacks keep
 * the channel count and convert to 32-bit floats. */
static const struct {
   enum pipe_format from, to;
} vbuf_format_fallbacks[] = {
   { PIPE_FORMAT_R32_FIXED,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R32G32_FIXED,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R32G32B32_FIXED,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R32G32B32A32_FIXED,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R16_FLOAT,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R16G16_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R16G16B16_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R16G16B16A16_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R64_FLOAT,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R64G64_FLOAT,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R64G64B64_FLOAT,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R64G64B64A64_FLOAT,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R32_UNORM,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R32G32_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R32G32B32_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R32G32B32A32_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R32_SNORM,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R32G32_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R32G32B32_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R32G32B32A32_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R32_USCALED,          PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R32G32_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R32G32B32_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R32G32B32A32_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R32_SSCALED,          PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R32G32_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R32G32B32_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R32G32B32A32_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R16_UNORM,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R16G16_UNORM,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R16G16B16_UNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R16G16B16A16_UNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R16_SNORM,            PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R16G16_SNORM,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R16G16B16_SNORM,      PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R16G16B16A16_SNORM,   PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R16_USCALED,          PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R16G16_USCALED,       PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R16G16B16_USCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R16G16B16A16_USCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R16_SSCALED,          PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R16G16_SSCALED,       PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R16G16B16_SSCALED,    PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R16G16B16A16_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R8_UNORM,             PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R8G8_UNORM,           PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R8G8B8_UNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R8G8B8A8_UNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R8_SNORM,             PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R8G8_SNORM,           PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R8G8B8_SNORM,         PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R8G8B8A8_SNORM,       PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R8_USCALED,           PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R8G8_USCALED,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R8G8B8_USCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R8G8B8A8_USCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
   { PIPE_FORMAT_R8_SSCALED,           PIPE_FORMAT_R32_FLOAT },
   { PIPE_FORMAT_R8G8_SSCALED,         PIPE_FORMAT_R32G32_FLOAT },
   { PIPE_FORMAT_R8G8B8_SSCALED,       PIPE_FORMAT_R32G32B32_FLOAT },
   { PIPE_FORMAT_R8G8B8A8_SSCALED,     PIPE_FORMAT_R32G32B32A32_FLOAT },
};
 258
 259 void u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps,
 260                      bool needs64b)
 261 {
 262    unsigned i;
 263
 264    memset(caps, 0, sizeof(*caps));
 265
 266    /* I'd rather have a bitfield of which formats are supported and a static
 267     * table of the translations indexed by format, but since we don't have C99
 268     * we can't easily make a sparsely-populated table indexed by format.  So,
 269     * we construct the sparse table here.
 270     */
 271    for (i = 0; i < PIPE_FORMAT_COUNT; i++)
 272       caps->format_translation[i] = i;
 273
 274    for (i = 0; i < ARRAY_SIZE(vbuf_format_fallbacks); i++) {
 275       enum pipe_format format = vbuf_format_fallbacks[i].from;
 276       unsigned comp_bits = util_format_get_component_bits(format, 0, 0);
 277
 278       if ((comp_bits > 32) && !needs64b)
 279          continue;
 280
 281       if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0, 0,
 282                                        PIPE_BIND_VERTEX_BUFFER)) {
 283          caps->format_translation[format] = vbuf_format_fallbacks[i].to;
 284          caps->fallback_always = true;
 285       }
 286    }
 287
 288    caps->buffer_offset_unaligned =
 289       !screen->get_param(screen,
 290                          PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY);
 291    caps->buffer_stride_unaligned =
 292      !screen->get_param(screen,
 293                         PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY);
 294    caps->velem_src_offset_unaligned =
 295       !screen->get_param(screen,
 296                          PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY);
 297    caps->user_vertex_buffers =
 298       screen->get_param(screen, PIPE_CAP_USER_VERTEX_BUFFERS);
 299    caps->max_vertex_buffers =
 300       screen->get_param(screen, PIPE_CAP_MAX_VERTEX_BUFFERS);
 301
 302    /* OpenGL 2.0 requires a minimum of 16 vertex buffers */
 303    if (caps->max_vertex_buffers < 16)
 304       caps->fallback_always = true;
 305
 306    if (!caps->buffer_offset_unaligned ||
 307        !caps->buffer_stride_unaligned ||
 308        !caps->velem_src_offset_unaligned)
 309       caps->fallback_always = true;
 310
 311    if (!caps->fallback_always && !caps->user_vertex_buffers)
 312       caps->fallback_only_for_user_vbuffers = true;
 313 }
 314
 315 struct u_vbuf *
 316 u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps)
 317 {
 318    struct u_vbuf *mgr = CALLOC_STRUCT(u_vbuf);
 319
 320    mgr->caps = *caps;
 321    mgr->pipe = pipe;
 322    mgr->cso_cache = cso_cache_create();
 323    mgr->translate_cache = translate_cache_create();
 324    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 325    mgr->allowed_vb_mask = u_bit_consecutive(0, mgr->caps.max_vertex_buffers);
 326
 327    mgr->has_signed_vb_offset =
 328       pipe->screen->get_param(pipe->screen,
 329                               PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET);
 330
 331    return mgr;
 332 }
 333
 334 /* u_vbuf uses its own caching for vertex elements, because it needs to keep
 335  * its own preprocessed state per vertex element CSO. */
 336 static struct u_vbuf_elements *
 337 u_vbuf_set_vertex_elements_internal(struct u_vbuf *mgr,
 338                                     const struct cso_velems_state *velems)
 339 {
 340    struct pipe_context *pipe = mgr->pipe;
 341    unsigned key_size, hash_key;
 342    struct cso_hash_iter iter;
 343    struct u_vbuf_elements *ve;
 344
 345    /* need to include the count into the stored state data too. */
 346    key_size = sizeof(struct pipe_vertex_element) * velems->count +
 347               sizeof(unsigned);
 348    hash_key = cso_construct_key((void*)velems, key_size);
 349    iter = cso_find_state_template(mgr->cso_cache, hash_key, CSO_VELEMENTS,
 350                                   (void*)velems, key_size);
 351
 352    if (cso_hash_iter_is_null(iter)) {
 353       struct cso_velements *cso = MALLOC_STRUCT(cso_velements);
 354       memcpy(&cso->state, velems, key_size);
 355       cso->data = u_vbuf_create_vertex_elements(mgr, velems->count,
 356                                                 velems->velems);
 357       cso->delete_state = (cso_state_callback)u_vbuf_delete_vertex_elements;
 358       cso->context = (void*)mgr;
 359
 360       iter = cso_insert_state(mgr->cso_cache, hash_key, CSO_VELEMENTS, cso);
 361       ve = cso->data;
 362    } else {
 363       ve = ((struct cso_velements *)cso_hash_iter_data(iter))->data;
 364    }
 365
 366    assert(ve);
 367
 368    if (ve != mgr->ve)
 369       pipe->bind_vertex_elements_state(pipe, ve->driver_cso);
 370
 371    return ve;
 372 }
 373
 374 void u_vbuf_set_vertex_elements(struct u_vbuf *mgr,
 375                                 const struct cso_velems_state *velems)
 376 {
 377    mgr->ve = u_vbuf_set_vertex_elements_internal(mgr, velems);
 378 }
 379
 380 void u_vbuf_unset_vertex_elements(struct u_vbuf *mgr)
 381 {
 382    mgr->ve = NULL;
 383 }
 384
 385 void u_vbuf_destroy(struct u_vbuf *mgr)
 386 {
 387    struct pipe_screen *screen = mgr->pipe->screen;
 388    unsigned i;
 389    const unsigned num_vb = screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
 390                                                     PIPE_SHADER_CAP_MAX_INPUTS);
 391
 392    mgr->pipe->set_vertex_buffers(mgr->pipe, 0, num_vb, NULL);
 393
 394    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 395       pipe_vertex_buffer_unreference(&mgr->vertex_buffer[i]);
 396    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
 397       pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[i]);
 398
 399    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
 400
 401    translate_cache_destroy(mgr->translate_cache);
 402    cso_cache_delete(mgr->cso_cache);
 403    FREE(mgr);
 404 }
 405
 406 static enum pipe_error
 407 u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 408                          const struct pipe_draw_info *info,
 409                          unsigned vb_mask, unsigned out_vb,
 410                          int start_vertex, unsigned num_vertices,
 411                          int min_index, boolean unroll_indices)
 412 {
 413    struct translate *tr;
 414    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
 415    struct pipe_resource *out_buffer = NULL;
 416    uint8_t *out_map;
 417    unsigned out_offset, mask;
 418
 419    /* Get a translate object. */
 420    tr = translate_cache_find(mgr->translate_cache, key);
 421
 422    /* Map buffers we want to translate. */
 423    mask = vb_mask;
 424    while (mask) {
 425       struct pipe_vertex_buffer *vb;
 426       unsigned offset;
 427       uint8_t *map;
 428       unsigned i = u_bit_scan(&mask);
 429
 430       vb = &mgr->vertex_buffer[i];
 431       offset = vb->buffer_offset + vb->stride * start_vertex;
 432
 433       if (vb->is_user_buffer) {
 434          map = (uint8_t*)vb->buffer.user + offset;
 435       } else {
 436          unsigned size = vb->stride ? num_vertices * vb->stride
 437                                     : sizeof(double)*4;
 438
 439          if (!vb->buffer.resource)
 440             continue;
 441
 442          if (offset + size > vb->buffer.resource->width0) {
 443             /* Don't try to map past end of buffer.  This often happens when
 444              * we're translating an attribute that's at offset > 0 from the
 445              * start of the vertex.  If we'd subtract attrib's offset from
 446              * the size, this probably wouldn't happen.
 447              */
 448             size = vb->buffer.resource->width0 - offset;
 449
 450             /* Also adjust num_vertices.  A common user error is to call
 451              * glDrawRangeElements() with incorrect 'end' argument.  The 'end
 452              * value should be the max index value, but people often
 453              * accidentally add one to this value.  This adjustment avoids
 454              * crashing (by reading past the end of a hardware buffer mapping)
 455              * when people do that.
 456              */
 457             num_vertices = (size + vb->stride - 1) / vb->stride;
 458          }
 459
 460          map = pipe_buffer_map_range(mgr->pipe, vb->buffer.resource, offset, size,
 461                                      PIPE_TRANSFER_READ, &vb_transfer[i]);
 462       }
 463
 464       /* Subtract min_index so that indexing with the index buffer works. */
 465       if (unroll_indices) {
 466          map -= (ptrdiff_t)vb->stride * min_index;
 467       }
 468
 469       tr->set_buffer(tr, i, map, vb->stride, info->max_index);
 470    }
 471
 472    /* Translate. */
 473    if (unroll_indices) {
 474       struct pipe_transfer *transfer = NULL;
 475       const unsigned offset = info->start * info->index_size;
 476       uint8_t *map;
 477
 478       /* Create and map the output buffer. */
 479       u_upload_alloc(mgr->pipe->stream_uploader, 0,
 480                      key->output_stride * info->count, 4,
 481                      &out_offset, &out_buffer,
 482                      (void**)&out_map);
 483       if (!out_buffer)
 484          return PIPE_ERROR_OUT_OF_MEMORY;
 485
 486       if (info->has_user_indices) {
 487          map = (uint8_t*)info->index.user + offset;
 488       } else {
 489          map = pipe_buffer_map_range(mgr->pipe, info->index.resource, offset,
 490                                      info->count * info->index_size,
 491                                      PIPE_TRANSFER_READ, &transfer);
 492       }
 493
 494       switch (info->index_size) {
 495       case 4:
 496          tr->run_elts(tr, (unsigned*)map, info->count, 0, 0, out_map);
 497          break;
 498       case 2:
 499          tr->run_elts16(tr, (uint16_t*)map, info->count, 0, 0, out_map);
 500          break;
 501       case 1:
 502          tr->run_elts8(tr, map, info->count, 0, 0, out_map);
 503          break;
 504       }
 505
 506       if (transfer) {
 507          pipe_buffer_unmap(mgr->pipe, transfer);
 508       }
 509    } else {
 510       /* Create and map the output buffer. */
 511       u_upload_alloc(mgr->pipe->stream_uploader,
 512                      mgr->has_signed_vb_offset ?
 513                         0 : key->output_stride * start_vertex,
 514                      key->output_stride * num_vertices, 4,
 515                      &out_offset, &out_buffer,
 516                      (void**)&out_map);
 517       if (!out_buffer)
 518          return PIPE_ERROR_OUT_OF_MEMORY;
 519
 520       out_offset -= key->output_stride * start_vertex;
 521
 522       tr->run(tr, 0, num_vertices, 0, 0, out_map);
 523    }
 524
 525    /* Unmap all buffers. */
 526    mask = vb_mask;
 527    while (mask) {
 528       unsigned i = u_bit_scan(&mask);
 529
 530       if (vb_transfer[i]) {
 531          pipe_buffer_unmap(mgr->pipe, vb_transfer[i]);
 532       }
 533    }
 534
 535    /* Setup the new vertex buffer. */
 536    mgr->real_vertex_buffer[out_vb].buffer_offset = out_offset;
 537    mgr->real_vertex_buffer[out_vb].stride = key->output_stride;
 538
 539    /* Move the buffer reference. */
 540    pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[out_vb]);
 541    mgr->real_vertex_buffer[out_vb].buffer.resource = out_buffer;
 542    mgr->real_vertex_buffer[out_vb].is_user_buffer = false;
 543
 544    return PIPE_OK;
 545 }
 546
 547 static boolean
 548 u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,
 549                                     unsigned mask[VB_NUM])
 550 {
 551    unsigned type;
 552    unsigned fallback_vbs[VB_NUM];
 553    /* Set the bit for each buffer which is incompatible, or isn't set. */
 554    uint32_t unused_vb_mask =
 555       mgr->ve->incompatible_vb_mask_all | mgr->incompatible_vb_mask |
 556       ~mgr->enabled_vb_mask;
 557    uint32_t unused_vb_mask_orig;
 558    boolean insufficient_buffers = false;
 559
 560    /* No vertex buffers available at all */
 561    if (!unused_vb_mask)
 562       return FALSE;
 563
 564    memset(fallback_vbs, ~0, sizeof(fallback_vbs));
 565    mgr->fallback_vbs_mask = 0;
 566
 567    /* Find free slots for each type if needed. */
 568    unused_vb_mask_orig = unused_vb_mask;
 569    for (type = 0; type < VB_NUM; type++) {
 570       if (mask[type]) {
 571          uint32_t index;
 572
 573          if (!unused_vb_mask) {
 574             insufficient_buffers = true;
 575             break;
 576          }
 577
 578          index = ffs(unused_vb_mask) - 1;
 579          fallback_vbs[type] = index;
 580          mgr->fallback_vbs_mask |= 1 << index;
 581          unused_vb_mask &= ~(1 << index);
 582          /*printf("found slot=%i for type=%i\n", index, type);*/
 583       }
 584    }
 585
 586    if (insufficient_buffers) {
 587       /* not enough vbs for all types supported by the hardware, they will have to share one
 588        * buffer */
 589       uint32_t index = ffs(unused_vb_mask_orig) - 1;
 590       /* When sharing one vertex buffer use per-vertex frequency for everything. */
 591       fallback_vbs[VB_VERTEX] = index;
 592       mgr->fallback_vbs_mask = 1 << index;
 593       mask[VB_VERTEX] = mask[VB_VERTEX] | mask[VB_CONST] | mask[VB_INSTANCE];
 594       mask[VB_CONST] = 0;
 595       mask[VB_INSTANCE] = 0;
 596    }
 597
 598    for (type = 0; type < VB_NUM; type++) {
 599       if (mask[type]) {
 600          mgr->dirty_real_vb_mask |= 1 << fallback_vbs[type];
 601       }
 602    }
 603
 604    memcpy(mgr->fallback_vbs, fallback_vbs, sizeof(fallback_vbs));
 605    return TRUE;
 606 }
 607
 608 static boolean
 609 u_vbuf_translate_begin(struct u_vbuf *mgr,
 610                        const struct pipe_draw_info *info,
 611                        int start_vertex, unsigned num_vertices,
 612                        int min_index, boolean unroll_indices)
 613 {
 614    unsigned mask[VB_NUM] = {0};
 615    struct translate_key key[VB_NUM];
 616    unsigned elem_index[VB_NUM][PIPE_MAX_ATTRIBS]; /* ... into key.elements */
 617    unsigned i, type;
 618    const unsigned incompatible_vb_mask = mgr->incompatible_vb_mask &
 619                                          mgr->ve->used_vb_mask;
 620
 621    const int start[VB_NUM] = {
 622       start_vertex,           /* VERTEX */
 623       info->start_instance,   /* INSTANCE */
 624       0                       /* CONST */
 625    };
 626
 627    const unsigned num[VB_NUM] = {
 628       num_vertices,           /* VERTEX */
 629       info->instance_count,   /* INSTANCE */
 630       1                       /* CONST */
 631    };
 632
 633    memset(key, 0, sizeof(key));
 634    memset(elem_index, ~0, sizeof(elem_index));
 635
 636    /* See if there are vertex attribs of each type to translate and
 637     * which ones. */
 638    for (i = 0; i < mgr->ve->count; i++) {
 639       unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;
 640
 641       if (!mgr->vertex_buffer[vb_index].stride) {
 642          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 643              !(incompatible_vb_mask & (1 << vb_index))) {
 644             continue;
 645          }
 646          mask[VB_CONST] |= 1 << vb_index;
 647       } else if (mgr->ve->ve[i].instance_divisor) {
 648          if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 649              !(incompatible_vb_mask & (1 << vb_index))) {
 650             continue;
 651          }
 652          mask[VB_INSTANCE] |= 1 << vb_index;
 653       } else {
 654          if (!unroll_indices &&
 655              !(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 656              !(incompatible_vb_mask & (1 << vb_index))) {
 657             continue;
 658          }
 659          mask[VB_VERTEX] |= 1 << vb_index;
 660       }
 661    }
 662
 663    assert(mask[VB_VERTEX] || mask[VB_INSTANCE] || mask[VB_CONST]);
 664
 665    /* Find free vertex buffer slots. */
 666    if (!u_vbuf_translate_find_free_vb_slots(mgr, mask)) {
 667       return FALSE;
 668    }
 669
 670    /* Initialize the translate keys. */
 671    for (i = 0; i < mgr->ve->count; i++) {
 672       struct translate_key *k;
 673       struct translate_element *te;
 674       enum pipe_format output_format = mgr->ve->native_format[i];
 675       unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
 676       bit = 1 << vb_index;
 677
 678       if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
 679           !(incompatible_vb_mask & (1 << vb_index)) &&
 680           (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
 681          continue;
 682       }
 683
 684       /* Set type to what we will translate.
 685        * Whether vertex, instance, or constant attribs. */
 686       for (type = 0; type < VB_NUM; type++) {
 687          if (mask[type] & bit) {
 688             break;
 689          }
 690       }
 691       assert(type < VB_NUM);
 692       if (mgr->ve->ve[i].src_format != output_format)
 693          assert(translate_is_output_format_supported(output_format));
 694       /*printf("velem=%i type=%i\n", i, type);*/
 695
 696       /* Add the vertex element. */
 697       k = &key[type];
 698       elem_index[type][i] = k->nr_elements;
 699
 700       te = &k->element[k->nr_elements];
 701       te->type = TRANSLATE_ELEMENT_NORMAL;
 702       te->instance_divisor = 0;
 703       te->input_buffer = vb_index;
 704       te->input_format = mgr->ve->ve[i].src_format;
 705       te->input_offset = mgr->ve->ve[i].src_offset;
 706       te->output_format = output_format;
 707       te->output_offset = k->output_stride;
 708
 709       k->output_stride += mgr->ve->native_format_size[i];
 710       k->nr_elements++;
 711    }
 712
 713    /* Translate buffers. */
 714    for (type = 0; type < VB_NUM; type++) {
 715       if (key[type].nr_elements) {
 716          enum pipe_error err;
 717          err = u_vbuf_translate_buffers(mgr, &key[type], info, mask[type],
 718                                         mgr->fallback_vbs[type],
 719                                         start[type], num[type], min_index,
 720                                         unroll_indices && type == VB_VERTEX);
 721          if (err != PIPE_OK)
 722             return FALSE;
 723
 724          /* Fixup the stride for constant attribs. */
 725          if (type == VB_CONST) {
 726             mgr->real_vertex_buffer[mgr->fallback_vbs[VB_CONST]].stride = 0;
 727          }
 728       }
 729    }
 730
 731    /* Setup new vertex elements. */
 732    for (i = 0; i < mgr->ve->count; i++) {
 733       for (type = 0; type < VB_NUM; type++) {
 734          if (elem_index[type][i] < key[type].nr_elements) {
 735             struct translate_element *te = &key[type].element[elem_index[type][i]];
 736             mgr->fallback_velems.velems[i].instance_divisor = mgr->ve->ve[i].instance_divisor;
 737             mgr->fallback_velems.velems[i].src_format = te->output_format;
 738             mgr->fallback_velems.velems[i].src_offset = te->output_offset;
 739             mgr->fallback_velems.velems[i].vertex_buffer_index = mgr->fallback_vbs[type];
 740
 741             /* elem_index[type][i] can only be set for one type. */
 742             assert(type > VB_INSTANCE || elem_index[type+1][i] == ~0u);
 743             assert(type > VB_VERTEX   || elem_index[type+2][i] == ~0u);
 744             break;
 745          }
 746       }
 747       /* No translating, just copy the original vertex element over. */
 748       if (type == VB_NUM) {
 749          memcpy(&mgr->fallback_velems.velems[i], &mgr->ve->ve[i],
 750                 sizeof(struct pipe_vertex_element));
 751       }
 752    }
 753
 754    mgr->fallback_velems.count = mgr->ve->count;
 755
 756    u_vbuf_set_vertex_elements_internal(mgr, &mgr->fallback_velems);
 757    mgr->using_translate = TRUE;
 758    return TRUE;
 759 }
 760
 761 static void u_vbuf_translate_end(struct u_vbuf *mgr)
 762 {
 763    unsigned i;
 764
 765    /* Restore vertex elements. */
 766    mgr->pipe->bind_vertex_elements_state(mgr->pipe, mgr->ve->driver_cso);
 767    mgr->using_translate = FALSE;
 768
 769    /* Unreference the now-unused VBOs. */
 770    for (i = 0; i < VB_NUM; i++) {
 771       unsigned vb = mgr->fallback_vbs[i];
 772       if (vb != ~0u) {
 773          pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer.resource, NULL);
 774          mgr->fallback_vbs[i] = ~0;
 775       }
 776    }
 777    /* This will cause the buffer to be unbound in the driver later. */
 778    mgr->dirty_real_vb_mask |= mgr->fallback_vbs_mask;
 779    mgr->fallback_vbs_mask = 0;
 780 }
 781
 782 static void *
 783 u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 784                               const struct pipe_vertex_element *attribs)
 785 {
 786    struct pipe_context *pipe = mgr->pipe;
 787    unsigned i;
 788    struct pipe_vertex_element driver_attribs[PIPE_MAX_ATTRIBS];
 789    struct u_vbuf_elements *ve = CALLOC_STRUCT(u_vbuf_elements);
 790    uint32_t used_buffers = 0;
 791
 792    ve->count = count;
 793
 794    memcpy(ve->ve, attribs, sizeof(struct pipe_vertex_element) * count);
 795    memcpy(driver_attribs, attribs, sizeof(struct pipe_vertex_element) * count);
 796
 797    /* Set the best native format in case the original format is not
 798     * supported. */
 799    for (i = 0; i < count; i++) {
 800       enum pipe_format format = ve->ve[i].src_format;
 801
 802       ve->src_format_size[i] = util_format_get_blocksize(format);
 803
 804       used_buffers |= 1 << ve->ve[i].vertex_buffer_index;
 805
 806       if (!ve->ve[i].instance_divisor) {
 807          ve->noninstance_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 808       }
 809
 810       format = mgr->caps.format_translation[format];
 811
 812       driver_attribs[i].src_format = format;
 813       ve->native_format[i] = format;
 814       ve->native_format_size[i] =
 815             util_format_get_blocksize(ve->native_format[i]);
 816
 817       if (ve->ve[i].src_format != format ||
 818           (!mgr->caps.velem_src_offset_unaligned &&
 819            ve->ve[i].src_offset % 4 != 0)) {
 820          ve->incompatible_elem_mask |= 1 << i;
 821          ve->incompatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 822       } else {
 823          ve->compatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
 824       }
 825    }
 826
 827    if (used_buffers & ~mgr->allowed_vb_mask) {
 828       /* More vertex buffers are used than the hardware supports.  In
 829        * principle, we only need to make sure that less vertex buffers are
 830        * used, and mark some of the latter vertex buffers as incompatible.
 831        * For now, mark all vertex buffers as incompatible.
 832        */
 833       ve->incompatible_vb_mask_any = used_buffers;
 834       ve->compatible_vb_mask_any = 0;
 835       ve->incompatible_elem_mask = u_bit_consecutive(0, count);
 836    }
 837
 838    ve->used_vb_mask = used_buffers;
 839    ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers;
 840    ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers;
 841
 842    /* Align the formats and offsets to the size of DWORD if needed. */
 843    if (!mgr->caps.velem_src_offset_unaligned) {
 844       for (i = 0; i < count; i++) {
 845          ve->native_format_size[i] = align(ve->native_format_size[i], 4);
 846          driver_attribs[i].src_offset = align(ve->ve[i].src_offset, 4);
 847       }
 848    }
 849
 850    /* Only create driver CSO if no incompatible elements */
 851    if (!ve->incompatible_elem_mask) {
 852       ve->driver_cso =
 853          pipe->create_vertex_elements_state(pipe, count, driver_attribs);
 854    }
 855
 856    return ve;
 857 }
 858
 859 static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso)
 860 {
 861    struct pipe_context *pipe = mgr->pipe;
 862    struct u_vbuf_elements *ve = cso;
 863
 864    if (ve->driver_cso)
 865       pipe->delete_vertex_elements_state(pipe, ve->driver_cso);
 866    FREE(ve);
 867 }
 868
 869 void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
 870                                unsigned start_slot, unsigned count,
 871                                const struct pipe_vertex_buffer *bufs)
 872 {
 873    unsigned i;
 874    /* which buffers are enabled */
 875    uint32_t enabled_vb_mask = 0;
 876    /* which buffers are in user memory */
 877    uint32_t user_vb_mask = 0;
 878    /* which buffers are incompatible with the driver */
 879    uint32_t incompatible_vb_mask = 0;
 880    /* which buffers have a non-zero stride */
 881    uint32_t nonzero_stride_vb_mask = 0;
 882    const uint32_t mask = ~(((1ull << count) - 1) << start_slot);
 883
 884    /* Zero out the bits we are going to rewrite completely. */
 885    mgr->user_vb_mask &= mask;
 886    mgr->incompatible_vb_mask &= mask;
 887    mgr->nonzero_stride_vb_mask &= mask;
 888    mgr->enabled_vb_mask &= mask;
 889
 890    if (!bufs) {
 891       struct pipe_context *pipe = mgr->pipe;
 892       /* Unbind. */
 893       mgr->dirty_real_vb_mask &= mask;
 894
 895       for (i = 0; i < count; i++) {
 896          unsigned dst_index = start_slot + i;
 897
 898          pipe_vertex_buffer_unreference(&mgr->vertex_buffer[dst_index]);
 899          pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[dst_index]);
 900       }
 901
 902       pipe->set_vertex_buffers(pipe, start_slot, count, NULL);
 903       return;
 904    }
 905
 906    for (i = 0; i < count; i++) {
 907       unsigned dst_index = start_slot + i;
 908       const struct pipe_vertex_buffer *vb = &bufs[i];
 909       struct pipe_vertex_buffer *orig_vb = &mgr->vertex_buffer[dst_index];
 910       struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[dst_index];
 911
 912       if (!vb->buffer.resource) {
 913          pipe_vertex_buffer_unreference(orig_vb);
 914          pipe_vertex_buffer_unreference(real_vb);
 915          continue;
 916       }
 917
 918       pipe_vertex_buffer_reference(orig_vb, vb);
 919
 920       if (vb->stride) {
 921          nonzero_stride_vb_mask |= 1 << dst_index;
 922       }
 923       enabled_vb_mask |= 1 << dst_index;
 924
 925       if ((!mgr->caps.buffer_offset_unaligned && vb->buffer_offset % 4 != 0) ||
 926           (!mgr->caps.buffer_stride_unaligned && vb->stride % 4 != 0)) {
 927          incompatible_vb_mask |= 1 << dst_index;
 928          real_vb->buffer_offset = vb->buffer_offset;
 929          real_vb->stride = vb->stride;
 930          pipe_vertex_buffer_unreference(real_vb);
 931          real_vb->is_user_buffer = false;
 932          continue;
 933       }
 934
 935       if (!mgr->caps.user_vertex_buffers && vb->is_user_buffer) {
 936          user_vb_mask |= 1 << dst_index;
 937          real_vb->buffer_offset = vb->buffer_offset;
 938          real_vb->stride = vb->stride;
 939          pipe_vertex_buffer_unreference(real_vb);
 940          real_vb->is_user_buffer = false;
 941          continue;
 942       }
 943
 944       pipe_vertex_buffer_reference(real_vb, vb);
 945    }
 946
 947    mgr->user_vb_mask |= user_vb_mask;
 948    mgr->incompatible_vb_mask |= incompatible_vb_mask;
 949    mgr->nonzero_stride_vb_mask |= nonzero_stride_vb_mask;
 950    mgr->enabled_vb_mask |= enabled_vb_mask;
 951
 952    /* All changed buffers are marked as dirty, even the NULL ones,
 953     * which will cause the NULL buffers to be unbound in the driver later. */
 954    mgr->dirty_real_vb_mask |= ~mask;
 955 }
 956
 957 static enum pipe_error
 958 u_vbuf_upload_buffers(struct u_vbuf *mgr,
 959                       int start_vertex, unsigned num_vertices,
 960                       int start_instance, unsigned num_instances)
 961 {
 962    unsigned i;
 963    unsigned nr_velems = mgr->ve->count;
 964    const struct pipe_vertex_element *velems =
 965          mgr->using_translate ? mgr->fallback_velems.velems : mgr->ve->ve;
 966    unsigned start_offset[PIPE_MAX_ATTRIBS];
 967    unsigned end_offset[PIPE_MAX_ATTRIBS];
 968    uint32_t buffer_mask = 0;
 969
 970    /* Determine how much data needs to be uploaded. */
 971    for (i = 0; i < nr_velems; i++) {
 972       const struct pipe_vertex_element *velem = &velems[i];
 973       unsigned index = velem->vertex_buffer_index;
 974       struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
 975       unsigned instance_div, first, size, index_bit;
 976
 977       /* Skip the buffers generated by translate. */
 978       if ((1 << index) & mgr->fallback_vbs_mask) {
 979          continue;
 980       }
 981
 982       if (!vb->is_user_buffer) {
 983          continue;
 984       }
 985
 986       instance_div = velem->instance_divisor;
 987       first = vb->buffer_offset + velem->src_offset;
 988
 989       if (!vb->stride) {
 990          /* Constant attrib. */
 991          size = mgr->ve->src_format_size[i];
 992       } else if (instance_div) {
 993          /* Per-instance attrib. */
 994
 995          /* Figure out how many instances we'll render given instance_div.  We
 996           * can't use the typical div_round_up() pattern because the CTS uses
 997           * instance_div = ~0 for a test, which overflows div_round_up()'s
 998           * addition.
 999           */
1000          unsigned count = num_instances / instance_div;
1001          if (count * instance_div != num_instances)
1002             count++;
1003
1004          first += vb->stride * start_instance;
1005          size = vb->stride * (count - 1) + mgr->ve->src_format_size[i];
1006       } else {
1007          /* Per-vertex attrib. */
1008          first += vb->stride * start_vertex;
1009          size = vb->stride * (num_vertices - 1) + mgr->ve->src_format_size[i];
1010       }
1011
1012       index_bit = 1 << index;
1013
1014       /* Update offsets. */
1015       if (!(buffer_mask & index_bit)) {
1016          start_offset[index] = first;
1017          end_offset[index] = first + size;
1018       } else {
1019          if (first < start_offset[index])
1020             start_offset[index] = first;
1021          if (first + size > end_offset[index])
1022             end_offset[index] = first + size;
1023       }
1024
1025       buffer_mask |= index_bit;
1026    }
1027
1028    /* Upload buffers. */
1029    while (buffer_mask) {
1030       unsigned start, end;
1031       struct pipe_vertex_buffer *real_vb;
1032       const uint8_t *ptr;
1033
1034       i = u_bit_scan(&buffer_mask);
1035
1036       start = start_offset[i];
1037       end = end_offset[i];
1038       assert(start < end);
1039
1040       real_vb = &mgr->real_vertex_buffer[i];
1041       ptr = mgr->vertex_buffer[i].buffer.user;
1042
1043       u_upload_data(mgr->pipe->stream_uploader,
1044                     mgr->has_signed_vb_offset ? 0 : start,
1045                     end - start, 4,
1046                     ptr + start, &real_vb->buffer_offset, &real_vb->buffer.resource);
1047       if (!real_vb->buffer.resource)
1048          return PIPE_ERROR_OUT_OF_MEMORY;
1049
1050       real_vb->buffer_offset -= start;
1051    }
1052
1053    return PIPE_OK;
1054 }
1055
1056 static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
1057 {
1058    /* See if there are any per-vertex attribs which will be uploaded or
1059     * translated. Use bitmasks to get the info instead of looping over vertex
1060     * elements. */
1061    return (mgr->ve->used_vb_mask &
1062            ((mgr->user_vb_mask |
1063              mgr->incompatible_vb_mask |
1064              mgr->ve->incompatible_vb_mask_any) &
1065             mgr->ve->noninstance_vb_mask_any &
1066             mgr->nonzero_stride_vb_mask)) != 0;
1067 }
1068
1069 static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
1070 {
1071    /* Return true if there are hw buffers which don't need to be translated.
1072     *
1073     * We could query whether each buffer is busy, but that would
1074     * be way more costly than this. */
1075    return (mgr->ve->used_vb_mask &
1076            (~mgr->user_vb_mask &
1077             ~mgr->incompatible_vb_mask &
1078             mgr->ve->compatible_vb_mask_all &
1079             mgr->ve->noninstance_vb_mask_any &
1080             mgr->nonzero_stride_vb_mask)) != 0;
1081 }
1082
1083 static void
1084 u_vbuf_get_minmax_index_mapped(const struct pipe_draw_info *info,
1085                                const void *indices, unsigned *out_min_index,
1086                                unsigned *out_max_index)
1087 {
1088    if (!info->count) {
1089       *out_min_index = 0;
1090       *out_max_index = 0;
1091       return;
1092    }
1093
1094    switch (info->index_size) {
1095    case 4: {
1096       const unsigned *ui_indices = (const unsigned*)indices;
1097       unsigned max = 0;
1098       unsigned min = ~0u;
1099       if (info->primitive_restart) {
1100          for (unsigned i = 0; i < info->count; i++) {
1101             if (ui_indices[i] != info->restart_index) {
1102                if (ui_indices[i] > max) max = ui_indices[i];
1103                if (ui_indices[i] < min) min = ui_indices[i];
1104             }
1105          }
1106       }
1107       else {
1108          for (unsigned i = 0; i < info->count; i++) {
1109             if (ui_indices[i] > max) max = ui_indices[i];
1110             if (ui_indices[i] < min) min = ui_indices[i];
1111          }
1112       }
1113       *out_min_index = min;
1114       *out_max_index = max;
1115       break;
1116    }
1117    case 2: {
1118       const unsigned short *us_indices = (const unsigned short*)indices;
1119       unsigned short max = 0;
1120       unsigned short min = ~((unsigned short)0);
1121       if (info->primitive_restart) {
1122          for (unsigned i = 0; i < info->count; i++) {
1123             if (us_indices[i] != info->restart_index) {
1124                if (us_indices[i] > max) max = us_indices[i];
1125                if (us_indices[i] < min) min = us_indices[i];
1126             }
1127          }
1128       }
1129       else {
1130          for (unsigned i = 0; i < info->count; i++) {
1131             if (us_indices[i] > max) max = us_indices[i];
1132             if (us_indices[i] < min) min = us_indices[i];
1133          }
1134       }
1135       *out_min_index = min;
1136       *out_max_index = max;
1137       break;
1138    }
1139    case 1: {
1140       const unsigned char *ub_indices = (const unsigned char*)indices;
1141       unsigned char max = 0;
1142       unsigned char min = ~((unsigned char)0);
1143       if (info->primitive_restart) {
1144          for (unsigned i = 0; i < info->count; i++) {
1145             if (ub_indices[i] != info->restart_index) {
1146                if (ub_indices[i] > max) max = ub_indices[i];
1147                if (ub_indices[i] < min) min = ub_indices[i];
1148             }
1149          }
1150       }
1151       else {
1152          for (unsigned i = 0; i < info->count; i++) {
1153             if (ub_indices[i] > max) max = ub_indices[i];
1154             if (ub_indices[i] < min) min = ub_indices[i];
1155          }
1156       }
1157       *out_min_index = min;
1158       *out_max_index = max;
1159       break;
1160    }
1161    default:
1162       unreachable("bad index size");
1163    }
1164 }
1165
1166 void u_vbuf_get_minmax_index(struct pipe_context *pipe,
1167                              const struct pipe_draw_info *info,
1168                              unsigned *out_min_index, unsigned *out_max_index)
1169 {
1170    struct pipe_transfer *transfer = NULL;
1171    const void *indices;
1172
1173    if (info->has_user_indices) {
1174       indices = (uint8_t*)info->index.user +
1175                 info->start * info->index_size;
1176    } else {
1177       indices = pipe_buffer_map_range(pipe, info->index.resource,
1178                                       info->start * info->index_size,
1179                                       info->count * info->index_size,
1180                                       PIPE_TRANSFER_READ, &transfer);
1181    }
1182
1183    u_vbuf_get_minmax_index_mapped(info, indices, out_min_index, out_max_index);
1184
1185    if (transfer) {
1186       pipe_buffer_unmap(pipe, transfer);
1187    }
1188 }
1189
1190 static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
1191 {
1192    struct pipe_context *pipe = mgr->pipe;
1193    unsigned start_slot, count;
1194
1195    start_slot = ffs(mgr->dirty_real_vb_mask) - 1;
1196    count = util_last_bit(mgr->dirty_real_vb_mask >> start_slot);
1197
1198    pipe->set_vertex_buffers(pipe, start_slot, count,
1199                             mgr->real_vertex_buffer + start_slot);
1200    mgr->dirty_real_vb_mask = 0;
1201 }
1202
1203 static void
1204 u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
1205                                unsigned *indirect_data, unsigned stride,
1206                                unsigned draw_count)
1207 {
1208    assert(info->index_size);
1209    info->indirect = NULL;
1210
1211    for (unsigned i = 0; i < draw_count; i++) {
1212       unsigned offset = i * stride / 4;
1213
1214       info->count = indirect_data[offset + 0];
1215       info->instance_count = indirect_data[offset + 1];
1216
1217       if (!info->count || !info->instance_count)
1218          continue;
1219
1220       info->start = indirect_data[offset + 2];
1221       info->index_bias = indirect_data[offset + 3];
1222       info->start_instance = indirect_data[offset + 4];
1223
1224       u_vbuf_draw_vbo(mgr, info);
1225    }
1226 }
1227
1228 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
1229 {
1230    struct pipe_context *pipe = mgr->pipe;
1231    int start_vertex;
1232    unsigned min_index;
1233    unsigned num_vertices;
1234    boolean unroll_indices = FALSE;
1235    const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
1236    uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask;
1237    const uint32_t incompatible_vb_mask =
1238       mgr->incompatible_vb_mask & used_vb_mask;
1239    struct pipe_draw_info new_info;
1240
1241    /* Normal draw. No fallback and no user buffers. */
1242    if (!incompatible_vb_mask &&
1243        !mgr->ve->incompatible_elem_mask &&
1244        !user_vb_mask) {
1245
1246       /* Set vertex buffers if needed. */
1247       if (mgr->dirty_real_vb_mask & used_vb_mask) {
1248          u_vbuf_set_driver_vertex_buffers(mgr);
1249       }
1250
1251       pipe->draw_vbo(pipe, info);
1252       return;
1253    }
1254
1255    new_info = *info;
1256
1257    /* Handle indirect (multi)draws. */
1258    if (new_info.indirect) {
1259       const struct pipe_draw_indirect_info *indirect = new_info.indirect;
1260       unsigned draw_count = 0;
1261
1262       /* Get the number of draws. */
1263       if (indirect->indirect_draw_count) {
1264          pipe_buffer_read(pipe, indirect->indirect_draw_count,
1265                           indirect->indirect_draw_count_offset,
1266                           4, &draw_count);
1267       } else {
1268          draw_count = indirect->draw_count;
1269       }
1270
1271       if (!draw_count)
1272          return;
1273
1274       unsigned data_size = (draw_count - 1) * indirect->stride +
1275                            (new_info.index_size ? 20 : 16);
1276       unsigned *data = malloc(data_size);
1277       if (!data)
1278          return; /* report an error? */
1279
1280       /* Read the used buffer range only once, because the read can be
1281        * uncached.
1282        */
1283       pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
1284                        data);
1285
1286       if (info->index_size) {
1287          /* Indexed multidraw. */
1288          unsigned index_bias0 = data[3];
1289          bool index_bias_same = true;
1290
1291          /* If we invoke the translate path, we have to split the multidraw. */
1292          if (incompatible_vb_mask ||
1293              mgr->ve->incompatible_elem_mask) {
1294             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1295                                            indirect->stride, draw_count);
1296             free(data);
1297             return;
1298          }
1299
1300          /* See if index_bias is the same for all draws. */
1301          for (unsigned i = 1; i < draw_count; i++) {
1302             if (data[i * indirect->stride / 4 + 3] != index_bias0) {
1303                index_bias_same = false;
1304                break;
1305             }
1306          }
1307
1308          /* Split the multidraw if index_bias is different. */
1309          if (!index_bias_same) {
1310             u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
1311                                            indirect->stride, draw_count);
1312             free(data);
1313             return;
1314          }
1315
1316          /* If we don't need to use the translate path and index_bias is
1317           * the same, we can process the multidraw with the time complexity
1318           * equal to 1 draw call (except for the index range computation).
1319           * We only need to compute the index range covering all draw calls
1320           * of the multidraw.
1321           *
1322           * The driver will not look at these values because indirect != NULL.
1323           * These values determine the user buffer bounds to upload.
1324           */
1325          new_info.index_bias = index_bias0;
1326          new_info.min_index = ~0u;
1327          new_info.max_index = 0;
1328          new_info.start_instance = ~0u;
1329          unsigned end_instance = 0;
1330
1331          struct pipe_transfer *transfer = NULL;
1332          const uint8_t *indices;
1333
1334          if (info->has_user_indices) {
1335             indices = (uint8_t*)info->index.user;
1336          } else {
1337             indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
1338                                                 PIPE_TRANSFER_READ, &transfer);
1339          }
1340
1341          for (unsigned i = 0; i < draw_count; i++) {
1342             unsigned offset = i * indirect->stride / 4;
1343             unsigned start = data[offset + 2];
1344             unsigned count = data[offset + 0];
1345             unsigned start_instance = data[offset + 4];
1346             unsigned instance_count = data[offset + 1];
1347
1348             if (!count || !instance_count)
1349                continue;
1350
1351             /* Update the ranges of instances. */
1352             new_info.start_instance = MIN2(new_info.start_instance,
1353                                            start_instance);
1354             end_instance = MAX2(end_instance, start_instance + instance_count);
1355
1356             /* Update the index range. */
1357             unsigned min, max;
1358             new_info.count = count; /* only used by get_minmax_index */
1359             u_vbuf_get_minmax_index_mapped(&new_info,
1360                                            indices +
1361                                            new_info.index_size * start,
1362                                            &min, &max);
1363
1364             new_info.min_index = MIN2(new_info.min_index, min);
1365             new_info.max_index = MAX2(new_info.max_index, max);
1366          }
1367          free(data);
1368
1369          if (transfer)
1370             pipe_buffer_unmap(pipe, transfer);
1371
1372          /* Set the final instance count. */
1373          new_info.instance_count = end_instance - new_info.start_instance;
1374
1375          if (new_info.start_instance == ~0u || !new_info.instance_count)
1376             return;
1377       } else {
1378          /* Non-indexed multidraw.
1379           *
1380           * Keep the draw call indirect and compute minimums & maximums,
1381           * which will determine the user buffer bounds to upload, but
1382           * the driver will not look at these values because indirect != NULL.
1383           *
1384           * This efficiently processes the multidraw with the time complexity
1385           * equal to 1 draw call.
1386           */
1387          new_info.start = ~0u;
1388          new_info.start_instance = ~0u;
1389          unsigned end_vertex = 0;
1390          unsigned end_instance = 0;
1391
1392          for (unsigned i = 0; i < draw_count; i++) {
1393             unsigned offset = i * indirect->stride / 4;
1394             unsigned start = data[offset + 2];
1395             unsigned count = data[offset + 0];
1396             unsigned start_instance = data[offset + 3];
1397             unsigned instance_count = data[offset + 1];
1398
1399             new_info.start = MIN2(new_info.start, start);
1400             new_info.start_instance = MIN2(new_info.start_instance,
1401                                            start_instance);
1402
1403             end_vertex = MAX2(end_vertex, start + count);
1404             end_instance = MAX2(end_instance, start_instance + instance_count);
1405          }
1406          free(data);
1407
1408          /* Set the final counts. */
1409          new_info.count = end_vertex - new_info.start;
1410          new_info.instance_count = end_instance - new_info.start_instance;
1411
1412          if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
1413             return;
1414       }
1415    }
1416
1417    if (new_info.index_size) {
1418       /* See if anything needs to be done for per-vertex attribs. */
1419       if (u_vbuf_need_minmax_index(mgr)) {
1420          unsigned max_index;
1421
1422          if (new_info.max_index != ~0u) {
1423             min_index = new_info.min_index;
1424             max_index = new_info.max_index;
1425          } else {
1426             u_vbuf_get_minmax_index(mgr->pipe, &new_info,
1427                                     &min_index, &max_index);
1428          }
1429
1430          assert(min_index <= max_index);
1431
1432          start_vertex = min_index + new_info.index_bias;
1433          num_vertices = max_index + 1 - min_index;
1434
1435          /* Primitive restart doesn't work when unrolling indices.
1436           * We would have to break this drawing operation into several ones. */
1437          /* Use some heuristic to see if unrolling indices improves
1438           * performance. */
1439          if (!info->indirect &&
1440              !new_info.primitive_restart &&
1441              util_is_vbo_upload_ratio_too_large(new_info.count, num_vertices) &&
1442              !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
1443             unroll_indices = TRUE;
1444             user_vb_mask &= ~(mgr->nonzero_stride_vb_mask &
1445                               mgr->ve->noninstance_vb_mask_any);
1446          }
1447       } else {
1448          /* Nothing to do for per-vertex attribs. */
1449          start_vertex = 0;
1450          num_vertices = 0;
1451          min_index = 0;
1452       }
1453    } else {
1454       start_vertex = new_info.start;
1455       num_vertices = new_info.count;
1456       min_index = 0;
1457    }
1458
1459    /* Translate vertices with non-native layouts or formats. */
1460    if (unroll_indices ||
1461        incompatible_vb_mask ||
1462        mgr->ve->incompatible_elem_mask) {
1463       if (!u_vbuf_translate_begin(mgr, &new_info, start_vertex, num_vertices,
1464                                   min_index, unroll_indices)) {
1465          debug_warn_once("u_vbuf_translate_begin() failed");
1466          return;
1467       }
1468
1469       if (unroll_indices) {
1470          new_info.index_size = 0;
1471          new_info.index_bias = 0;
1472          new_info.min_index = 0;
1473          new_info.max_index = new_info.count - 1;
1474          new_info.start = 0;
1475       }
1476
1477       user_vb_mask &= ~(incompatible_vb_mask |
1478                         mgr->ve->incompatible_vb_mask_all);
1479    }
1480
1481    /* Upload user buffers. */
1482    if (user_vb_mask) {
1483       if (u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
1484                                 new_info.start_instance,
1485                                 new_info.instance_count) != PIPE_OK) {
1486          debug_warn_once("u_vbuf_upload_buffers() failed");
1487          return;
1488       }
1489
1490       mgr->dirty_real_vb_mask |= user_vb_mask;
1491    }
1492
1493    /*
1494    if (unroll_indices) {
1495       printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
1496              start_vertex, num_vertices);
1497       util_dump_draw_info(stdout, info);
1498       printf("\n");
1499    }
1500
1501    unsigned i;
1502    for (i = 0; i < mgr->nr_vertex_buffers; i++) {
1503       printf("input %i: ", i);
1504       util_dump_vertex_buffer(stdout, mgr->vertex_buffer+i);
1505       printf("\n");
1506    }
1507    for (i = 0; i < mgr->nr_real_vertex_buffers; i++) {
1508       printf("real %i: ", i);
1509       util_dump_vertex_buffer(stdout, mgr->real_vertex_buffer+i);
1510       printf("\n");
1511    }
1512    */
1513
1514    u_upload_unmap(pipe->stream_uploader);
1515    u_vbuf_set_driver_vertex_buffers(mgr);
1516
1517    pipe->draw_vbo(pipe, &new_info);
1518
1519    if (mgr->using_translate) {
1520       u_vbuf_translate_end(mgr);
1521    }
1522 }
1523
1524 void u_vbuf_save_vertex_elements(struct u_vbuf *mgr)
1525 {
1526    assert(!mgr->ve_saved);
1527    mgr->ve_saved = mgr->ve;
1528 }
1529
1530 void u_vbuf_restore_vertex_elements(struct u_vbuf *mgr)
1531 {
1532    if (mgr->ve != mgr->ve_saved) {
1533       struct pipe_context *pipe = mgr->pipe;
1534
1535       mgr->ve = mgr->ve_saved;
1536       pipe->bind_vertex_elements_state(pipe,
1537                                        mgr->ve ? mgr->ve->driver_cso : NULL);
1538    }
1539    mgr->ve_saved = NULL;
1540 }
1541
1542 void u_vbuf_save_vertex_buffer0(struct u_vbuf *mgr)
1543 {
1544    pipe_vertex_buffer_reference(&mgr->vertex_buffer0_saved,
1545                                 &mgr->vertex_buffer[0]);
1546 }
1547
1548 void u_vbuf_restore_vertex_buffer0(struct u_vbuf *mgr)
1549 {
1550    u_vbuf_set_vertex_buffers(mgr, 0, 1, &mgr->vertex_buffer0_saved);
1551    pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
1552 }