2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
32 #include "util/u_memory.h"
33 #include "util/u_math.h"
35 #include "brw_batchbuffer.h"
36 #include "brw_context.h"
37 #include "brw_defines.h"
38 #include "brw_state.h"
40 #include "brw_debug.h"
41 #include "brw_screen.h"
45 * Partition the CURBE between the various users of constant values:
46 * Note that vertex and fragment shaders can now fetch constants out
47 of constant buffers. We no longer allocate a block of the GRF for
48 * constants. That greatly reduces the demand for space in the CURBE.
49 * Some of the comments within are dated...
51 static int calculate_curbe_offsets( struct brw_context
*brw
)
53 /* CACHE_NEW_WM_PROG */
54 const GLuint nr_fp_regs
= (brw
->wm
.prog_data
->nr_params
+ 15) / 16;
56 /* BRW_NEW_VERTEX_PROGRAM */
57 const GLuint nr_vp_regs
= (brw
->vs
.prog_data
->nr_params
+ 15) / 16;
58 GLuint nr_clip_regs
= 0;
62 if (brw
->curr
.ucp
.nr
) {
63 GLuint nr_planes
= 6 + brw
->curr
.ucp
.nr
;
64 nr_clip_regs
= (nr_planes
* 4 + 15) / 16;
68 total_regs
= nr_fp_regs
+ nr_vp_regs
+ nr_clip_regs
;
70 /* When this is > 32, want to use a true constant buffer to hold
71 * the extra constants.
73 assert(total_regs
<= 32);
77 if (nr_fp_regs
> brw
->curbe
.wm_size
||
78 nr_vp_regs
> brw
->curbe
.vs_size
||
79 nr_clip_regs
!= brw
->curbe
.clip_size
||
80 (total_regs
< brw
->curbe
.total_size
/ 4 &&
81 brw
->curbe
.total_size
> 16)) {
85 /* Calculate a new layout:
88 brw
->curbe
.wm_start
= reg
;
89 brw
->curbe
.wm_size
= nr_fp_regs
; reg
+= nr_fp_regs
;
90 brw
->curbe
.clip_start
= reg
;
91 brw
->curbe
.clip_size
= nr_clip_regs
; reg
+= nr_clip_regs
;
92 brw
->curbe
.vs_start
= reg
;
93 brw
->curbe
.vs_size
= nr_vp_regs
; reg
+= nr_vp_regs
;
94 brw
->curbe
.total_size
= reg
;
96 if (BRW_DEBUG
& DEBUG_CURBE
)
97 debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
100 brw
->curbe
.clip_start
,
101 brw
->curbe
.clip_size
,
103 brw
->curbe
.vs_size
);
105 brw
->state
.dirty
.brw
|= BRW_NEW_CURBE_OFFSETS
;
112 const struct brw_tracked_state brw_curbe_offsets
= {
114 .mesa
= PIPE_NEW_CLIP
,
115 .brw
= BRW_NEW_VERTEX_PROGRAM
,
116 .cache
= CACHE_NEW_WM_PROG
118 .prepare
= calculate_curbe_offsets
124 /* Define the number of curbes within CS's urb allocation. Multiple
125 * urb entries -> multiple curbes. These will be used by
126 * fixed-function hardware in a double-buffering scheme to avoid a
127 * pipeline stall each time the contents of the curbe is changed.
129 int brw_upload_cs_urb_state(struct brw_context
*brw
)
131 struct brw_cs_urb_state cs_urb
;
132 memset(&cs_urb
, 0, sizeof(cs_urb
));
134 /* It appears that this is the state packet for the CS unit, ie. the
135 * urb entries detailed here are housed in the CS range from the
138 cs_urb
.header
.opcode
= CMD_CS_URB_STATE
;
139 cs_urb
.header
.length
= sizeof(cs_urb
)/4 - 2;
141 /* BRW_NEW_URB_FENCE */
142 cs_urb
.bits0
.nr_urb_entries
= brw
->urb
.nr_cs_entries
;
143 cs_urb
.bits0
.urb_entry_size
= brw
->urb
.csize
- 1;
145 assert(brw
->urb
.nr_cs_entries
);
146 BRW_CACHED_BATCH_STRUCT(brw
, &cs_urb
);
150 static GLfloat fixed_plane
[6][4] = {
159 /* Upload a new set of constants. Too much variability to go into the
160 * cache mechanism, but maybe would benefit from a comparison against
161 * the current uploaded set of constants.
163 static enum pipe_error
prepare_curbe_buffer(struct brw_context
*brw
)
165 const GLuint sz
= brw
->curbe
.total_size
;
166 const GLuint bufsz
= sz
* 16 * sizeof(GLfloat
);
172 if (brw
->curbe
.last_buf
) {
173 free(brw
->curbe
.last_buf
);
174 brw
->curbe
.last_buf
= NULL
;
175 brw
->curbe
.last_bufsz
= 0;
180 buf
= (GLfloat
*) CALLOC(bufsz
, 1);
182 /* fragment shader constants */
183 if (brw
->curbe
.wm_size
) {
184 GLuint offset
= brw
->curbe
.wm_start
* 16;
186 /* map fs constant buffer */
188 /* copy float constants */
189 for (i
= 0; i
< brw
->wm
.prog_data
->nr_params
; i
++)
190 buf
[offset
+ i
] = *brw
->wm
.prog_data
->param
[i
];
192 /* unmap fs constant buffer */
196 /* The clipplanes are actually delivered to both CLIP and VS units.
197 * VS uses them to calculate the outcode bitmasks.
199 if (brw
->curbe
.clip_size
) {
200 GLuint offset
= brw
->curbe
.clip_start
* 16;
203 /* If any planes are going this way, send them all this way:
205 for (i
= 0; i
< 6; i
++) {
206 buf
[offset
+ i
* 4 + 0] = fixed_plane
[i
][0];
207 buf
[offset
+ i
* 4 + 1] = fixed_plane
[i
][1];
208 buf
[offset
+ i
* 4 + 2] = fixed_plane
[i
][2];
209 buf
[offset
+ i
* 4 + 3] = fixed_plane
[i
][3];
214 assert(brw
->curr
.ucp
.nr
<= 6);
215 for (j
= 0; j
< brw
->curr
.ucp
.nr
; j
++) {
216 buf
[offset
+ i
* 4 + 0] = brw
->curr
.ucp
.ucp
[j
][0];
217 buf
[offset
+ i
* 4 + 1] = brw
->curr
.ucp
.ucp
[j
][1];
218 buf
[offset
+ i
* 4 + 2] = brw
->curr
.ucp
.ucp
[j
][2];
219 buf
[offset
+ i
* 4 + 3] = brw
->curr
.ucp
.ucp
[j
][3];
224 /* vertex shader constants */
225 if (brw
->curbe
.vs_size
) {
226 GLuint offset
= brw
->curbe
.vs_start
* 16;
227 GLuint nr
= brw
->curr
.vertex_shader
->info
.file_max
[TGSI_FILE_CONSTANT
];
228 struct pipe_screen
*screen
= brw
->base
.screen
;
230 const GLfloat
*value
= screen
->buffer_map( screen
,
231 brw
->curr
.vertex_constants
,
232 PIPE_BUFFER_USAGE_CPU_READ
);
234 /* XXX: what if user's constant buffer is too small?
236 memcpy(&buf
[offset
], value
, nr
* 4 * sizeof(float));
238 screen
->buffer_unmap( screen
, brw
->curr
.vertex_constants
);
241 if (BRW_DEBUG
& DEBUG_CURBE
) {
242 for (i
= 0; i
< sz
*16; i
+=4)
243 debug_printf("curbe %d.%d: %f %f %f %f\n", i
/8, i
&4,
244 buf
[i
+0], buf
[i
+1], buf
[i
+2], buf
[i
+3]);
246 debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
247 (void *)brw
->curbe
.last_buf
, (void *)buf
,
248 bufsz
, brw
->curbe
.last_bufsz
,
249 brw
->curbe
.last_buf
? memcmp(buf
, brw
->curbe
.last_buf
, bufsz
) : -1);
252 if (brw
->curbe
.curbe_bo
!= NULL
&&
253 brw
->curbe
.last_buf
&&
254 bufsz
== brw
->curbe
.last_bufsz
&&
255 memcmp(buf
, brw
->curbe
.last_buf
, bufsz
) == 0) {
256 /* constants have not changed */
260 /* constants have changed */
261 if (brw
->curbe
.last_buf
)
262 FREE(brw
->curbe
.last_buf
);
264 brw
->curbe
.last_buf
= buf
;
265 brw
->curbe
.last_bufsz
= bufsz
;
267 if (brw
->curbe
.curbe_bo
!= NULL
&&
268 (brw
->curbe
.need_new_bo
||
269 brw
->curbe
.curbe_next_offset
+ bufsz
> brw
->curbe
.curbe_bo
->size
))
271 bo_reference(&brw
->curbe
.curbe_bo
, NULL
);
274 if (brw
->curbe
.curbe_bo
== NULL
) {
275 /* Allocate a single page for CURBE entries for this batchbuffer.
276 * They're generally around 64b.
278 ret
= brw
->sws
->bo_alloc(brw
->sws
,
279 BRW_BUFFER_TYPE_CURBE
,
281 &brw
->curbe
.curbe_bo
);
285 brw
->curbe
.curbe_next_offset
= 0;
288 brw
->curbe
.curbe_offset
= brw
->curbe
.curbe_next_offset
;
289 brw
->curbe
.curbe_next_offset
+= bufsz
;
290 brw
->curbe
.curbe_next_offset
= align(brw
->curbe
.curbe_next_offset
, 64);
292 /* Copy data to the buffer:
294 brw
->sws
->bo_subdata(brw
->curbe
.curbe_bo
,
295 brw
->curbe
.curbe_offset
,
301 brw_add_validated_bo(brw
, brw
->curbe
.curbe_bo
);
303 /* Because this provokes an action (ie copy the constants into the
304 * URB), it shouldn't be shortcircuited if identical to the
305 * previous time - because eg. the urb destination may have
306 * changed, or the urb contents different to last time.
308 * Note that the data referred to is actually copied internally,
309 * not just used in place according to passed pointer.
311 * It appears that the CS unit takes care of using each available
312 * URB entry (Const URB Entry == CURBE) in turn, and issuing
313 * flushes as necessary when doublebuffering of CURBEs isn't
320 static enum pipe_error
emit_curbe_buffer(struct brw_context
*brw
)
322 GLuint sz
= brw
->curbe
.total_size
;
324 BEGIN_BATCH(2, IGNORE_CLIPRECTS
);
326 OUT_BATCH((CMD_CONST_BUFFER
<< 16) | (2 - 2));
329 OUT_BATCH((CMD_CONST_BUFFER
<< 16) | (1 << 8) | (2 - 2));
330 OUT_RELOC(brw
->curbe
.curbe_bo
,
332 (sz
- 1) + brw
->curbe
.curbe_offset
);
338 const struct brw_tracked_state brw_curbe_buffer
= {
340 .mesa
= (PIPE_NEW_FRAGMENT_CONSTANTS
|
341 PIPE_NEW_VERTEX_CONSTANTS
|
343 .brw
= (BRW_NEW_FRAGMENT_PROGRAM
|
344 BRW_NEW_VERTEX_PROGRAM
|
345 BRW_NEW_URB_FENCE
| /* Implicit - hardware requires this, not used above */
346 BRW_NEW_PSP
| /* Implicit - hardware requires this, not used above */
347 BRW_NEW_CURBE_OFFSETS
|
349 .cache
= (CACHE_NEW_WM_PROG
)
351 .prepare
= prepare_curbe_buffer
,
352 .emit
= emit_curbe_buffer
,