i965: Use unreachable() instead of unconditional assert().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_curbe.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33
34 #include "main/glheader.h"
35 #include "main/context.h"
36 #include "main/macros.h"
37 #include "main/enums.h"
38 #include "program/prog_parameter.h"
39 #include "program/prog_print.h"
40 #include "program/prog_statevars.h"
41 #include "intel_batchbuffer.h"
42 #include "brw_context.h"
43 #include "brw_defines.h"
44 #include "brw_state.h"
45 #include "brw_util.h"
46
47
48 /**
49 * Partition the CURBE between the various users of constant values:
50 * Note that vertex and fragment shaders can now fetch constants out
51 * of constant buffers. We no longer allocatea block of the GRF for
52 * constants. That greatly reduces the demand for space in the CURBE.
53 * Some of the comments within are dated...
54 */
55 static void calculate_curbe_offsets( struct brw_context *brw )
56 {
57 struct gl_context *ctx = &brw->ctx;
58 /* CACHE_NEW_WM_PROG */
59 const GLuint nr_fp_regs = (brw->wm.prog_data->base.nr_params + 15) / 16;
60
61 /* BRW_NEW_VERTEX_PROGRAM */
62 const GLuint nr_vp_regs = (brw->vs.prog_data->base.base.nr_params + 15) / 16;
63 GLuint nr_clip_regs = 0;
64 GLuint total_regs;
65
66 /* _NEW_TRANSFORM */
67 if (ctx->Transform.ClipPlanesEnabled) {
68 GLuint nr_planes = 6 + _mesa_bitcount_64(ctx->Transform.ClipPlanesEnabled);
69 nr_clip_regs = (nr_planes * 4 + 15) / 16;
70 }
71
72
73 total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
74
75 /* This can happen - what to do? Probably rather than falling
76 * back, the best thing to do is emit programs which code the
77 * constants as immediate values. Could do this either as a static
78 * cap on WM and VS, or adaptively.
79 *
80 * Unfortunately, this is currently dependent on the results of the
81 * program generation process (in the case of wm), so this would
82 * introduce the need to re-generate programs in the event of a
83 * curbe allocation failure.
84 */
85 /* Max size is 32 - just large enough to
86 * hold the 128 parameters allowed by
87 * the fragment and vertex program
88 * api's. It's not clear what happens
89 * when both VP and FP want to use 128
90 * parameters, though.
91 */
92 assert(total_regs <= 32);
93
94 /* Lazy resize:
95 */
96 if (nr_fp_regs > brw->curbe.wm_size ||
97 nr_vp_regs > brw->curbe.vs_size ||
98 nr_clip_regs != brw->curbe.clip_size ||
99 (total_regs < brw->curbe.total_size / 4 &&
100 brw->curbe.total_size > 16)) {
101
102 GLuint reg = 0;
103
104 /* Calculate a new layout:
105 */
106 reg = 0;
107 brw->curbe.wm_start = reg;
108 brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
109 brw->curbe.clip_start = reg;
110 brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
111 brw->curbe.vs_start = reg;
112 brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
113 brw->curbe.total_size = reg;
114
115 if (0)
116 fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
117 brw->curbe.wm_start,
118 brw->curbe.wm_size,
119 brw->curbe.clip_start,
120 brw->curbe.clip_size,
121 brw->curbe.vs_start,
122 brw->curbe.vs_size );
123
124 brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
125 }
126 }
127
128
/* State atom: recompute the CURBE partition whenever the bound vertex
 * program, the compiled fragment program, or the enabled clip planes
 * change (each alters one of the three slot sizes).
 */
const struct brw_tracked_state brw_curbe_offsets = {
   .dirty = {
      .mesa = _NEW_TRANSFORM,
      .brw  = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_CONTEXT,
      .cache = CACHE_NEW_WM_PROG
   },
   .emit = calculate_curbe_offsets
};
137
138
139
140
141 /* Define the number of curbes within CS's urb allocation. Multiple
142 * urb entries -> multiple curbes. These will be used by
143 * fixed-function hardware in a double-buffering scheme to avoid a
144 * pipeline stall each time the contents of the curbe is changed.
145 */
146 void brw_upload_cs_urb_state(struct brw_context *brw)
147 {
148 BEGIN_BATCH(2);
149 /* It appears that this is the state packet for the CS unit, ie. the
150 * urb entries detailed here are housed in the CS range from the
151 * URB_FENCE command.
152 */
153 OUT_BATCH(CMD_CS_URB_STATE << 16 | (2-2));
154
155 /* BRW_NEW_URB_FENCE */
156 if (brw->urb.csize == 0) {
157 OUT_BATCH(0);
158 } else {
159 /* BRW_NEW_URB_FENCE */
160 assert(brw->urb.nr_cs_entries);
161 OUT_BATCH((brw->urb.csize - 1) << 4 | brw->urb.nr_cs_entries);
162 }
163 ADVANCE_BATCH();
164 }
165
166 static GLfloat fixed_plane[6][4] = {
167 { 0, 0, -1, 1 },
168 { 0, 0, 1, 1 },
169 { 0, -1, 0, 1 },
170 { 0, 1, 0, 1 },
171 {-1, 0, 0, 1 },
172 { 1, 0, 0, 1 }
173 };
174
/* Upload a new set of constants.  Too much variability to go into the
 * cache mechanism, but maybe would benefit from a comparison against
 * the current uploaded set of constants.
 */
static void
brw_upload_constant_buffer(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* sz is the CURBE size in 16-float registers (laid out by
    * calculate_curbe_offsets()); bufsz is the same size in bytes.
    */
   const GLuint sz = brw->curbe.total_size;
   const GLuint bufsz = sz * 16 * sizeof(GLfloat);
   GLfloat *buf;
   GLuint i;
   gl_clip_plane *clip_planes;

   if (sz == 0) {
      /* No constants at all: skip assembling a buffer, but still emit
       * the CONST_BUFFER packet below with a zero payload.
       */
      brw->curbe.last_bufsz = 0;
      goto emit;
   }

   /* CPU-side scratch buffer assembled first, then compared against /
    * copied into the CURBE bo below.
    */
   buf = brw->curbe.next_buf;

   /* fragment shader constants */
   if (brw->curbe.wm_size) {
      /* Start of the WM slot, in floats (16 floats per register). */
      GLuint offset = brw->curbe.wm_start * 16;

      /* copy float constants: each param[] entry points at one float */
      for (i = 0; i < brw->wm.prog_data->base.nr_params; i++) {
         buf[offset + i] = *brw->wm.prog_data->base.param[i];
      }
   }

   /* clipper constants */
   if (brw->curbe.clip_size) {
      GLuint offset = brw->curbe.clip_start * 16;
      GLuint j;

      /* If any planes are going this way, send them all this way:
       * the six fixed view-volume planes always fill the first six
       * vec4 slots.
       */
      for (i = 0; i < 6; i++) {
         buf[offset + i * 4 + 0] = fixed_plane[i][0];
         buf[offset + i * 4 + 1] = fixed_plane[i][1];
         buf[offset + i * 4 + 2] = fixed_plane[i][2];
         buf[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
       * clip-space.  Note that i deliberately carries on from 6 above,
       * so enabled user planes are packed densely after the fixed ones.
       */
      clip_planes = brw_select_clip_planes(ctx);
      for (j = 0; j < MAX_CLIP_PLANES; j++) {
         if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
            buf[offset + i * 4 + 0] = clip_planes[j][0];
            buf[offset + i * 4 + 1] = clip_planes[j][1];
            buf[offset + i * 4 + 2] = clip_planes[j][2];
            buf[offset + i * 4 + 3] = clip_planes[j][3];
            i++;
         }
      }
   }

   /* vertex shader constants */
   if (brw->curbe.vs_size) {
      GLuint offset = brw->curbe.vs_start * 16;

      for (i = 0; i < brw->vs.prog_data->base.base.nr_params; i++) {
         buf[offset + i] = *brw->vs.prog_data->base.base.param[i];
      }
   }

   /* Debug dump of the assembled constants (disabled). */
   if (0) {
      for (i = 0; i < sz*16; i+=4)
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 buf[i+0], buf[i+1], buf[i+2], buf[i+3]);

      fprintf(stderr, "last_buf %p buf %p sz %d/%d cmp %d\n",
              brw->curbe.last_buf, buf,
              bufsz, brw->curbe.last_bufsz,
              brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
   }

   if (brw->curbe.curbe_bo != NULL &&
       bufsz == brw->curbe.last_bufsz &&
       memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
      /* constants have not changed */
   } else {
      /* Update the record of what our last set of constants was. We
       * don't just flip the pointers because we don't fill in the
       * data in the padding between the entries.
       */
      memcpy(brw->curbe.last_buf, buf, bufsz);
      brw->curbe.last_bufsz = bufsz;

      /* Out of room in the current bo?  Drop it and allocate a fresh
       * one below rather than overwrite entries the GPU may still be
       * reading.
       */
      if (brw->curbe.curbe_bo != NULL &&
          brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)
      {
         drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo);
         drm_intel_bo_unreference(brw->curbe.curbe_bo);
         brw->curbe.curbe_bo = NULL;
      }

      if (brw->curbe.curbe_bo == NULL) {
         /* Allocate a single page for CURBE entries for this batchbuffer.
          * They're generally around 64b.  (64-byte alignment matches the
          * sub-allocation alignment below.)
          */
         brw->curbe.curbe_bo = drm_intel_bo_alloc(brw->bufmgr, "CURBE",
                                                  4096, 1 << 6);
         brw->curbe.curbe_next_offset = 0;
         drm_intel_gem_bo_map_gtt(brw->curbe.curbe_bo);
         assert(bufsz < 4096);
      }

      /* Sub-allocate this upload from the bo, keeping the next offset
       * 64-byte aligned.
       */
      brw->curbe.curbe_offset = brw->curbe.curbe_next_offset;
      brw->curbe.curbe_next_offset += bufsz;
      brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64);

      /* Copy data to the buffer:
       */
      memcpy(brw->curbe.curbe_bo->virtual + brw->curbe.curbe_offset,
             buf,
             bufsz);
   }

   /* Because this provokes an action (ie copy the constants into the
    * URB), it shouldn't be shortcircuited if identical to the
    * previous time - because eg. the urb destination may have
    * changed, or the urb contents different to last time.
    *
    * Note that the data referred to is actually copied internally,
    * not just used in place according to passed pointer.
    *
    * It appears that the CS unit takes care of using each available
    * URB entry (Const URB Entry == CURBE) in turn, and issuing
    * flushes as necessary when doublebuffering of CURBEs isn't
    * possible.
    */

emit:
   BEGIN_BATCH(2);
   if (brw->curbe.total_size == 0) {
      OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
      OUT_BATCH(0);
   } else {
      /* (1 << 8) appears to be the "buffer valid" flag of the packet
       * header; the length field (total_size - 1) rides in the low bits
       * of the relocated address, which is safe because the bo offset
       * is 64-byte aligned.  NOTE(review): verify against the 965 PRM
       * CONSTANT_BUFFER packet description.
       */
      OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
      OUT_RELOC(brw->curbe.curbe_bo,
                I915_GEM_DOMAIN_INSTRUCTION, 0,
                (brw->curbe.total_size - 1) + brw->curbe.curbe_offset);
   }
   ADVANCE_BATCH();
}
324
/* State atom: rebuild and re-upload the CURBE contents whenever program
 * constants, the bound programs, the CURBE layout, or the batchbuffer
 * change.
 */
const struct brw_tracked_state brw_constant_buffer = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
               BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
               BRW_NEW_CURBE_OFFSETS |
               BRW_NEW_BATCH),
      .cache = (CACHE_NEW_WM_PROG)
   },
   .emit = brw_upload_constant_buffer,
};
338