src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "common/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #include "brw_draw.h"
  38 #include "brw_multisample_state.h"
  39 #include "brw_state.h"
  40 #include "brw_wm.h"
  41 #include "brw_util.h"
  42
  43 #include "intel_batchbuffer.h"
  44 #include "intel_buffer_objects.h"
  45 #include "intel_fbo.h"
  46
  47 #include "main/enums.h"
  48 #include "main/fbobject.h"
  49 #include "main/framebuffer.h"
  50 #include "main/glformats.h"
  51 #include "main/samplerobj.h"
  52 #include "main/shaderapi.h"
  53 #include "main/stencil.h"
  54 #include "main/transformfeedback.h"
  55 #include "main/varray.h"
  56 #include "main/viewport.h"
  57 #include "util/half_float.h"
  58
  59 UNUSED static void *
  60 emit_dwords(struct brw_context *brw, unsigned n)
  61 {
  62    intel_batchbuffer_begin(brw, n, RENDER_RING);
  63    uint32_t *map = brw->batch.map_next;
  64    brw->batch.map_next += n;
  65    intel_batchbuffer_advance(brw);
  66    return map;
  67 }
  68
/**
 * Address value used by this file's packing code: an offset that is
 * optionally relative to a buffer object.  Resolved into the final value
 * by __gen_combine_address().
 */
struct brw_address {
   /* Buffer the address refers to.  When NULL, __gen_combine_address()
    * returns offset + delta directly without calling a reloc helper.
    */
   struct brw_bo *bo;
   /* Flags forwarded to brw_state_reloc() / brw_batch_reloc(). */
   unsigned reloc_flags;
   /* Offset that __gen_combine_address() adds its delta to. */
   uint32_t offset;
};

/* Address type and user-data type hooks.  Presumably consumed by
 * genxml/genX_pack.h, which is included further down - TODO confirm.
 */
#define __gen_address_type struct brw_address
#define __gen_user_data struct brw_context
  77
  78 static uint64_t
  79 __gen_combine_address(struct brw_context *brw, void *location,
  80                       struct brw_address address, uint32_t delta)
  81 {
  82    struct intel_batchbuffer *batch = &brw->batch;
  83    uint32_t offset;
  84
  85    if (address.bo == NULL) {
  86       return address.offset + delta;
  87    } else {
  88       if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
  89          offset = (char *) location - (char *) brw->batch.state.map;
  90          return brw_state_reloc(batch, offset, address.bo,
  91                                 address.offset + delta,
  92                                 address.reloc_flags);
  93       }
  94
  95       assert(!brw_ptr_in_state_buffer(batch, location));
  96
  97       offset = (char *) location - (char *) brw->batch.batch.map;
  98       return brw_batch_reloc(batch, offset, address.bo,
  99                              address.offset + delta,
 100                              address.reloc_flags);
 101    }
 102 }
 103
 104 static struct brw_address
 105 rw_bo(struct brw_bo *bo, uint32_t offset)
 106 {
 107    return (struct brw_address) {
 108             .bo = bo,
 109             .offset = offset,
 110             .reloc_flags = RELOC_WRITE,
 111    };
 112 }
 113
 114 static struct brw_address
 115 ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120    };
 121 }
 122
 123 UNUSED static struct brw_address
 124 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 125 {
 126    return (struct brw_address) {
 127             .bo = bo,
 128             .offset = offset,
 129             .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
 130    };
 131 }
 132
 133 #if GEN_GEN == 4
 134 static struct brw_address
 135 KSP(struct brw_context *brw, uint32_t offset)
 136 {
 137    return ro_bo(brw->cache.bo, offset);
 138 }
 139 #else
 140 static uint32_t
 141 KSP(struct brw_context *brw, uint32_t offset)
 142 {
 143    return offset;
 144 }
 145 #endif
 146
 147 #include "genxml/genX_pack.h"
 148
/* Token-pasting helpers: each appends a suffix to a command name to form
 * the corresponding identifier (<cmd>_length, <cmd>_length_bias,
 * <cmd>_header, <cmd>_pack).  Presumably those identifiers come from
 * genxml/genX_pack.h - TODO confirm.
 */
#define _brw_cmd_length(cmd) cmd ## _length
#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
#define _brw_cmd_header(cmd) cmd ## _header
#define _brw_cmd_pack(cmd) cmd ## _pack
 153
/**
 * Emit a fixed-length command into the batch.
 *
 * Expands to a for-loop header; the statement or block that follows the
 * macro is the loop body and runs exactly once.  The header declares
 * `name` as a `struct cmd` initialised with the command header and gets
 * _brw_cmd_length(cmd) dwords from emit_dwords().  After the body has
 * filled in `name`, the struct is packed into those dwords and _dst is set
 * to NULL, which ends the loop.
 */
#define brw_batch_emit(brw, cmd, name)                  \
   for (struct cmd name = { _brw_cmd_header(cmd) },     \
        *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);              \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
        _dst = NULL)
 160
 161 #define brw_batch_emitn(brw, cmd, n, ...) ({           \
 162       uint32_t *_dw = emit_dwords(brw, n);             \
 163       struct cmd template = {                          \
 164          _brw_cmd_header(cmd),                         \
 165          .DWordLength = n - _brw_cmd_length_bias(cmd), \
 166          __VA_ARGS__                                   \
 167       };                                               \
 168       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
 169       _dw + 1; /* Array starts at dw[1] */             \
 170    })
 171
/**
 * Emit a state structure into space obtained from brw_state_batch().
 *
 * Expands to a for-loop header; the statement or block that follows the
 * macro is the loop body and runs exactly once.  `name` starts out
 * empty-initialised (no command header, unlike brw_batch_emit), and the
 * space requested is _brw_cmd_length(cmd) * 4 with the given \p align;
 * \p offset is passed straight through to brw_state_batch().  After the
 * body, `name` is packed into that space and _dst is set to NULL, which
 * ends the loop.
 */
#define brw_state_emit(brw, cmd, align, offset, name)              \
   for (struct cmd name = {},                                      \
        *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
                                align, offset);                    \
        __builtin_expect(_dst != NULL, 1);                         \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
        _dst = NULL)
 179
 180 /**
 181  * Polygon stipple packet
 182  */
 183 static void
 184 genX(upload_polygon_stipple)(struct brw_context *brw)
 185 {
 186    struct gl_context *ctx = &brw->ctx;
 187
 188    /* _NEW_POLYGON */
 189    if (!ctx->Polygon.StippleFlag)
 190       return;
 191
 192    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 193       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 194        * row first.  If we're rendering to a window (i.e. the
 195        * default frame buffer object, 0), then we need to invert
 196        * it to match our pixel layout.  But if we're rendering
 197        * to a FBO (i.e. any named frame buffer object), we *don't*
 198        * need to invert - we already match the layout.
 199        */
 200       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 201          for (unsigned i = 0; i < 32; i++)
 202             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 203       } else {
 204          for (unsigned i = 0; i < 32; i++)
 205             poly.PatternRow[i] = ctx->PolygonStipple[i];
 206       }
 207    }
 208 }
 209
 210 static const struct brw_tracked_state genX(polygon_stipple) = {
 211    .dirty = {
 212       .mesa = _NEW_POLYGON |
 213               _NEW_POLYGONSTIPPLE,
 214       .brw = BRW_NEW_CONTEXT,
 215    },
 216    .emit = genX(upload_polygon_stipple),
 217 };
 218
 219 /**
 220  * Polygon stipple offset packet
 221  */
 222 static void
 223 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 224 {
 225    struct gl_context *ctx = &brw->ctx;
 226
 227    /* _NEW_POLYGON */
 228    if (!ctx->Polygon.StippleFlag)
 229       return;
 230
 231    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 232       /* _NEW_BUFFERS
 233        *
 234        * If we're drawing to a system window we have to invert the Y axis
 235        * in order to match the OpenGL pixel coordinate system, and our
 236        * offset must be matched to the window position.  If we're drawing
 237        * to a user-created FBO then our native pixel coordinate system
 238        * works just fine, and there's no window system to worry about.
 239        */
 240       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 241          poly.PolygonStippleYOffset =
 242             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 243       }
 244    }
 245 }
 246
 247 static const struct brw_tracked_state genX(polygon_stipple_offset) = {
 248    .dirty = {
 249       .mesa = _NEW_BUFFERS |
 250               _NEW_POLYGON,
 251       .brw = BRW_NEW_CONTEXT,
 252    },
 253    .emit = genX(upload_polygon_stipple_offset),
 254 };
 255
 256 /**
 257  * Line stipple packet
 258  */
 259 static void
 260 genX(upload_line_stipple)(struct brw_context *brw)
 261 {
 262    struct gl_context *ctx = &brw->ctx;
 263
 264    if (!ctx->Line.StippleFlag)
 265       return;
 266
 267    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 268       line.LineStipplePattern = ctx->Line.StipplePattern;
 269
 270       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 271       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 272    }
 273 }
 274
 275 static const struct brw_tracked_state genX(line_stipple) = {
 276    .dirty = {
 277       .mesa = _NEW_LINE,
 278       .brw = BRW_NEW_CONTEXT,
 279    },
 280    .emit = genX(upload_line_stipple),
 281 };
 282
 283 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 284 static void
 285 genX(upload_drawing_rect)(struct brw_context *brw)
 286 {
 287    struct gl_context *ctx = &brw->ctx;
 288    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 289    const unsigned int fb_width = _mesa_geometric_width(fb);
 290    const unsigned int fb_height = _mesa_geometric_height(fb);
 291
 292    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 293       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 294       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 295    }
 296 }
 297
 298 static const struct brw_tracked_state genX(drawing_rect) = {
 299    .dirty = {
 300       .mesa = _NEW_BUFFERS,
 301       .brw = BRW_NEW_BLORP |
 302              BRW_NEW_CONTEXT,
 303    },
 304    .emit = genX(upload_drawing_rect),
 305 };
 306
 307 static uint32_t *
 308 genX(emit_vertex_buffer_state)(struct brw_context *brw,
 309                                uint32_t *dw,
 310                                unsigned buffer_nr,
 311                                struct brw_bo *bo,
 312                                unsigned start_offset,
 313                                unsigned end_offset,
 314                                unsigned stride,
 315                                unsigned step_rate)
 316 {
 317    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
 318       .VertexBufferIndex = buffer_nr,
 319       .BufferPitch = stride,
 320       .BufferStartingAddress = ro_bo(bo, start_offset),
 321 #if GEN_GEN >= 8
 322       .BufferSize = end_offset - start_offset,
 323 #endif
 324
 325 #if GEN_GEN >= 7
 326       .AddressModifyEnable = true,
 327 #endif
 328
 329 #if GEN_GEN < 8
 330       .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
 331       .InstanceDataStepRate = step_rate,
 332 #if GEN_GEN >= 5
 333       .EndAddress = ro_bo(bo, end_offset - 1),
 334 #endif
 335 #endif
 336
 337 #if GEN_GEN == 10
 338       .VertexBufferMOCS = CNL_MOCS_WB,
 339 #elif GEN_GEN == 9
 340       .VertexBufferMOCS = SKL_MOCS_WB,
 341 #elif GEN_GEN == 8
 342       .VertexBufferMOCS = BDW_MOCS_WB,
 343 #elif GEN_GEN == 7
 344       .VertexBufferMOCS = GEN7_MOCS_L3,
 345 #endif
 346    };
 347
 348    GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
 349    return dw + GENX(VERTEX_BUFFER_STATE_length);
 350 }
 351
 352 UNUSED static bool
 353 is_passthru_format(uint32_t format)
 354 {
 355    switch (format) {
 356    case ISL_FORMAT_R64_PASSTHRU:
 357    case ISL_FORMAT_R64G64_PASSTHRU:
 358    case ISL_FORMAT_R64G64B64_PASSTHRU:
 359    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 360       return true;
 361    default:
 362       return false;
 363    }
 364 }
 365
 366 UNUSED static int
 367 uploads_needed(uint32_t format)
 368 {
 369    if (!is_passthru_format(format))
 370       return 1;
 371
 372    switch (format) {
 373    case ISL_FORMAT_R64_PASSTHRU:
 374    case ISL_FORMAT_R64G64_PASSTHRU:
 375       return 1;
 376    case ISL_FORMAT_R64G64B64_PASSTHRU:
 377    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 378       return 2;
 379    default:
 380       unreachable("not reached");
 381    }
 382 }
 383
 384 /*
 385  * Returns the format that we are finally going to use when upload a vertex
 386  * element. It will only change if we are using *64*PASSTHRU formats, as for
 387  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 388  *
 389  * @upload points in which upload we are. Valid values are [0,1]
 390  */
 391 static uint32_t
 392 downsize_format_if_needed(uint32_t format,
 393                           int upload)
 394 {
 395    assert(upload == 0 || upload == 1);
 396
 397    if (!is_passthru_format(format))
 398       return format;
 399
 400    switch (format) {
 401    case ISL_FORMAT_R64_PASSTHRU:
 402       return ISL_FORMAT_R32G32_FLOAT;
 403    case ISL_FORMAT_R64G64_PASSTHRU:
 404       return ISL_FORMAT_R32G32B32A32_FLOAT;
 405    case ISL_FORMAT_R64G64B64_PASSTHRU:
 406       return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
 407                      : ISL_FORMAT_R32G32_FLOAT;
 408    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 409       return ISL_FORMAT_R32G32B32A32_FLOAT;
 410    default:
 411       unreachable("not reached");
 412    }
 413 }
 414
 415 /*
 416  * Returns the number of componentes associated with a format that is used on
 417  * a 64 to 32 format split. See downsize_format()
 418  */
 419 static int
 420 upload_format_size(uint32_t upload_format)
 421 {
 422    switch (upload_format) {
 423    case ISL_FORMAT_R32G32_FLOAT:
 424       return 2;
 425    case ISL_FORMAT_R32G32B32A32_FLOAT:
 426       return 4;
 427    default:
 428       unreachable("not reached");
 429    }
 430 }
 431
/**
 * Emit the vertex fetch state for the current draw.
 *
 * After calling brw_prepare_vertices() and
 * brw_prepare_shader_draw_parameters(), this emits, in order:
 *
 *  - on Gen8+, 3DSTATE_VF_SGVS, plus a 3DSTATE_VF_INSTANCING for the SGVS
 *    element when the VS uses VertexID or InstanceID;
 *  - 3DSTATE_VERTEX_BUFFERS, when there is at least one buffer;
 *  - 3DSTATE_VERTEX_ELEMENTS;
 *  - on Gen8+, one 3DSTATE_VF_INSTANCING per enabled input, and one more
 *    for the draw ID element when the VS uses it.
 *
 * When no vertex element is needed at all, a single padding element is
 * emitted after the SGVS state and the function returns early, without
 * emitting any vertex buffers.
 */
static void
genX(emit_vertices)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t *dw;

   brw_prepare_vertices(brw);
   brw_prepare_shader_draw_parameters(brw);

#if GEN_GEN < 6
   brw_emit_query_begin(brw);
#endif

   const struct brw_vs_prog_data *vs_prog_data =
      brw_vs_prog_data(brw->vs.base.prog_data);

#if GEN_GEN >= 8
   struct gl_context *ctx = &brw->ctx;
   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
                                ctx->Polygon.BackMode != GL_FILL);

   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      unsigned vue = brw->vb.nr_enabled;

      /* The element for the edge flags must always be last, so we have to
       * insert the SGVS before it in that case.
       */
      if (uses_edge_flag) {
         assert(vue > 0);
         vue--;
      }

      WARN_ONCE(vue >= 33,
                "Trying to insert VID/IID past 33rd vertex element, "
                "need to reorder the vertex attrbutes.");

      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
         if (vs_prog_data->uses_vertexid) {
            vfs.VertexIDEnable = true;
            vfs.VertexIDComponentNumber = 2;
            vfs.VertexIDElementOffset = vue;
         }

         if (vs_prog_data->uses_instanceid) {
            vfs.InstanceIDEnable = true;
            vfs.InstanceIDComponentNumber = 3;
            vfs.InstanceIDElementOffset = vue;
         }
      }

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = true;
         vfi.VertexElementIndex = vue;
      }
   } else {
      /* Neither ID is used: emit the packet with only its header set. */
      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
   }

   /* Normally we don't need an element for the SGVS attribute because the
    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
    * we're using draw parameters then we need an element for those
    * values.  Additionally if there is an edge flag element then the SGVS
    * can't be inserted past that so we need a dummy element to ensure that
    * the edge flag is the last one.
    */
   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
                                    vs_prog_data->uses_baseinstance ||
                                    ((vs_prog_data->uses_instanceid ||
                                      vs_prog_data->uses_vertexid)
                                     && uses_edge_flag));
#else
   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
                                    vs_prog_data->uses_baseinstance ||
                                    vs_prog_data->uses_instanceid ||
                                    vs_prog_data->uses_vertexid);
#endif
   unsigned nr_elements =
      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;

#if GEN_GEN < 8
   /* If any of the formats of vb.enabled needs more than one upload, we need
    * to add it to nr_elements
    */
   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
      struct brw_vertex_element *input = brw->vb.enabled[i];
      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);

      if (uploads_needed(format) > 1)
         nr_elements++;
   }
#endif

   /* If the VS doesn't read any inputs (calculating vertex position from
    * a state variable for some reason, for example), emit a single pad
    * VERTEX_ELEMENT struct and bail.
    *
    * The stale VB state stays in place, but they don't do anything unless
    * a VE loads from them.
    */
   if (nr_elements == 0) {
      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                           1 + GENX(VERTEX_ELEMENT_STATE_length));
      struct GENX(VERTEX_ELEMENT_STATE) elem = {
         .Valid = true,
         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_1_FP,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
      return;
   }

   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
   const bool uses_draw_params =
      vs_prog_data->uses_basevertex ||
      vs_prog_data->uses_baseinstance;
   const unsigned nr_buffers = brw->vb.nr_buffers +
      uses_draw_params + vs_prog_data->uses_drawid;

   if (nr_buffers) {
      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));

      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);

      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
         /* Prior to Haswell and Bay Trail we have to use 4-component formats
          * to fake 3-component ones.  In particular, we do this for
          * half-float and 8 and 16-bit integer formats.  This means that the
          * vertex element may poke over the end of the buffer by 2 bytes.
          */
         const unsigned padding =
            (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
         const unsigned end = buffer->offset + buffer->size + padding;
         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
                                             buffer->offset,
                                             end,
                                             buffer->stride,
                                             buffer->step_rate);
      }

      /* The draw parameters buffer uses index nr_buffers and the draw ID
       * buffer index nr_buffers + 1; the vertex elements emitted further
       * down refer to them by the same indices.
       */
      if (uses_draw_params) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
                                             brw->draw.draw_params_bo,
                                             brw->draw.draw_params_offset,
                                             brw->draw.draw_params_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }

      if (vs_prog_data->uses_drawid) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
                                             brw->draw.draw_id_bo,
                                             brw->draw.draw_id_offset,
                                             brw->draw.draw_id_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }
   }

   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
    * presumably for VertexID/InstanceID.
    */
#if GEN_GEN >= 6
   assert(nr_elements <= 34);
   const struct brw_vertex_element *gen6_edgeflag_input = NULL;
#else
   assert(nr_elements <= 18);
#endif

   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
   /* i is declared outside the loop because the elements emitted after it
    * still use it (DestinationElementOffset on Gen4).
    */
   unsigned i;
   for (i = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
      /* The component controls are per input, not per upload: an override
       * made for one upload carries over to the next upload of this input.
       */
      uint32_t comp0 = VFCOMP_STORE_SRC;
      uint32_t comp1 = VFCOMP_STORE_SRC;
      uint32_t comp2 = VFCOMP_STORE_SRC;
      uint32_t comp3 = VFCOMP_STORE_SRC;
      const unsigned num_uploads = GEN_GEN < 8 ? uploads_needed(format) : 1;

#if GEN_GEN >= 8
      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
       * element which has edge flag enabled."
       */
      assert(!(is_passthru_format(format) && uses_edge_flag));
#endif

      /* The gen4 driver expects edgeflag to come in as a float, and passes
       * that float on to the tests in the clipper.  Mesa's current vertex
       * attribute value for EdgeFlag is stored as a float, which works out.
       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
       * integer ubyte.  Just rewrite that to convert to a float.
       *
       * Gen6+ passes edgeflag as sideband along with the vertex, instead
       * of in the VUE.  We have to upload it sideband as the last vertex
       * element according to the B-Spec.
       */
#if GEN_GEN >= 6
      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
         gen6_edgeflag_input = input;
         continue;
      }
#endif

      for (unsigned c = 0; c < num_uploads; c++) {
         const uint32_t upload_format = GEN_GEN >= 8 ? format :
            downsize_format_if_needed(format, c);
         /* If we need more than one upload, the offset stride would be 128
          * bits (16 bytes), as for previous uploads we are using the full
          * entry. */
         const unsigned offset = input->offset + c * 16;

         const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
            upload_format_size(upload_format) : input->glarray->Size;

         /* Components the source does not provide get a constant.  Each
          * case deliberately falls through to the next one; a size of 4
          * (or more) matches no case and leaves all four at STORE_SRC.
          */
         switch (size) {
            case 0: comp0 = VFCOMP_STORE_0; /* fallthrough */
            case 1: comp1 = VFCOMP_STORE_0; /* fallthrough */
            case 2: comp2 = VFCOMP_STORE_0; /* fallthrough */
            case 3:
               if (GEN_GEN >= 8 && input->glarray->Doubles) {
                  comp3 = VFCOMP_STORE_0;
               } else if (input->glarray->Integer) {
                  comp3 = VFCOMP_STORE_1_INT;
               } else {
                  comp3 = VFCOMP_STORE_1_FP;
               }

               break;
         }

#if GEN_GEN >= 8
         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
          *
          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
          *     formats, 64-bit components are stored in the URB without any
          *     conversion. In this case, vertex elements must be written as 128
          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
          *     component into the URB, Component 1 must be specified as
          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
          *     order to output a 128-bit vertex element, or Components 1-3 must
          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
          *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
          *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
          *     vertex element."
          */
         if (input->glarray->Doubles && !input->is_dual_slot) {
            /* Store vertex elements which correspond to double and dvec2 vertex
             * shader inputs as 128-bit vertex elements, instead of 256-bits.
             */
            comp2 = VFCOMP_NOSTORE;
            comp3 = VFCOMP_NOSTORE;
         }
#endif

         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
            .VertexBufferIndex = input->buffer,
            .Valid = true,
            .SourceElementFormat = upload_format,
            .SourceElementOffset = offset,
            .Component0Control = comp0,
            .Component1Control = comp1,
            .Component2Control = comp2,
            .Component3Control = comp3,
#if GEN_GEN < 5
            .DestinationElementOffset = i * 4,
#endif
         };

         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
         dw += GENX(VERTEX_ELEMENT_STATE_length);
      }
   }

   /* Element for the system-generated / draw-parameter values.  All four
    * components default to zero and are overridden below as needed.
    */
   if (needs_sgvs_element) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

#if GEN_GEN >= 8
      if (vs_prog_data->uses_basevertex ||
          vs_prog_data->uses_baseinstance) {
         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
         elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
         elem_state.Component0Control = VFCOMP_STORE_SRC;
         elem_state.Component1Control = VFCOMP_STORE_SRC;
      }
#else
      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
      elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
      if (vs_prog_data->uses_basevertex)
         elem_state.Component0Control = VFCOMP_STORE_SRC;

      if (vs_prog_data->uses_baseinstance)
         elem_state.Component1Control = VFCOMP_STORE_SRC;

      if (vs_prog_data->uses_vertexid)
         elem_state.Component2Control = VFCOMP_STORE_VID;

      if (vs_prog_data->uses_instanceid)
         elem_state.Component3Control = VFCOMP_STORE_IID;
#endif

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   /* Element that reads the draw ID from buffer index nr_buffers + 1. */
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = brw->vb.nr_buffers + 1,
         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

#if GEN_GEN >= 6
   /* The edge flag input was skipped in the main loop; emit it now so that
    * it ends up as the last element.
    */
   if (gen6_edgeflag_input) {
      const uint32_t format =
         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);

      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = gen6_edgeflag_input->buffer,
         .EdgeFlagEnable = true,
         .SourceElementFormat = format,
         .SourceElementOffset = gen6_edgeflag_input->offset,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }
#endif

#if GEN_GEN >= 8
   /* i walks the enabled inputs; j counts the element slots handed out to
    * inputs other than the edge flag.
    */
   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
      unsigned element_index;

      /* The edge flag element is reordered to be the last one in the code
       * above so we need to compensate for that in the element indices used
       * below.
       */
      if (input == gen6_edgeflag_input)
         element_index = nr_elements - 1;
      else
         element_index = j++;

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element_index;
         vfi.InstancingEnable = buffer->step_rate != 0;
         vfi.InstanceDataStepRate = buffer->step_rate;
      }
   }

   if (vs_prog_data->uses_drawid) {
      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;

      /* Only the element index is set; every other field keeps the value
       * from the packet's header initialisation.
       */
      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element;
      }
   }
#endif
}
 825
 826 static const struct brw_tracked_state genX(vertices) = {
 827    .dirty = {
 828       .mesa = _NEW_POLYGON,
 829       .brw = BRW_NEW_BATCH |
 830              BRW_NEW_BLORP |
 831              BRW_NEW_VERTICES |
 832              BRW_NEW_VS_PROG_DATA,
 833    },
 834    .emit = genX(emit_vertices),
 835 };
 836
 837 static void
 838 genX(emit_index_buffer)(struct brw_context *brw)
 839 {
 840    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 841
 842    if (index_buffer == NULL)
 843       return;
 844
 845    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 846 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 847       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 848 #endif
 849       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 850       ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
 851 #if GEN_GEN >= 8
 852       ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
 853       ib.BufferSize = brw->ib.size;
 854 #else
 855       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
 856 #endif
 857    }
 858 }
 859
 860 static const struct brw_tracked_state genX(index_buffer) = {
 861    .dirty = {
 862       .mesa = 0,
 863       .brw = BRW_NEW_BATCH |
 864              BRW_NEW_BLORP |
 865              BRW_NEW_INDEX_BUFFER,
 866    },
 867    .emit = genX(emit_index_buffer),
 868 };
 869
 870 #if GEN_IS_HASWELL || GEN_GEN >= 8
 871 static void
 872 genX(upload_cut_index)(struct brw_context *brw)
 873 {
 874    const struct gl_context *ctx = &brw->ctx;
 875
 876    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
 877       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
 878          vf.IndexedDrawCutIndexEnable = true;
 879          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
 880       }
 881    }
 882 }
 883
 884 const struct brw_tracked_state genX(cut_index) = {
 885    .dirty = {
 886       .mesa  = _NEW_TRANSFORM,
 887       .brw   = BRW_NEW_INDEX_BUFFER,
 888    },
 889    .emit = genX(upload_cut_index),
 890 };
 891 #endif
 892
 893 #if GEN_GEN >= 6
 894 /**
 895  * Determine the appropriate attribute override value to store into the
 896  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
 897  * override value contains two pieces of information: the location of the
 898  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
 899  * flag indicating whether to "swizzle" the attribute based on the direction
 900  * the triangle is facing.
 901  *
 902  * If an attribute is "swizzled", then the given VUE location is used for
 903  * front-facing triangles, and the VUE location that immediately follows is
 904  * used for back-facing triangles.  We use this to implement the mapping from
 905  * gl_FrontColor/gl_BackColor to gl_Color.
 906  *
 907  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
 908  * being instructed to begin reading attribute data.  It can be set to a
 909  * nonzero value to prevent the SF unit from wasting time reading elements of
 910  * the VUE that are not needed by the fragment shader.  It is measured in
 911  * 256-bit increments.
 912  */
 913 static void
 914 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
 915                         const struct brw_vue_map *vue_map,
 916                         int urb_entry_read_offset, int fs_attr,
 917                         bool two_side_color, uint32_t *max_source_attr)
 918 {
 919    /* Find the VUE slot for this attribute. */
 920    int slot = vue_map->varying_to_slot[fs_attr];
 921
 922    /* Viewport and Layer are stored in the VUE header.  We need to override
 923     * them to zero if earlier stages didn't write them, as GL requires that
 924     * they read back as zero when not explicitly set.
 925     */
 926    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
 927       attr->ComponentOverrideX = true;
 928       attr->ComponentOverrideW = true;
 929       attr->ConstantSource = CONST_0000;
 930
 931       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
 932          attr->ComponentOverrideY = true;
 933       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
 934          attr->ComponentOverrideZ = true;
 935
 936       return;
 937    }
 938
 939    /* If there was only a back color written but not front, use back
 940     * as the color instead of undefined
 941     */
 942    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
 943       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
 944    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
 945       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
 946
 947    if (slot == -1) {
 948       /* This attribute does not exist in the VUE--that means that the vertex
 949        * shader did not write to it.  This means that either:
 950        *
 951        * (a) This attribute is a texture coordinate, and it is going to be
 952        * replaced with point coordinates (as a consequence of a call to
 953        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
 954        * hardware will ignore whatever attribute override we supply.
 955        *
 956        * (b) This attribute is read by the fragment shader but not written by
 957        * the vertex shader, so its value is undefined.  Therefore the
 958        * attribute override we supply doesn't matter.
 959        *
 960        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
 961        * previous shader stage.
 962        *
 963        * Note that we don't have to worry about the cases where the attribute
 964        * is gl_PointCoord or is undergoing point sprite coordinate
 965        * replacement, because in those cases, this function isn't called.
 966        *
 967        * In case (c), we need to program the attribute overrides so that the
 968        * primitive ID will be stored in this slot.  In every other case, the
 969        * attribute override we supply doesn't matter.  So just go ahead and
 970        * program primitive ID in every case.
 971        */
 972       attr->ComponentOverrideW = true;
 973       attr->ComponentOverrideX = true;
 974       attr->ComponentOverrideY = true;
 975       attr->ComponentOverrideZ = true;
 976       attr->ConstantSource = PRIM_ID;
 977       return;
 978    }
 979
 980    /* Compute the location of the attribute relative to urb_entry_read_offset.
 981     * Each increment of urb_entry_read_offset represents a 256-bit value, so
 982     * it counts for two 128-bit VUE slots.
 983     */
 984    int source_attr = slot - 2 * urb_entry_read_offset;
 985    assert(source_attr >= 0 && source_attr < 32);
 986
 987    /* If we are doing two-sided color, and the VUE slot following this one
 988     * represents a back-facing color, then we need to instruct the SF unit to
 989     * do back-facing swizzling.
 990     */
 991    bool swizzling = two_side_color &&
 992       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
 993         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
 994        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
 995         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
 996
 997    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
 998    if (*max_source_attr < source_attr + swizzling)
 999       *max_source_attr = source_attr + swizzling;
1000
1001    attr->SourceAttribute = source_attr;
1002    if (swizzling)
1003       attr->SwizzleSelect = INPUTATTR_FACING;
1004 }
1005
1006
1007 static void
1008 genX(calculate_attr_overrides)(const struct brw_context *brw,
1009                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1010                                uint32_t *point_sprite_enables,
1011                                uint32_t *urb_entry_read_length,
1012                                uint32_t *urb_entry_read_offset)
1013 {
1014    const struct gl_context *ctx = &brw->ctx;
1015
1016    /* _NEW_POINT */
1017    const struct gl_point_attrib *point = &ctx->Point;
1018
1019    /* BRW_NEW_FRAGMENT_PROGRAM */
1020    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1021
1022    /* BRW_NEW_FS_PROG_DATA */
1023    const struct brw_wm_prog_data *wm_prog_data =
1024       brw_wm_prog_data(brw->wm.base.prog_data);
1025    uint32_t max_source_attr = 0;
1026
1027    *point_sprite_enables = 0;
1028
1029    int first_slot =
1030       brw_compute_first_urb_slot_required(fp->info.inputs_read,
1031                                           &brw->vue_map_geom_out);
1032
1033    /* Each URB offset packs two varying slots */
1034    assert(first_slot % 2 == 0);
1035    *urb_entry_read_offset = first_slot / 2;
1036
1037    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1038     * description of dw10 Point Sprite Texture Coordinate Enable:
1039     *
1040     * "This field must be programmed to zero when non-point primitives
1041     * are rendered."
1042     *
1043     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1044     * must be programmed to zero when rendering non-point primitives, but
1045     * the IvyBridge PRM does, and if we don't, we get garbage.
1046     *
1047     * This is not required on Haswell, as the hardware ignores this state
1048     * when drawing non-points -- although we do still need to be careful to
1049     * correctly set the attr overrides.
1050     *
1051     * _NEW_POLYGON
1052     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1053     */
1054    bool drawing_points = brw_is_drawing_points(brw);
1055
1056    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1057       int input_index = wm_prog_data->urb_setup[attr];
1058
1059       if (input_index < 0)
1060          continue;
1061
1062       /* _NEW_POINT */
1063       bool point_sprite = false;
1064       if (drawing_points) {
1065          if (point->PointSprite &&
1066              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1067              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1068             point_sprite = true;
1069          }
1070
1071          if (attr == VARYING_SLOT_PNTC)
1072             point_sprite = true;
1073
1074          if (point_sprite)
1075             *point_sprite_enables |= (1 << input_index);
1076       }
1077
1078       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1079       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1080
1081       if (!point_sprite) {
1082          genX(get_attr_override)(&attribute,
1083                                  &brw->vue_map_geom_out,
1084                                  *urb_entry_read_offset, attr,
1085                                  _mesa_vertex_program_two_side_enabled(ctx),
1086                                  &max_source_attr);
1087       }
1088
1089       /* The hardware can only do the overrides on 16 overrides at a
1090        * time, and the other up to 16 have to be lined up so that the
1091        * input index = the output index.  We'll need to do some
1092        * tweaking to make sure that's the case.
1093        */
1094       if (input_index < 16)
1095          attr_overrides[input_index] = attribute;
1096       else
1097          assert(attribute.SourceAttribute == input_index);
1098    }
1099
1100    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1101     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1102     *
1103     * "This field should be set to the minimum length required to read the
1104     *  maximum source attribute.  The maximum source attribute is indicated
1105     *  by the maximum value of the enabled Attribute # Source Attribute if
1106     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1107     *  enable is not set.
1108     *  read_length = ceiling((max_source_attr + 1) / 2)
1109     *
1110     *  [errata] Corruption/Hang possible if length programmed larger than
1111     *  recommended"
1112     *
1113     * Similar text exists for Ivy Bridge.
1114     */
1115    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1116 }
1117 #endif
1118
1119 /* ---------------------------------------------------------------------- */
1120
1121 #if GEN_GEN >= 8
1122 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
1123 #elif GEN_GEN >= 6
1124 typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
1125 #else
1126 typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
1127 #endif
1128
1129 static inline void
1130 set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1131 {
1132    struct gl_context *ctx = &brw->ctx;
1133
1134    /* _NEW_BUFFERS */
1135    struct intel_renderbuffer *depth_irb =
1136       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1137
1138    /* _NEW_DEPTH */
1139    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1140
1141    /* _NEW_STENCIL */
1142    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1143    const int b = stencil->_BackFace;
1144
1145    if (depth->Test && depth_irb) {
1146       ds->DepthTestEnable = true;
1147       ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1148       ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1149    }
1150
1151    if (brw->stencil_enabled) {
1152       ds->StencilTestEnable = true;
1153       ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1154       ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1155
1156       ds->StencilTestFunction =
1157          intel_translate_compare_func(stencil->Function[0]);
1158       ds->StencilFailOp =
1159          intel_translate_stencil_op(stencil->FailFunc[0]);
1160       ds->StencilPassDepthPassOp =
1161          intel_translate_stencil_op(stencil->ZPassFunc[0]);
1162       ds->StencilPassDepthFailOp =
1163          intel_translate_stencil_op(stencil->ZFailFunc[0]);
1164
1165       ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1166
1167       if (brw->stencil_two_sided) {
1168          ds->DoubleSidedStencilEnable = true;
1169          ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1170          ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1171
1172          ds->BackfaceStencilTestFunction =
1173             intel_translate_compare_func(stencil->Function[b]);
1174          ds->BackfaceStencilFailOp =
1175             intel_translate_stencil_op(stencil->FailFunc[b]);
1176          ds->BackfaceStencilPassDepthPassOp =
1177             intel_translate_stencil_op(stencil->ZPassFunc[b]);
1178          ds->BackfaceStencilPassDepthFailOp =
1179             intel_translate_stencil_op(stencil->ZFailFunc[b]);
1180       }
1181
1182 #if GEN_GEN <= 5 || GEN_GEN >= 9
1183       ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1184       ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1185 #endif
1186    }
1187 }
1188
1189 #if GEN_GEN >= 6
1190 static void
1191 genX(upload_depth_stencil_state)(struct brw_context *brw)
1192 {
1193 #if GEN_GEN >= 8
1194    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1195       set_depth_stencil_bits(brw, &wmds);
1196    }
1197 #else
1198    uint32_t ds_offset;
1199    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1200       set_depth_stencil_bits(brw, &ds);
1201    }
1202
1203    /* Now upload a pointer to the indirect state */
1204 #if GEN_GEN == 6
1205    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1206       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1207       ptr.DEPTH_STENCIL_STATEChange = true;
1208    }
1209 #else
1210    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1211       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1212    }
1213 #endif
1214 #endif
1215 }
1216
1217 static const struct brw_tracked_state genX(depth_stencil_state) = {
1218    .dirty = {
1219       .mesa = _NEW_BUFFERS |
1220               _NEW_DEPTH |
1221               _NEW_STENCIL,
1222       .brw  = BRW_NEW_BLORP |
1223               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1224                             : BRW_NEW_BATCH |
1225                               BRW_NEW_STATE_BASE_ADDRESS),
1226    },
1227    .emit = genX(upload_depth_stencil_state),
1228 };
1229 #endif
1230
1231 /* ---------------------------------------------------------------------- */
1232
1233 #if GEN_GEN <= 5
1234
1235 static void
1236 genX(upload_clip_state)(struct brw_context *brw)
1237 {
1238    struct gl_context *ctx = &brw->ctx;
1239
1240    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1241    brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1242       clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1243       clip.GRFRegisterCount =
1244          DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1245       clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1246       clip.SingleProgramFlow = true;
1247       clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1248       clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1249
1250       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1251       clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1252       clip.DispatchGRFStartRegisterForURBData = 1;
1253       clip.VertexURBEntryReadOffset = 0;
1254
1255       /* BRW_NEW_URB_FENCE */
1256       clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1257       clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1258
1259       if (brw->urb.nr_clip_entries >= 10) {
1260          /* Half of the URB entries go to each thread, and it has to be an
1261           * even number.
1262           */
1263          assert(brw->urb.nr_clip_entries % 2 == 0);
1264
1265          /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1266           * only 2 threads can output VUEs at a time.
1267           */
1268          clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
1269       } else {
1270          assert(brw->urb.nr_clip_entries >= 5);
1271          clip.MaximumNumberofThreads = 1 - 1;
1272       }
1273
1274       clip.VertexPositionSpace = VPOS_NDCSPACE;
1275       clip.UserClipFlagsMustClipEnable = true;
1276       clip.GuardbandClipTestEnable = true;
1277
1278       clip.ClipperViewportStatePointer =
1279          ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1280
1281       clip.ScreenSpaceViewportXMin = -1;
1282       clip.ScreenSpaceViewportXMax = 1;
1283       clip.ScreenSpaceViewportYMin = -1;
1284       clip.ScreenSpaceViewportYMax = 1;
1285
1286       clip.ViewportXYClipTestEnable = true;
1287       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1288
1289       /* _NEW_TRANSFORM */
1290       if (GEN_GEN == 5 || GEN_IS_G4X) {
1291          clip.UserClipDistanceClipTestEnableBitmask =
1292             ctx->Transform.ClipPlanesEnabled;
1293       } else {
1294          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1295           * workaround.
1296           */
1297          clip.UserClipDistanceClipTestEnableBitmask =
1298             (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1299       }
1300
1301       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1302          clip.APIMode = APIMODE_D3D;
1303       else
1304          clip.APIMode = APIMODE_OGL;
1305
1306       clip.GuardbandClipTestEnable = true;
1307
1308       clip.ClipMode = brw->clip.prog_data->clip_mode;
1309
1310 #if GEN_IS_G4X
1311       clip.NegativeWClipTestEnable = true;
1312 #endif
1313    }
1314 }
1315
1316 const struct brw_tracked_state genX(clip_state) = {
1317    .dirty = {
1318       .mesa  = _NEW_TRANSFORM |
1319                _NEW_VIEWPORT,
1320       .brw   = BRW_NEW_BATCH |
1321                BRW_NEW_BLORP |
1322                BRW_NEW_CLIP_PROG_DATA |
1323                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1324                BRW_NEW_PROGRAM_CACHE |
1325                BRW_NEW_URB_FENCE,
1326    },
1327    .emit = genX(upload_clip_state),
1328 };
1329
1330 #else
1331
1332 static void
1333 genX(upload_clip_state)(struct brw_context *brw)
1334 {
1335    struct gl_context *ctx = &brw->ctx;
1336
1337    /* _NEW_BUFFERS */
1338    struct gl_framebuffer *fb = ctx->DrawBuffer;
1339
1340    /* BRW_NEW_FS_PROG_DATA */
1341    struct brw_wm_prog_data *wm_prog_data =
1342       brw_wm_prog_data(brw->wm.base.prog_data);
1343
1344    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1345       clip.StatisticsEnable = !brw->meta_in_progress;
1346
1347       if (wm_prog_data->barycentric_interp_modes &
1348           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1349          clip.NonPerspectiveBarycentricEnable = true;
1350
1351 #if GEN_GEN >= 7
1352       clip.EarlyCullEnable = true;
1353 #endif
1354
1355 #if GEN_GEN == 7
1356       clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
1357
1358       if (ctx->Polygon.CullFlag) {
1359          switch (ctx->Polygon.CullFaceMode) {
1360          case GL_FRONT:
1361             clip.CullMode = CULLMODE_FRONT;
1362             break;
1363          case GL_BACK:
1364             clip.CullMode = CULLMODE_BACK;
1365             break;
1366          case GL_FRONT_AND_BACK:
1367             clip.CullMode = CULLMODE_BOTH;
1368             break;
1369          default:
1370             unreachable("Should not get here: invalid CullFlag");
1371          }
1372       } else {
1373          clip.CullMode = CULLMODE_NONE;
1374       }
1375 #endif
1376
1377 #if GEN_GEN < 8
1378       clip.UserClipDistanceCullTestEnableBitmask =
1379          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1380
1381       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1382 #endif
1383
1384       /* _NEW_LIGHT */
1385       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1386          clip.TriangleStripListProvokingVertexSelect = 0;
1387          clip.TriangleFanProvokingVertexSelect = 1;
1388          clip.LineStripListProvokingVertexSelect = 0;
1389       } else {
1390          clip.TriangleStripListProvokingVertexSelect = 2;
1391          clip.TriangleFanProvokingVertexSelect = 2;
1392          clip.LineStripListProvokingVertexSelect = 1;
1393       }
1394
1395       /* _NEW_TRANSFORM */
1396       clip.UserClipDistanceClipTestEnableBitmask =
1397          ctx->Transform.ClipPlanesEnabled;
1398
1399 #if GEN_GEN >= 8
1400       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1401 #endif
1402
1403       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1404          clip.APIMode = APIMODE_D3D;
1405       else
1406          clip.APIMode = APIMODE_OGL;
1407
1408       clip.GuardbandClipTestEnable = true;
1409
1410       /* BRW_NEW_VIEWPORT_COUNT */
1411       const unsigned viewport_count = brw->clip.viewport_count;
1412
1413       if (ctx->RasterDiscard) {
1414          clip.ClipMode = CLIPMODE_REJECT_ALL;
1415 #if GEN_GEN == 6
1416          perf_debug("Rasterizer discard is currently implemented via the "
1417                     "clipper; having the GS not write primitives would "
1418                     "likely be faster.\n");
1419 #endif
1420       } else {
1421          clip.ClipMode = CLIPMODE_NORMAL;
1422       }
1423
1424       clip.ClipEnable = true;
1425
1426       /* _NEW_POLYGON,
1427        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1428        */
1429       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1430          clip.ViewportXYClipTestEnable = true;
1431
1432       clip.MinimumPointWidth = 0.125;
1433       clip.MaximumPointWidth = 255.875;
1434       clip.MaximumVPIndex = viewport_count - 1;
1435       if (_mesa_geometric_layers(fb) == 0)
1436          clip.ForceZeroRTAIndexEnable = true;
1437    }
1438 }
1439
1440 static const struct brw_tracked_state genX(clip_state) = {
1441    .dirty = {
1442       .mesa  = _NEW_BUFFERS |
1443                _NEW_LIGHT |
1444                _NEW_POLYGON |
1445                _NEW_TRANSFORM,
1446       .brw   = BRW_NEW_BLORP |
1447                BRW_NEW_CONTEXT |
1448                BRW_NEW_FS_PROG_DATA |
1449                BRW_NEW_GS_PROG_DATA |
1450                BRW_NEW_VS_PROG_DATA |
1451                BRW_NEW_META_IN_PROGRESS |
1452                BRW_NEW_PRIMITIVE |
1453                BRW_NEW_RASTERIZER_DISCARD |
1454                BRW_NEW_TES_PROG_DATA |
1455                BRW_NEW_VIEWPORT_COUNT,
1456    },
1457    .emit = genX(upload_clip_state),
1458 };
1459 #endif
1460
1461 /* ---------------------------------------------------------------------- */
1462
1463 static void
1464 genX(upload_sf)(struct brw_context *brw)
1465 {
1466    struct gl_context *ctx = &brw->ctx;
1467    float point_size;
1468
1469 #if GEN_GEN <= 7
1470    /* _NEW_BUFFERS */
1471    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1472    UNUSED const bool multisampled_fbo =
1473       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1474 #endif
1475
1476 #if GEN_GEN < 6
1477    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1478
1479    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1480
1481    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1482       sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
1483       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1484       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1485       sf.DispatchGRFStartRegisterForURBData = 3;
1486       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1487       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1488       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1489       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1490
1491       /* STATE_PREFETCH command description describes this state as being
1492        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1493        * domain.
1494        */
1495       sf.SetupViewportStateOffset =
1496          ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
1497
1498       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1499
1500       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1501       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1502
1503       sf.MaximumNumberofThreads =
1504          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1505
1506       sf.SpritePointEnable = ctx->Point.PointSprite;
1507
1508       sf.DestinationOriginHorizontalBias = 0.5;
1509       sf.DestinationOriginVerticalBias = 0.5;
1510 #else
1511    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1512       sf.StatisticsEnable = true;
1513 #endif
1514       sf.ViewportTransformEnable = true;
1515
1516 #if GEN_GEN == 7
1517       /* _NEW_BUFFERS */
1518       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1519 #endif
1520
1521 #if GEN_GEN <= 7
1522       /* _NEW_POLYGON */
1523       sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
1524 #if GEN_GEN >= 6
1525       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1526       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1527       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1528
1529       switch (ctx->Polygon.FrontMode) {
1530          case GL_FILL:
1531             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1532             break;
1533          case GL_LINE:
1534             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1535             break;
1536          case GL_POINT:
1537             sf.FrontFaceFillMode = FILL_MODE_POINT;
1538             break;
1539          default:
1540             unreachable("not reached");
1541       }
1542
1543       switch (ctx->Polygon.BackMode) {
1544          case GL_FILL:
1545             sf.BackFaceFillMode = FILL_MODE_SOLID;
1546             break;
1547          case GL_LINE:
1548             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1549             break;
1550          case GL_POINT:
1551             sf.BackFaceFillMode = FILL_MODE_POINT;
1552             break;
1553          default:
1554             unreachable("not reached");
1555       }
1556
1557       if (multisampled_fbo && ctx->Multisample.Enabled)
1558          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1559
1560       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1561       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1562       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1563 #endif
1564
1565       sf.ScissorRectangleEnable = true;
1566
1567       if (ctx->Polygon.CullFlag) {
1568          switch (ctx->Polygon.CullFaceMode) {
1569             case GL_FRONT:
1570                sf.CullMode = CULLMODE_FRONT;
1571                break;
1572             case GL_BACK:
1573                sf.CullMode = CULLMODE_BACK;
1574                break;
1575             case GL_FRONT_AND_BACK:
1576                sf.CullMode = CULLMODE_BOTH;
1577                break;
1578             default:
1579                unreachable("not reached");
1580          }
1581       } else {
1582          sf.CullMode = CULLMODE_NONE;
1583       }
1584
1585 #if GEN_IS_HASWELL
1586       sf.LineStippleEnable = ctx->Line.StippleFlag;
1587 #endif
1588
1589 #endif
1590
1591       /* _NEW_LINE */
1592 #if GEN_GEN == 8
1593       const struct gen_device_info *devinfo = &brw->screen->devinfo;
1594
1595       if (devinfo->is_cherryview)
1596          sf.CHVLineWidth = brw_get_line_width(brw);
1597       else
1598          sf.LineWidth = brw_get_line_width(brw);
1599 #else
1600       sf.LineWidth = brw_get_line_width(brw);
1601 #endif
1602
1603       if (ctx->Line.SmoothFlag) {
1604          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1605 #if GEN_GEN <= 7
1606          sf.AntiAliasingEnable = true;
1607 #endif
1608       }
1609
1610       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1611       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1612       /* Clamp to the hardware limits */
1613       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1614
1615       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1616       if (use_state_point_size(brw))
1617          sf.PointWidthSource = State;
1618
1619 #if GEN_GEN >= 8
1620       /* _NEW_POINT | _NEW_MULTISAMPLE */
1621       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1622           !ctx->Point.PointSprite)
1623          sf.SmoothPointEnable = true;
1624 #endif
1625
1626 #if GEN_GEN == 10
1627       /* _NEW_BUFFERS
1628        * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
1629        */
1630       const bool multisampled_fbo =
1631          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1632       if (multisampled_fbo)
1633          sf.SmoothPointEnable = false;
1634 #endif
1635
1636 #if GEN_IS_G4X || GEN_GEN >= 5
1637       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1638 #endif
1639
1640       /* _NEW_LIGHT */
1641       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1642          sf.TriangleStripListProvokingVertexSelect = 2;
1643          sf.TriangleFanProvokingVertexSelect = 2;
1644          sf.LineStripListProvokingVertexSelect = 1;
1645       } else {
1646          sf.TriangleFanProvokingVertexSelect = 1;
1647       }
1648
1649 #if GEN_GEN == 6
1650       /* BRW_NEW_FS_PROG_DATA */
1651       const struct brw_wm_prog_data *wm_prog_data =
1652          brw_wm_prog_data(brw->wm.base.prog_data);
1653
1654       sf.AttributeSwizzleEnable = true;
1655       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1656
1657       /*
1658        * Window coordinates in an FBO are inverted, which means point
1659        * sprite origin must be inverted, too.
1660        */
1661       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
1662          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1663       } else {
1664          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1665       }
1666
1667       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1668        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1669        */
1670       uint32_t urb_entry_read_length;
1671       uint32_t urb_entry_read_offset;
1672       uint32_t point_sprite_enables;
1673       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1674                                      &urb_entry_read_length,
1675                                      &urb_entry_read_offset);
1676       sf.VertexURBEntryReadLength = urb_entry_read_length;
1677       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1678       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1679       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1680 #endif
1681    }
1682 }
1683
/*
 * State atom for genX(upload_sf): lists the dirty bits associated with that
 * emit function.  The masks are built from GEN_GEN ternaries, so every
 * per-generation compile of this file ends up with a different constant.
 *
 * NOTE(review): presumably the state-upload code calls .emit when any bit in
 * .mesa or .brw is flagged -- confirm against the brw_tracked_state consumer.
 */
static const struct brw_tracked_state genX(sf_state) = {
   .dirty = {
      /* Core Mesa (_NEW_*) flags. */
      .mesa  = _NEW_LIGHT |
               _NEW_LINE |
               _NEW_POINT |
               _NEW_PROGRAM |
               (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
               (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
               /* Gen10 looks at the draw buffer's sample count when deciding
                * SmoothPointEnable, hence _NEW_BUFFERS there as well.
                */
               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
      /* Driver-internal (BRW_NEW_*) flags. */
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               (GEN_GEN <= 5 ? BRW_NEW_BATCH |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_SF_PROG_DATA |
                               BRW_NEW_SF_VP |
                               BRW_NEW_URB_FENCE
                             : 0) |
               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
               (GEN_GEN >= 6 && GEN_GEN <= 7 ?
                               BRW_NEW_GS_PROG_DATA |
                               BRW_NEW_PRIMITIVE |
                               BRW_NEW_TES_PROG_DATA
                             : 0) |
               /* Gen6 programs attribute setup from the FS prog data inside
                * the SF packet.
                */
               (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
                               BRW_NEW_FRAGMENT_PROGRAM
                             : 0),
   },
   .emit = genX(upload_sf),
};
1713
1714 /* ---------------------------------------------------------------------- */
1715
1716 static bool
1717 brw_color_buffer_write_enabled(struct brw_context *brw)
1718 {
1719    struct gl_context *ctx = &brw->ctx;
1720    /* BRW_NEW_FRAGMENT_PROGRAM */
1721    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1722    unsigned i;
1723
1724    /* _NEW_BUFFERS */
1725    for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
1726       struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
1727       uint64_t outputs_written = fp->info.outputs_written;
1728
1729       /* _NEW_COLOR */
1730       if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
1731                  outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
1732           (ctx->Color.ColorMask[i][0] ||
1733            ctx->Color.ColorMask[i][1] ||
1734            ctx->Color.ColorMask[i][2] ||
1735            ctx->Color.ColorMask[i][3])) {
1736          return true;
1737       }
1738    }
1739
1740    return false;
1741 }
1742
/**
 * Emit the fragment-shader (WM) unit state for the current generation.
 *
 *  - Gen6 only: a 3DSTATE_CONSTANT_PS packet is emitted first.
 *  - Gen6+: the state goes into a 3DSTATE_WM batch packet.
 *  - Gen4-5: the state goes into an indirect WM_STATE structure whose
 *    offset is stored in brw->wm.base.state_offset, and
 *    BRW_NEW_GEN4_UNIT_STATE is flagged.
 *  - Gen4-5: a 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP packet follows whenever
 *    ctx->Polygon.OffsetClamp differs from the cached brw->wm.offset_clamp.
 *
 * Note that the packet/struct block is opened in one of the two arms of the
 * "#if GEN_GEN >= 6 / #else" below and closed by a single '}' after the
 * EarlyDepthStencilControl code, so the preprocessor structure must be kept
 * balanced when editing.
 */
static void
genX(upload_wm)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   /* The locals below are only referenced on some generations, hence UNUSED. */
   UNUSED bool writes_depth =
      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;

#if GEN_GEN == 6
   /* We can't fold this into gen6_upload_wm_push_constants(), because
    * according to the SNB PRM, vol 2 part 1 section 7.2.2
    * (3DSTATE_CONSTANT_PS [DevSNB]):
    *
    *     "[DevSNB]: This packet must be followed by WM_STATE."
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
      /* NOTE(review): the VS counterpart in this file keys on
       * stage_state->push_const_size rather than nr_params -- confirm the
       * two conditions are always equivalent here.
       */
      if (wm_prog_data->base.nr_params != 0) {
         wmcp.Buffer0Valid = true;
         /* Pointer to the WM constant buffer.  Covered by the set of
          * state flags from gen6_upload_wm_push_constants.
          */
         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
      wm.LineAntialiasingRegionWidth = _10pixels;
      wm.LineEndCapAntialiasingRegionWidth = _05pixels;

      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
      if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
         /* These two fields should be the same pre-gen6, which is why we
          * only have one hardware field to program for both dispatch
          * widths.
          */
         assert(wm_prog_data->base.dispatch_grf_start_reg ==
                wm_prog_data->dispatch_grf_start_reg_2);
      }

      if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
         wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;

      if (stage_state->sampler_count)
         wm.SamplerStatePointer =
            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
#if GEN_GEN == 5
      if (wm_prog_data->prog_offset_2)
         wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
#endif

      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
      wm.EarlyDepthTestEnable = true;
      /* Note: these two widths are the reverse of the Gen6+ values above. */
      wm.LineAntialiasingRegionWidth = _05pixels;
      wm.LineEndCapAntialiasingRegionWidth = _10pixels;

      /* _NEW_POLYGON */
      if (ctx->Polygon.OffsetFill) {
         wm.GlobalDepthOffsetEnable = true;
         /* Something weird going on with legacy_global_depth_bias,
          * offset_constant, scaling and MRD.  This value passes glean
          * but gives some odd results elsewhere (eg. the
          * quad-offset-units test).
          */
         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;

         /* This is the only value that passes glean:
         */
         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      }

      wm.DepthCoefficientURBReadOffset = 1;
#endif

      /* BRW_NEW_STATS_WM */
      /* Always on for Gen6+; on older gens it follows brw->stats_wm. */
      wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;

#if GEN_GEN < 7
      if (wm_prog_data->base.use_alt_mode)
         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;

      /* Gen5 always programs 0; otherwise the count is rounded up to a
       * multiple of four and divided by four.
       */
      wm.SamplerCount = GEN_GEN == 5 ?
         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);

      wm.BindingTableEntryCount =
         wm_prog_data->base.binding_table.size_bytes / 4;
      wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      wm.DispatchGRFStartRegisterForConstantSetupData0 =
         wm_prog_data->base.dispatch_grf_start_reg;
      if (GEN_GEN == 6 ||
          wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
      }

#if GEN_GEN >= 5
      if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
         wm.KernelStartPointer2 =
            KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
      }
#endif

#if GEN_GEN == 6
      /* Dual-source blending is keyed off draw buffer 0 only (bit 0 of
       * BlendEnabled and Blend[0]).
       */
      wm.DualSourceBlendEnable =
         wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
         ctx->Color.Blend[0]._UsesDualSrc;
      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /* From the SNB PRM, volume 2 part 1, page 281:
       * "If the PS kernel does not need the Position XY Offsets
       * to compute a Position XY value, then this field should be
       * programmed to POSOFFSET_NONE."
       *
       * "SW Recommendation: If the PS kernel needs the Position Offsets
       * to compute a Position XY value, this field should match Position
       * ZW Interpolation Mode to ensure a consistent position.xyzw
       * computation."
       * We only require XY sample offsets. So, this recommendation doesn't
       * look useful at the moment. We might need this in future.
       */
      if (wm_prog_data->uses_pos_offset)
         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
      else
         wm.PositionXYOffsetSelect = POSOFFSET_NONE;

      wm.DispatchGRFStartRegisterForConstantSetupData2 =
         wm_prog_data->dispatch_grf_start_reg_2;
#endif

      if (wm_prog_data->base.total_scratch) {
         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
         /* ffs(x) - 11 maps 1024 -> 0, 2048 -> 1, and so on.
          * NOTE(review): presumably per_thread_scratch is always a power of
          * two of at least 1024 -- confirm where it is computed.
          */
         wm.PerThreadScratchSpace =
            ffs(stage_state->per_thread_scratch) - 11;
      }

      wm.PixelShaderComputedDepth = writes_depth;
#endif

      /* _NEW_LINE */
      wm.LineStippleEnable = ctx->Line.StippleFlag;

      /* _NEW_POLYGON */
      wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;

#if GEN_GEN < 8

#if GEN_GEN >= 6
      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;

      /* _NEW_BUFFERS */
      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;

      if (multisampled_fbo) {
         /* _NEW_MULTISAMPLE */
         if (ctx->Multisample.Enabled)
            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
         else
            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;

         if (wm_prog_data->persample_dispatch)
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         else
            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
      } else {
         /* Single-sampled target: per-sample dispatch is selected
          * unconditionally here.
          */
         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }
#endif
      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      if (wm_prog_data->uses_kill ||
          _mesa_is_alpha_test_enabled(ctx) ||
          _mesa_is_alpha_to_coverage_enabled(ctx) ||
          (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
         wm.PixelShaderKillsPixel = true;
      }

      /* _NEW_BUFFERS | _NEW_COLOR */
      /* Threads are only dispatched when the shader has some visible
       * effect: a color write, a depth write, a pixel kill, or (Gen6+)
       * side effects.
       */
      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
          wm.PixelShaderKillsPixel ||
          (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
         wm.ThreadDispatchEnable = true;
      }

#if GEN_GEN >= 7
      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#endif

      /* The "UAV access enable" bits are unnecessary on HSW because they only
       * seem to have an effect on the HW-assisted coherency mechanism which we
       * don't need, and the rasterization-related UAV_ONLY flag and the
       * DISPATCH_ENABLE bit can be set independently from it.
       * C.f. gen8_upload_ps_extra().
       *
       * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
       * _NEW_COLOR
       */
#if GEN_IS_HASWELL
      if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
          wm_prog_data->has_side_effects)
         wm.PSUAVonly = ON;
#endif
#endif

#if GEN_GEN >= 7
      /* BRW_NEW_FS_PROG_DATA */
      if (wm_prog_data->early_fragment_tests)
         wm.EarlyDepthStencilControl = EDSC_PREPS;
      else if (wm_prog_data->has_side_effects)
         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
#endif
   }

#if GEN_GEN <= 5
   /* Only re-emit the clamp packet when the value actually changed; the
    * last emitted value is cached in brw->wm.offset_clamp.
    */
   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
      }

      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
   }
#endif
}
1983
/*
 * State atom for genX(upload_wm): lists the dirty bits associated with that
 * emit function.  The masks depend on GEN_GEN, so every per-generation
 * compile of this file gets its own constant.
 *
 * NOTE(review): presumably the state-upload code calls .emit when any bit in
 * .mesa or .brw is flagged -- confirm against the brw_tracked_state consumer.
 */
static const struct brw_tracked_state genX(wm_state) = {
   .dirty = {
      /* Core Mesa (_NEW_*) flags. */
      .mesa  = _NEW_LINE |
               _NEW_POLYGON |
               (GEN_GEN < 8 ? _NEW_BUFFERS |
                              _NEW_COLOR :
                              0) |
               (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
               (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
               (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
      /* Driver-internal (BRW_NEW_*) flags. */
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_FS_PROG_DATA |
               (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                              BRW_NEW_FRAGMENT_PROGRAM |
                              BRW_NEW_PROGRAM_CACHE |
                              BRW_NEW_SAMPLER_STATE_TABLE |
                              BRW_NEW_STATS_WM
                            : 0) |
               /* Exactly one of the two is selected, depending on GEN_GEN. */
               (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
   },
   .emit = genX(upload_wm),
};
2006
2007 /* ---------------------------------------------------------------------- */
2008
/*
 * Fill in the thread-dispatch fields that the shader-stage packets share.
 *
 *   pkt     - packet/struct lvalue whose fields are assigned.
 *   prefix  - token pasted in front of URBEntryReadLength and
 *             URBEntryReadOffset to form the stage-specific field names
 *             (e.g. Vertex -> VertexURBEntryReadLength).
 *
 * The macro is deliberately unhygienic: besides its arguments it reads
 * `brw`, `stage_state`, `stage_prog_data` and `vue_prog_data` from the
 * scope of the caller, so those locals must exist under exactly these names.
 *
 * The scratch fields are only written when the program uses scratch space;
 * ffs(x) - 11 maps 1024 -> 0, 2048 -> 1, and so on.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement: it stays fully guarded under an unbraced if/else and must be
 * terminated by the caller's semicolon.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   do {                                                                   \
      (pkt).KernelStartPointer = KSP(brw, stage_state->prog_offset);      \
      (pkt).SamplerCount       =                                          \
         DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);       \
      (pkt).BindingTableEntryCount =                                      \
         stage_prog_data->binding_table.size_bytes / 4;                   \
      (pkt).FloatingPointMode  = stage_prog_data->use_alt_mode;           \
                                                                          \
      if (stage_prog_data->total_scratch) {                               \
         (pkt).ScratchSpaceBasePointer =                                  \
            rw_bo(stage_state->scratch_bo, 0);                            \
         (pkt).PerThreadScratchSpace =                                    \
            ffs(stage_state->per_thread_scratch) - 11;                    \
      }                                                                   \
                                                                          \
      (pkt).DispatchGRFStartRegisterForURBData =                          \
         stage_prog_data->dispatch_grf_start_reg;                         \
      (pkt).prefix##URBEntryReadLength = vue_prog_data->urb_read_length;  \
      (pkt).prefix##URBEntryReadOffset = 0;                               \
                                                                          \
      (pkt).StatisticsEnable = true;                                      \
      (pkt).Enable           = true;                                      \
   } while (0)
2030
/**
 * Emit the vertex-shader unit state for the current generation.
 *
 *  - Gen6 only: a 3DSTATE_CONSTANT_VS packet is emitted first and a
 *    PIPE_CONTROL flush is emitted last (see the comments at each site).
 *  - Gen7: gen7_emit_vs_workaround_flush() is called first when
 *    devinfo->is_ivybridge is set.
 *  - Gen6+: the state goes into a 3DSTATE_VS batch packet.
 *  - Gen4-5: the state goes into an indirect VS_STATE structure whose offset
 *    is stored in brw->vs.base.state_offset, and BRW_NEW_GEN4_UNIT_STATE is
 *    flagged.
 *
 * The locals `brw`, `stage_state`, `stage_prog_data` and `vue_prog_data` are
 * read by INIT_THREAD_DISPATCH_FIELDS and must keep these names.
 */
static void
genX(upload_vs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->vs.base;

   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(brw->vs.base.prog_data);
   const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;

   /* Only these two dispatch modes are handled by this function. */
   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);

#if GEN_GEN == 6
   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
    *   command that causes the VS Function Enable to toggle. Pipeline
    *   flush can be executed by sending a PIPE_CONTROL command with CS
    *   stall bit set and a post sync operation.
    *
    * We've already done such a flush at the start of state upload, so we
    * don't need to do another one here.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
      /* Buffer 0 is left invalid when there are no push constants. */
      if (stage_state->push_const_size != 0) {
         cvs.Buffer0Valid = true;
         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

   if (GEN_GEN == 7 && devinfo->is_ivybridge)
      gen7_emit_vs_workaround_flush(brw);

   /* The block opened in either arm of this #if is closed by the single '}'
    * after the Gen8+ fields below.
    */
#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
#endif
      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);

      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;

#if GEN_GEN < 6
      vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
      vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
      vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;

      /* On Gen5 the entry count is programmed divided by four. */
      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
      vs.URBEntryAllocationSize = brw->urb.vsize - 1;

      /* Overrides the value assigned just above: at most one thread per two
       * URB entries, and never more than the device limit.
       */
      vs.MaximumNumberofThreads =
         CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;

      /* Overrides the "true" set by INIT_THREAD_DISPATCH_FIELDS. */
      vs.StatisticsEnable = false;
      vs.SamplerStatePointer =
         ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
#endif

#if GEN_GEN == 5
      /* Force single program flow on Ironlake.  We cannot reliably get
       * all applications working without it.  See:
       * https://bugs.freedesktop.org/show_bug.cgi?id=29172
       *
       * The most notable and reliably failing application is the Humus
       * demo "CelShading"
       */
      vs.SingleProgramFlow = true;
      vs.SamplerCount = 0; /* hardware requirement */
#endif

#if GEN_GEN >= 8
      vs.SIMD8DispatchEnable =
         vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;

      vs.UserClipDistanceCullTestEnableBitmask =
         vue_prog_data->cull_distance_mask;
#endif
   }

#if GEN_GEN == 6
   /* Based on my reading of the simulator, the VS constants don't get
    * pulled into the VS FF unit until an appropriate pipeline flush
    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
    * references to them into a little FIFO.  The flushes are common,
    * but don't reliably happen between this and a 3DPRIMITIVE, causing
    * the primitive to use the wrong constants.  Then the FIFO
    * containing the constant setup gets added to again on the next
    * constants change, and eventually when a flush does happen the
    * unit is overwhelmed by constant changes and dies.
    *
    * To avoid this, send a PIPE_CONTROL down the line that will
    * update the unit immediately loading the constants.  The flush
    * type bits here were those set by the STATE_BASE_ADDRESS whose
    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
    * bug reports that led to this workaround, and may be more than
    * what is strictly required to avoid the issue.
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
2141
/*
 * State atom for genX(upload_vs_state): lists the dirty bits associated with
 * that emit function.  The masks depend on GEN_GEN, so every per-generation
 * compile of this file gets its own constant.
 *
 * NOTE(review): presumably the state-upload code calls .emit when any bit in
 * .mesa or .brw is flagged -- confirm against the brw_tracked_state consumer.
 */
static const struct brw_tracked_state genX(vs_state) = {
   .dirty = {
      /* Core Mesa flags: only Gen6 contributes any (empty mask otherwise). */
      .mesa  = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
      /* Driver-internal (BRW_NEW_*) flags. */
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_VS_PROG_DATA |
               (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
               (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_SAMPLER_STATE_TABLE |
                               BRW_NEW_URB_FENCE
                             : 0),
   },
   .emit = genX(upload_vs_state),
};
2158
2159 /* ---------------------------------------------------------------------- */
2160
2161 static void
2162 genX(upload_cc_viewport)(struct brw_context *brw)
2163 {
2164    struct gl_context *ctx = &brw->ctx;
2165
2166    /* BRW_NEW_VIEWPORT_COUNT */
2167    const unsigned viewport_count = brw->clip.viewport_count;
2168
2169    struct GENX(CC_VIEWPORT) ccv;
2170    uint32_t cc_vp_offset;
2171    uint32_t *cc_map =
2172       brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
2173                       32, &cc_vp_offset);
2174
2175    for (unsigned i = 0; i < viewport_count; i++) {
2176       /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2177       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2178       if (ctx->Transform.DepthClamp) {
2179          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2180          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2181       } else {
2182          ccv.MinimumDepth = 0.0;
2183          ccv.MaximumDepth = 1.0;
2184       }
2185       GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
2186       cc_map += GENX(CC_VIEWPORT_length);
2187    }
2188
2189 #if GEN_GEN >= 7
2190    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2191       ptr.CCViewportPointer = cc_vp_offset;
2192    }
2193 #elif GEN_GEN == 6
2194    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2195       vp.CCViewportStateChange = 1;
2196       vp.PointertoCC_VIEWPORT = cc_vp_offset;
2197    }
2198 #else
2199    brw->cc.vp_offset = cc_vp_offset;
2200    ctx->NewDriverState |= BRW_NEW_CC_VP;
2201 #endif
2202 }
2203
/*
 * State atom for genX(upload_cc_viewport): lists the dirty bits associated
 * with that emit function.  Unlike the neighbouring atoms this one is not
 * static, so it has external linkage.
 *
 * NOTE(review): presumably the state-upload code calls .emit when any bit in
 * .mesa or .brw is flagged -- confirm against the brw_tracked_state consumer.
 */
const struct brw_tracked_state genX(cc_vp) = {
   .dirty = {
      /* The emit function reads Transform.DepthClamp and the viewport
       * near/far values.
       */
      .mesa = _NEW_TRANSFORM |
              _NEW_VIEWPORT,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_cc_viewport)
};
2214
2215 /* ---------------------------------------------------------------------- */
2216
2217 static void
2218 set_scissor_bits(const struct gl_context *ctx, int i,
2219                  bool render_to_fbo, unsigned fb_width, unsigned fb_height,
2220                  struct GENX(SCISSOR_RECT) *sc)
2221 {
2222    int bbox[4];
2223
2224    bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2225    bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2226    bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
2227    bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2228    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2229
2230    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2231       /* If the scissor was out of bounds and got clamped to 0 width/height
2232        * at the bounds, the subtraction of 1 from maximums could produce a
2233        * negative number and thus not clip anything.  Instead, just provide
2234        * a min > max scissor inside the bounds, which produces the expected
2235        * no rendering.
2236        */
2237       sc->ScissorRectangleXMin = 1;
2238       sc->ScissorRectangleXMax = 0;
2239       sc->ScissorRectangleYMin = 1;
2240       sc->ScissorRectangleYMax = 0;
2241    } else if (render_to_fbo) {
2242       /* texmemory: Y=0=bottom */
2243       sc->ScissorRectangleXMin = bbox[0];
2244       sc->ScissorRectangleXMax = bbox[1] - 1;
2245       sc->ScissorRectangleYMin = bbox[2];
2246       sc->ScissorRectangleYMax = bbox[3] - 1;
2247    } else {
2248       /* memory: Y=0=top */
2249       sc->ScissorRectangleXMin = bbox[0];
2250       sc->ScissorRectangleXMax = bbox[1] - 1;
2251       sc->ScissorRectangleYMin = fb_height - bbox[3];
2252       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2253    }
2254 }
2255
2256 #if GEN_GEN >= 6
2257 static void
2258 genX(upload_scissor_state)(struct brw_context *brw)
2259 {
2260    struct gl_context *ctx = &brw->ctx;
2261    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2262    struct GENX(SCISSOR_RECT) scissor;
2263    uint32_t scissor_state_offset;
2264    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2265    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2266    uint32_t *scissor_map;
2267
2268    /* BRW_NEW_VIEWPORT_COUNT */
2269    const unsigned viewport_count = brw->clip.viewport_count;
2270
2271    scissor_map = brw_state_batch(
2272       brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2273       32, &scissor_state_offset);
2274
2275    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2276
2277    /* The scissor only needs to handle the intersection of drawable and
2278     * scissor rect.  Clipping to the boundaries of static shared buffers
2279     * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2280     *
2281     * Note that the hardware's coordinates are inclusive, while Mesa's min is
2282     * inclusive but max is exclusive.
2283     */
2284    for (unsigned i = 0; i < viewport_count; i++) {
2285       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
2286       GENX(SCISSOR_RECT_pack)(
2287          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2288    }
2289
2290    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2291       ptr.ScissorRectPointer = scissor_state_offset;
2292    }
2293 }
2294
/*
 * State atom for genX(upload_scissor_state): lists the dirty bits associated
 * with that emit function.  Only compiled for GEN_GEN >= 6, like the emit
 * function itself.
 *
 * NOTE(review): presumably the state-upload code calls .emit when any bit in
 * .mesa or .brw is flagged -- confirm against the brw_tracked_state consumer.
 */
static const struct brw_tracked_state genX(scissor_state) = {
   .dirty = {
      /* The emit function reads the draw buffer, scissor and viewport. */
      .mesa = _NEW_BUFFERS |
              _NEW_SCISSOR |
              _NEW_VIEWPORT,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_scissor_state),
};
2306 #endif
2307
2308 /* ---------------------------------------------------------------------- */
2309
2310 static void
2311 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2312                              float m00, float m11, float m30, float m31,
2313                              float *xmin, float *xmax,
2314                              float *ymin, float *ymax)
2315 {
2316    /* According to the "Vertex X,Y Clamping and Quantization" section of the
2317     * Strips and Fans documentation:
2318     *
2319     * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2320     *  fixed-point "guardband" range supported by the rasterization hardware"
2321     *
2322     * and
2323     *
2324     * "In almost all circumstances, if an object’s vertices are actually
2325     *  modified by this clamping (i.e., had X or Y coordinates outside of
2326     *  the guardband extent the rendered object will not match the intended
2327     *  result.  Therefore software should take steps to ensure that this does
2328     *  not happen - e.g., by clipping objects such that they do not exceed
2329     *  these limits after the Drawing Rectangle is applied."
2330     *
2331     * I believe the fundamental restriction is that the rasterizer (in
2332     * the SF/WM stages) have a limit on the number of pixels that can be
2333     * rasterized.  We need to ensure any coordinates beyond the rasterizer
2334     * limit are handled by the clipper.  So effectively that limit becomes
2335     * the clipper's guardband size.
2336     *
2337     * It goes on to say:
2338     *
2339     * "In addition, in order to be correctly rendered, objects must have a
2340     *  screenspace bounding box not exceeding 8K in the X or Y direction.
2341     *  This additional restriction must also be comprehended by software,
2342     *  i.e., enforced by use of clipping."
2343     *
2344     * This makes no sense.  Gen7+ hardware supports 16K render targets,
2345     * and you definitely need to be able to draw polygons that fill the
2346     * surface.  Our assumption is that the rasterizer was limited to 8K
2347     * on Sandybridge, which only supports 8K surfaces, and it was actually
2348     * increased to 16K on Ivybridge and later.
2349     *
2350     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2351     */
2352    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2353
2354    if (m00 != 0 && m11 != 0) {
2355       /* First, we compute the screen-space render area */
2356       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2357       const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2358       const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2359       const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2360
2361       /* We want the guardband to be centered on that */
2362       const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2363       const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2364       const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2365       const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2366
2367       /* Now we need it in native device coordinates */
2368       const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2369       const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2370       const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2371       const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2372
2373       /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2374        * flipped upside-down.  X should be fine though.
2375        */
2376       assert(ndc_gb_xmin <= ndc_gb_xmax);
2377       *xmin = ndc_gb_xmin;
2378       *xmax = ndc_gb_xmax;
2379       *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2380       *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2381    } else {
2382       /* The viewport scales to 0, so nothing will be rendered. */
2383       *xmin = 0.0f;
2384       *xmax = 0.0f;
2385       *ymin = 0.0f;
2386       *ymax = 0.0f;
2387    }
2388 }
2389
2390 static void
2391 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2392 {
2393    struct gl_context *ctx = &brw->ctx;
2394    float y_scale, y_bias;
2395
2396    /* BRW_NEW_VIEWPORT_COUNT */
2397    const unsigned viewport_count = brw->clip.viewport_count;
2398
2399    /* _NEW_BUFFERS */
2400    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2401    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2402    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2403
2404 #if GEN_GEN >= 7
2405 #define clv sfv
2406    struct GENX(SF_CLIP_VIEWPORT) sfv;
2407    uint32_t sf_clip_vp_offset;
2408    uint32_t *sf_clip_map =
2409       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2410                       64, &sf_clip_vp_offset);
2411 #else
2412    struct GENX(SF_VIEWPORT) sfv;
2413    struct GENX(CLIP_VIEWPORT) clv;
2414    uint32_t sf_vp_offset, clip_vp_offset;
2415    uint32_t *sf_map =
2416       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2417                       32, &sf_vp_offset);
2418    uint32_t *clip_map =
2419       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2420                       32, &clip_vp_offset);
2421 #endif
2422
2423    /* _NEW_BUFFERS */
2424    if (render_to_fbo) {
2425       y_scale = 1.0;
2426       y_bias = 0;
2427    } else {
2428       y_scale = -1.0;
2429       y_bias = (float)fb_height;
2430    }
2431
2432    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2433       /* _NEW_VIEWPORT: Guardband Clipping */
2434       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2435       _mesa_get_viewport_xform(ctx, i, scale, translate);
2436
2437       sfv.ViewportMatrixElementm00 = scale[0];
2438       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2439       sfv.ViewportMatrixElementm22 = scale[2],
2440       sfv.ViewportMatrixElementm30 = translate[0],
2441       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2442       sfv.ViewportMatrixElementm32 = translate[2],
2443       brw_calculate_guardband_size(fb_width, fb_height,
2444                                    sfv.ViewportMatrixElementm00,
2445                                    sfv.ViewportMatrixElementm11,
2446                                    sfv.ViewportMatrixElementm30,
2447                                    sfv.ViewportMatrixElementm31,
2448                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2449
2450
2451       clv.XMinClipGuardband = gb_xmin;
2452       clv.XMaxClipGuardband = gb_xmax;
2453       clv.YMinClipGuardband = gb_ymin;
2454       clv.YMaxClipGuardband = gb_ymax;
2455
2456 #if GEN_GEN < 6
2457       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
2458                        &sfv.ScissorRectangle);
2459 #elif GEN_GEN >= 8
2460       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2461        * The hardware will take the intersection of the drawing rectangle,
2462        * scissor rectangle, and the viewport extents. We don't need to be
2463        * smart, and can therefore just program the viewport extents.
2464        */
2465       const float viewport_Xmax =
2466          ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
2467       const float viewport_Ymax =
2468          ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
2469
2470       if (render_to_fbo) {
2471          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2472          sfv.XMaxViewPort = viewport_Xmax - 1;
2473          sfv.YMinViewPort = ctx->ViewportArray[i].Y;
2474          sfv.YMaxViewPort = viewport_Ymax - 1;
2475       } else {
2476          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2477          sfv.XMaxViewPort = viewport_Xmax - 1;
2478          sfv.YMinViewPort = fb_height - viewport_Ymax;
2479          sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
2480       }
2481 #endif
2482
2483 #if GEN_GEN >= 7
2484       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2485       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2486 #else
2487       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2488       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2489       sf_map += GENX(SF_VIEWPORT_length);
2490       clip_map += GENX(CLIP_VIEWPORT_length);
2491 #endif
2492    }
2493
2494 #if GEN_GEN >= 7
2495    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2496       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2497    }
2498 #elif GEN_GEN == 6
2499    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2500       vp.SFViewportStateChange = 1;
2501       vp.CLIPViewportStateChange = 1;
2502       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2503       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2504    }
2505 #else
2506    brw->sf.vp_offset = sf_vp_offset;
2507    brw->clip.vp_offset = clip_vp_offset;
2508    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2509 #endif
2510 }
2511
2512 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2513    .dirty = {
2514       .mesa = _NEW_BUFFERS |
2515               _NEW_VIEWPORT |
2516               (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2517       .brw = BRW_NEW_BATCH |
2518              BRW_NEW_BLORP |
2519              BRW_NEW_VIEWPORT_COUNT,
2520    },
2521    .emit = genX(upload_sf_clip_viewport),
2522 };
2523
2524 /* ---------------------------------------------------------------------- */
2525
2526 static void
2527 genX(upload_gs_state)(struct brw_context *brw)
2528 {
2529    UNUSED struct gl_context *ctx = &brw->ctx;
2530    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
2531    const struct brw_stage_state *stage_state = &brw->gs.base;
2532    const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
2533    /* BRW_NEW_GEOMETRY_PROGRAM */
2534    bool active = GEN_GEN >= 6 && gs_prog;
2535
2536    /* BRW_NEW_GS_PROG_DATA */
2537    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2538    UNUSED const struct brw_vue_prog_data *vue_prog_data =
2539       brw_vue_prog_data(stage_prog_data);
2540 #if GEN_GEN >= 7
2541    const struct brw_gs_prog_data *gs_prog_data =
2542       brw_gs_prog_data(stage_prog_data);
2543 #endif
2544
2545 #if GEN_GEN == 6
2546    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2547       if (active && stage_state->push_const_size != 0) {
2548          cgs.Buffer0Valid = true;
2549          cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
2550          cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2551       }
2552    }
2553 #endif
2554
2555 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2556    /**
2557     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2558     * Geometry > Geometry Shader > State:
2559     *
2560     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2561     *     whole fixed function pipeline when the GS enable changes value in
2562     *     the 3DSTATE_GS."
2563     *
2564     * The hardware architects have clarified that in this context "flush the
2565     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2566     * Stall" bit set.
2567     */
2568    if (devinfo->gt == 2 && brw->gs.enabled != active)
2569       gen7_emit_cs_stall_flush(brw);
2570 #endif
2571
2572 #if GEN_GEN >= 6
2573    brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2574 #else
2575    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2576    brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
2577 #endif
2578
2579 #if GEN_GEN >= 6
2580       if (active) {
2581          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2582
2583 #if GEN_GEN >= 7
2584          gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2585          gs.OutputTopology = gs_prog_data->output_topology;
2586          gs.ControlDataHeaderSize =
2587             gs_prog_data->control_data_header_size_hwords;
2588
2589          gs.InstanceControl = gs_prog_data->invocations - 1;
2590          gs.DispatchMode = vue_prog_data->dispatch_mode;
2591
2592          gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2593
2594          gs.ControlDataFormat = gs_prog_data->control_data_format;
2595 #endif
2596
2597          /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2598           * Ivy Bridge and Haswell.
2599           *
2600           * On Ivy Bridge, setting this bit causes the vertices of a triangle
2601           * strip to be delivered to the geometry shader in an order that does
2602           * not strictly follow the OpenGL spec, but preserves triangle
2603           * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
2604           * the geometry shader sees triangles:
2605           *
2606           * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2607           *
2608           * (Clearing the bit is even worse, because it fails to preserve
2609           * orientation).
2610           *
2611           * Triangle strips with adjacency always ordered in a way that preserves
2612           * triangle orientation but does not strictly follow the OpenGL spec,
2613           * regardless of the setting of this bit.
2614           *
2615           * On Haswell, both triangle strips and triangle strips with adjacency
2616           * are always ordered in a way that preserves triangle orientation.
2617           * Setting this bit causes the ordering to strictly follow the OpenGL
2618           * spec.
2619           *
2620           * So in either case we want to set the bit.  Unfortunately on Ivy
2621           * Bridge this will get the order close to correct but not perfect.
2622           */
2623          gs.ReorderMode = TRAILING;
2624          gs.MaximumNumberofThreads =
2625             GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2626                          : (devinfo->max_gs_threads - 1);
2627
2628 #if GEN_GEN < 7
2629          gs.SOStatisticsEnable = true;
2630          if (gs_prog->info.has_transform_feedback_varyings)
2631             gs.SVBIPayloadEnable = true;
2632
2633          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
2634           * was previously done for gen6.
2635           *
2636           * TODO: test with both disabled to see if the HW is behaving
2637           * as expected, like in gen7.
2638           */
2639          gs.SingleProgramFlow = true;
2640          gs.VectorMaskEnable = true;
2641 #endif
2642
2643 #if GEN_GEN >= 8
2644          gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2645
2646          if (gs_prog_data->static_vertex_count != -1) {
2647             gs.StaticOutput = true;
2648             gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2649          }
2650          gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2651
2652          gs.UserClipDistanceCullTestEnableBitmask =
2653             vue_prog_data->cull_distance_mask;
2654
2655          const int urb_entry_write_offset = 1;
2656          const uint32_t urb_entry_output_length =
2657             DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2658             urb_entry_write_offset;
2659
2660          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2661          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2662 #endif
2663       }
2664 #endif
2665
2666 #if GEN_GEN <= 6
2667       if (!active && brw->ff_gs.prog_active) {
2668          /* In gen6, transform feedback for the VS stage is done with an
2669           * ad-hoc GS program. This function provides the needed 3DSTATE_GS
2670           * for this.
2671           */
2672          gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
2673          gs.SingleProgramFlow = true;
2674          gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
2675          gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
2676
2677 #if GEN_GEN <= 5
2678          gs.GRFRegisterCount =
2679             DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
2680          /* BRW_NEW_URB_FENCE */
2681          gs.NumberofURBEntries = brw->urb.nr_gs_entries;
2682          gs.URBEntryAllocationSize = brw->urb.vsize - 1;
2683          gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
2684          gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2685 #else
2686          gs.Enable = true;
2687          gs.VectorMaskEnable = true;
2688          gs.SVBIPayloadEnable = true;
2689          gs.SVBIPostIncrementEnable = true;
2690          gs.SVBIPostIncrementValue =
2691             brw->ff_gs.prog_data->svbi_postincrement_value;
2692          gs.SOStatisticsEnable = true;
2693          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2694 #endif
2695       }
2696 #endif
2697       if (!active && !brw->ff_gs.prog_active) {
2698 #if GEN_GEN < 8
2699          gs.DispatchGRFStartRegisterForURBData = 1;
2700 #if GEN_GEN >= 7
2701          gs.IncludeVertexHandles = true;
2702 #endif
2703 #endif
2704       }
2705
2706 #if GEN_GEN >= 6
2707       gs.StatisticsEnable = true;
2708 #endif
2709 #if GEN_GEN == 5 || GEN_GEN == 6
2710       gs.RenderingEnabled = true;
2711 #endif
2712 #if GEN_GEN <= 5
2713       gs.MaximumVPIndex = brw->clip.viewport_count - 1;
2714 #endif
2715    }
2716
2717 #if GEN_GEN == 6
2718    brw->gs.enabled = active;
2719 #endif
2720 }
2721
2722 static const struct brw_tracked_state genX(gs_state) = {
2723    .dirty = {
2724       .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2725       .brw   = BRW_NEW_BATCH |
2726                BRW_NEW_BLORP |
2727                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2728                                BRW_NEW_PROGRAM_CACHE |
2729                                BRW_NEW_URB_FENCE |
2730                                BRW_NEW_VIEWPORT_COUNT
2731                              : 0) |
2732                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
2733                                BRW_NEW_GEOMETRY_PROGRAM |
2734                                BRW_NEW_GS_PROG_DATA
2735                              : 0) |
2736                (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2737    },
2738    .emit = genX(upload_gs_state),
2739 };
2740
2741 /* ---------------------------------------------------------------------- */
2742
2743 UNUSED static GLenum
2744 fix_dual_blend_alpha_to_one(GLenum function)
2745 {
2746    switch (function) {
2747    case GL_SRC1_ALPHA:
2748       return GL_ONE;
2749
2750    case GL_ONE_MINUS_SRC1_ALPHA:
2751       return GL_ZERO;
2752    }
2753
2754    return function;
2755 }
2756
2757 #define blend_factor(x) brw_translate_blend_factor(x)
2758 #define blend_eqn(x) brw_translate_blend_equation(x)
2759
2760 /**
2761  * Modify blend function to force destination alpha to 1.0
2762  *
2763  * If \c function specifies a blend function that uses destination alpha,
2764  * replace it with a function that hard-wires destination alpha to 1.0.  This
2765  * is used when rendering to xRGB targets.
2766  */
2767 static GLenum
2768 brw_fix_xRGB_alpha(GLenum function)
2769 {
2770    switch (function) {
2771    case GL_DST_ALPHA:
2772       return GL_ONE;
2773
2774    case GL_ONE_MINUS_DST_ALPHA:
2775    case GL_SRC_ALPHA_SATURATE:
2776       return GL_ZERO;
2777    }
2778
2779    return function;
2780 }
2781
2782 #if GEN_GEN >= 6
2783 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
2784 #else
2785 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
2786 #endif
2787
2788 UNUSED static bool
2789 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
2790                      bool alpha_to_one)
2791 {
2792    struct gl_context *ctx = &brw->ctx;
2793
2794    /* _NEW_BUFFERS */
2795    const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
2796
2797    bool independent_alpha_blend = false;
2798
2799    /* Used for implementing the following bit of GL_EXT_texture_integer:
2800     * "Per-fragment operations that require floating-point color
2801     *  components, including multisample alpha operations, alpha test,
2802     *  blending, and dithering, have no effect when the corresponding
2803     *  colors are written to an integer color buffer."
2804     */
2805    const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
2806
2807    const unsigned blend_enabled = GEN_GEN >= 6 ?
2808       ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
2809
2810    /* _NEW_COLOR */
2811    if (ctx->Color.ColorLogicOpEnabled) {
2812       GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
2813          : GL_UNSIGNED_NORMALIZED;
2814       WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
2815                 rb_type != GL_UNSIGNED_NORMALIZED &&
2816                 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
2817                 "renderbuffer\n",
2818                 _mesa_enum_to_string(ctx->Color.LogicOp),
2819                 _mesa_enum_to_string(rb_type));
2820       if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
2821          entry->LogicOpEnable = true;
2822          entry->LogicOpFunction =
2823             intel_translate_logic_op(ctx->Color.LogicOp);
2824       }
2825    } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
2826               && (GEN_GEN <= 5 || !integer)) {
2827       GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
2828       GLenum eqA = ctx->Color.Blend[i].EquationA;
2829       GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
2830       GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
2831       GLenum srcA = ctx->Color.Blend[i].SrcA;
2832       GLenum dstA = ctx->Color.Blend[i].DstA;
2833
2834       if (eqRGB == GL_MIN || eqRGB == GL_MAX)
2835          srcRGB = dstRGB = GL_ONE;
2836
2837       if (eqA == GL_MIN || eqA == GL_MAX)
2838          srcA = dstA = GL_ONE;
2839
2840       /* Due to hardware limitations, the destination may have information
2841        * in an alpha channel even when the format specifies no alpha
2842        * channel. In order to avoid getting any incorrect blending due to
2843        * that alpha channel, coerce the blend factors to values that will
2844        * not read the alpha channel, but will instead use the correct
2845        * implicit value for alpha.
2846        */
2847       if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
2848                                                GL_TEXTURE_ALPHA_TYPE)) {
2849          srcRGB = brw_fix_xRGB_alpha(srcRGB);
2850          srcA = brw_fix_xRGB_alpha(srcA);
2851          dstRGB = brw_fix_xRGB_alpha(dstRGB);
2852          dstA = brw_fix_xRGB_alpha(dstA);
2853       }
2854
2855       /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
2856        * "If Dual Source Blending is enabled, this bit must be disabled."
2857        *
2858        * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
2859        * and leave it enabled anyway.
2860        */
2861       if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
2862          srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
2863          srcA = fix_dual_blend_alpha_to_one(srcA);
2864          dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
2865          dstA = fix_dual_blend_alpha_to_one(dstA);
2866       }
2867
2868       entry->ColorBufferBlendEnable = true;
2869       entry->DestinationBlendFactor = blend_factor(dstRGB);
2870       entry->SourceBlendFactor = blend_factor(srcRGB);
2871       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
2872       entry->SourceAlphaBlendFactor = blend_factor(srcA);
2873       entry->ColorBlendFunction = blend_eqn(eqRGB);
2874       entry->AlphaBlendFunction = blend_eqn(eqA);
2875
2876       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
2877          independent_alpha_blend = true;
2878    }
2879
2880    return independent_alpha_blend;
2881 }
2882
2883 #if GEN_GEN >= 6
2884 static void
2885 genX(upload_blend_state)(struct brw_context *brw)
2886 {
2887    struct gl_context *ctx = &brw->ctx;
2888    int size;
2889
2890    /* We need at least one BLEND_STATE written, because we might do
2891     * thread dispatch even if _NumColorDrawBuffers is 0 (for example
2892     * for computed depth or alpha test), which will do an FB write
2893     * with render target 0, which will reference BLEND_STATE[0] for
2894     * alpha test enable.
2895     */
2896    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
2897    if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
2898       nr_draw_buffers = 1;
2899
2900    size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
2901 #if GEN_GEN >= 8
2902    size += GENX(BLEND_STATE_length) * 4;
2903 #endif
2904
2905    uint32_t *blend_map;
2906    blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
2907
2908 #if GEN_GEN >= 8
2909    struct GENX(BLEND_STATE) blend = { 0 };
2910    {
2911 #else
2912    for (int i = 0; i < nr_draw_buffers; i++) {
2913       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2914 #define blend entry
2915 #endif
2916       /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
2917        * "If drawbuffer zero is not NONE and the buffer it references has an
2918        * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
2919        * operations are skipped."
2920        */
2921       if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
2922          /* _NEW_MULTISAMPLE */
2923          if (_mesa_is_multisample_enabled(ctx)) {
2924             if (ctx->Multisample.SampleAlphaToCoverage) {
2925                blend.AlphaToCoverageEnable = true;
2926                blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
2927             }
2928             if (ctx->Multisample.SampleAlphaToOne)
2929                blend.AlphaToOneEnable = true;
2930          }
2931
2932          /* _NEW_COLOR */
2933          if (ctx->Color.AlphaEnabled) {
2934             blend.AlphaTestEnable = true;
2935             blend.AlphaTestFunction =
2936                intel_translate_compare_func(ctx->Color.AlphaFunc);
2937          }
2938
2939          if (ctx->Color.DitherFlag) {
2940             blend.ColorDitherEnable = true;
2941          }
2942       }
2943
2944 #if GEN_GEN >= 8
2945       for (int i = 0; i < nr_draw_buffers; i++) {
2946          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2947 #else
2948       {
2949 #endif
2950          blend.IndependentAlphaBlendEnable =
2951             set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
2952             blend.IndependentAlphaBlendEnable;
2953
2954          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
2955           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
2956           *
2957           * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
2958           * clamping in the fragment shader.  For its clamping of
2959           * blending, the spec says:
2960           *
2961           *     "RESOLVED: For fixed-point color buffers, the inputs and
2962           *      the result of the blending equation are clamped.  For
2963           *      floating-point color buffers, no clamping occurs."
2964           *
2965           * So, generally, we want clamping to the render target's range.
2966           * And, good news, the hardware tables for both pre- and
2967           * post-blend color clamping are either ignored, or any are
2968           * allowed, or clamping is required but RT range clamping is a
2969           * valid option.
2970           */
2971          entry.PreBlendColorClampEnable = true;
2972          entry.PostBlendColorClampEnable = true;
2973          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
2974
2975          entry.WriteDisableRed   = !ctx->Color.ColorMask[i][0];
2976          entry.WriteDisableGreen = !ctx->Color.ColorMask[i][1];
2977          entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
2978          entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
2979
2980 #if GEN_GEN >= 8
2981          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
2982 #else
2983          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
2984 #endif
2985       }
2986    }
2987
2988 #if GEN_GEN >= 8
2989    GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
2990 #endif
2991
2992 #if GEN_GEN < 7
2993    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
2994       ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
2995       ptr.BLEND_STATEChange = true;
2996    }
2997 #else
2998    brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
2999       ptr.BlendStatePointer = brw->cc.blend_state_offset;
3000 #if GEN_GEN >= 8
3001       ptr.BlendStatePointerValid = true;
3002 #endif
3003    }
3004 #endif
3005 }
3006
3007 static const struct brw_tracked_state genX(blend_state) = {
3008    .dirty = {
3009       .mesa = _NEW_BUFFERS |
3010               _NEW_COLOR |
3011               _NEW_MULTISAMPLE,
3012       .brw = BRW_NEW_BATCH |
3013              BRW_NEW_BLORP |
3014              BRW_NEW_STATE_BASE_ADDRESS,
3015    },
3016    .emit = genX(upload_blend_state),
3017 };
3018 #endif
3019
3020 /* ---------------------------------------------------------------------- */
3021
3022 #if GEN_GEN >= 7
3023 UNUSED static const uint32_t push_constant_opcodes[] = {
3024    [MESA_SHADER_VERTEX]                      = 21,
3025    [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3026    [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3027    [MESA_SHADER_GEOMETRY]                    = 22,
3028    [MESA_SHADER_FRAGMENT]                    = 23,
3029    [MESA_SHADER_COMPUTE]                     = 0,
3030 };
3031
3032 static void
3033 genX(upload_push_constant_packets)(struct brw_context *brw)
3034 {
3035    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3036    struct gl_context *ctx = &brw->ctx;
3037
3038    UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
3039
3040    struct brw_stage_state *stage_states[] = {
3041       &brw->vs.base,
3042       &brw->tcs.base,
3043       &brw->tes.base,
3044       &brw->gs.base,
3045       &brw->wm.base,
3046    };
3047
3048    if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
3049        stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
3050       gen7_emit_vs_workaround_flush(brw);
3051
3052    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
3053       struct brw_stage_state *stage_state = stage_states[stage];
3054       UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
3055
3056       if (!stage_state->push_constants_dirty)
3057          continue;
3058
3059       brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
3060          pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
3061          if (stage_state->prog_data) {
3062 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3063             /* The Skylake PRM contains the following restriction:
3064              *
3065              *    "The driver must ensure The following case does not occur
3066              *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3067              *     buffer 3 read length equal to zero committed followed by a
3068              *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3069              *     zero committed."
3070              *
3071              * To avoid this, we program the buffers in the highest slots.
3072              * This way, slot 0 is only used if slot 3 is also used.
3073              */
3074             int n = 3;
3075
3076             for (int i = 3; i >= 0; i--) {
3077                const struct brw_ubo_range *range =
3078                   &stage_state->prog_data->ubo_ranges[i];
3079
3080                if (range->length == 0)
3081                   continue;
3082
3083                const struct gl_uniform_block *block =
3084                   prog->sh.UniformBlocks[range->block];
3085                const struct gl_buffer_binding *binding =
3086                   &ctx->UniformBufferBindings[block->Binding];
3087
3088                if (binding->BufferObject == ctx->Shared->NullBufferObj) {
3089                   static unsigned msg_id = 0;
3090                   _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
3091                                  MESA_DEBUG_TYPE_UNDEFINED,
3092                                  MESA_DEBUG_SEVERITY_HIGH,
3093                                  "UBO %d unbound, %s shader uniform data "
3094                                  "will be undefined.",
3095                                  range->block,
3096                                  _mesa_shader_stage_to_string(stage));
3097                   continue;
3098                }
3099
3100                assert(binding->Offset % 32 == 0);
3101
3102                struct brw_bo *bo = intel_bufferobj_buffer(brw,
3103                   intel_buffer_object(binding->BufferObject),
3104                   binding->Offset, range->length * 32, false);
3105
3106                pkt.ConstantBody.ReadLength[n] = range->length;
3107                pkt.ConstantBody.Buffer[n] =
3108                   ro_bo(bo, range->start * 32 + binding->Offset);
3109                n--;
3110             }
3111
3112             if (stage_state->push_const_size > 0) {
3113                assert(n >= 0);
3114                pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
3115                pkt.ConstantBody.Buffer[n] =
3116                   ro_bo(stage_state->push_const_bo,
3117                         stage_state->push_const_offset);
3118             }
3119 #else
3120             pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
3121             pkt.ConstantBody.Buffer[0].offset =
3122                stage_state->push_const_offset | mocs;
3123 #endif
3124          }
3125       }
3126
3127       stage_state->push_constants_dirty = false;
3128       brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
3129    }
3130 }
3131
3132 const struct brw_tracked_state genX(push_constant_packets) = {
3133    .dirty = {
3134       .mesa  = 0,
3135       .brw   = BRW_NEW_DRAW_CALL,
3136    },
3137    .emit = genX(upload_push_constant_packets),
3138 };
3139 #endif
3140
3141 #if GEN_GEN >= 6
3142 static void
3143 genX(upload_vs_push_constants)(struct brw_context *brw)
3144 {
3145    struct brw_stage_state *stage_state = &brw->vs.base;
3146
3147    /* BRW_NEW_VERTEX_PROGRAM */
3148    const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3149    /* BRW_NEW_VS_PROG_DATA */
3150    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3151
3152    gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3153 }
3154
3155 static const struct brw_tracked_state genX(vs_push_constants) = {
3156    .dirty = {
3157       .mesa  = _NEW_PROGRAM_CONSTANTS |
3158                _NEW_TRANSFORM,
3159       .brw   = BRW_NEW_BATCH |
3160                BRW_NEW_BLORP |
3161                BRW_NEW_VERTEX_PROGRAM |
3162                BRW_NEW_VS_PROG_DATA,
3163    },
3164    .emit = genX(upload_vs_push_constants),
3165 };
3166
3167 static void
3168 genX(upload_gs_push_constants)(struct brw_context *brw)
3169 {
3170    struct brw_stage_state *stage_state = &brw->gs.base;
3171
3172    /* BRW_NEW_GEOMETRY_PROGRAM */
3173    const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3174
3175    /* BRW_NEW_GS_PROG_DATA */
3176    struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3177
3178    gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3179 }
3180
3181 static const struct brw_tracked_state genX(gs_push_constants) = {
3182    .dirty = {
3183       .mesa  = _NEW_PROGRAM_CONSTANTS |
3184                _NEW_TRANSFORM,
3185       .brw   = BRW_NEW_BATCH |
3186                BRW_NEW_BLORP |
3187                BRW_NEW_GEOMETRY_PROGRAM |
3188                BRW_NEW_GS_PROG_DATA,
3189    },
3190    .emit = genX(upload_gs_push_constants),
3191 };
3192
3193 static void
3194 genX(upload_wm_push_constants)(struct brw_context *brw)
3195 {
3196    struct brw_stage_state *stage_state = &brw->wm.base;
3197    /* BRW_NEW_FRAGMENT_PROGRAM */
3198    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3199    /* BRW_NEW_FS_PROG_DATA */
3200    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3201
3202    gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3203 }
3204
3205 static const struct brw_tracked_state genX(wm_push_constants) = {
3206    .dirty = {
3207       .mesa  = _NEW_PROGRAM_CONSTANTS,
3208       .brw   = BRW_NEW_BATCH |
3209                BRW_NEW_BLORP |
3210                BRW_NEW_FRAGMENT_PROGRAM |
3211                BRW_NEW_FS_PROG_DATA,
3212    },
3213    .emit = genX(upload_wm_push_constants),
3214 };
3215 #endif
3216
3217 /* ---------------------------------------------------------------------- */
3218
3219 #if GEN_GEN >= 6
3220 static unsigned
3221 genX(determine_sample_mask)(struct brw_context *brw)
3222 {
3223    struct gl_context *ctx = &brw->ctx;
3224    float coverage = 1.0f;
3225    float coverage_invert = false;
3226    unsigned sample_mask = ~0u;
3227
3228    /* BRW_NEW_NUM_SAMPLES */
3229    unsigned num_samples = brw->num_samples;
3230
3231    if (_mesa_is_multisample_enabled(ctx)) {
3232       if (ctx->Multisample.SampleCoverage) {
3233          coverage = ctx->Multisample.SampleCoverageValue;
3234          coverage_invert = ctx->Multisample.SampleCoverageInvert;
3235       }
3236       if (ctx->Multisample.SampleMask) {
3237          sample_mask = ctx->Multisample.SampleMaskValue;
3238       }
3239    }
3240
3241    if (num_samples > 1) {
3242       int coverage_int = (int) (num_samples * coverage + 0.5f);
3243       uint32_t coverage_bits = (1 << coverage_int) - 1;
3244       if (coverage_invert)
3245          coverage_bits ^= (1 << num_samples) - 1;
3246       return coverage_bits & sample_mask;
3247    } else {
3248       return 1;
3249    }
3250 }
3251
3252 static void
3253 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3254                                 unsigned num_samples)
3255 {
3256    unsigned log2_samples = ffs(num_samples) - 1;
3257
3258    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3259       multi.PixelLocation = CENTER;
3260       multi.NumberofMultisamples = log2_samples;
3261 #if GEN_GEN == 6
3262       GEN_SAMPLE_POS_4X(multi.Sample);
3263 #elif GEN_GEN == 7
3264       switch (num_samples) {
3265       case 1:
3266          GEN_SAMPLE_POS_1X(multi.Sample);
3267          break;
3268       case 2:
3269          GEN_SAMPLE_POS_2X(multi.Sample);
3270          break;
3271       case 4:
3272          GEN_SAMPLE_POS_4X(multi.Sample);
3273          break;
3274       case 8:
3275          GEN_SAMPLE_POS_8X(multi.Sample);
3276          break;
3277       default:
3278          break;
3279       }
3280 #endif
3281    }
3282 }
3283
3284 static void
3285 genX(upload_multisample_state)(struct brw_context *brw)
3286 {
3287    assert(brw->num_samples > 0 && brw->num_samples <= 16);
3288
3289    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3290
3291    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3292       sm.SampleMask = genX(determine_sample_mask)(brw);
3293    }
3294 }
3295
3296 static const struct brw_tracked_state genX(multisample_state) = {
3297    .dirty = {
3298       .mesa = _NEW_MULTISAMPLE |
3299               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
3300       .brw = BRW_NEW_BLORP |
3301              BRW_NEW_CONTEXT |
3302              BRW_NEW_NUM_SAMPLES,
3303    },
3304    .emit = genX(upload_multisample_state)
3305 };
3306 #endif
3307
3308 /* ---------------------------------------------------------------------- */
3309
3310 static void
3311 genX(upload_color_calc_state)(struct brw_context *brw)
3312 {
3313    struct gl_context *ctx = &brw->ctx;
3314
3315    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3316 #if GEN_GEN <= 5
3317       cc.IndependentAlphaBlendEnable =
3318          set_blend_entry_bits(brw, &cc, 0, false);
3319       set_depth_stencil_bits(brw, &cc);
3320
3321       if (ctx->Color.AlphaEnabled &&
3322           ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3323          cc.AlphaTestEnable = true;
3324          cc.AlphaTestFunction =
3325             intel_translate_compare_func(ctx->Color.AlphaFunc);
3326       }
3327
3328       cc.ColorDitherEnable = ctx->Color.DitherFlag;
3329
3330       cc.StatisticsEnable = brw->stats_wm;
3331
3332       cc.CCViewportStatePointer =
3333          ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3334 #else
3335       /* _NEW_COLOR */
3336       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3337       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3338       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3339       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3340
3341 #if GEN_GEN < 9
3342       /* _NEW_STENCIL */
3343       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3344       cc.BackfaceStencilReferenceValue =
3345          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3346 #endif
3347
3348 #endif
3349
3350       /* _NEW_COLOR */
3351       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3352                                ctx->Color.AlphaRef);
3353    }
3354
3355 #if GEN_GEN >= 6
3356    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3357       ptr.ColorCalcStatePointer = brw->cc.state_offset;
3358 #if GEN_GEN != 7
3359       ptr.ColorCalcStatePointerValid = true;
3360 #endif
3361    }
3362 #else
3363    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3364 #endif
3365 }
3366
3367 static const struct brw_tracked_state genX(color_calc_state) = {
3368    .dirty = {
3369       .mesa = _NEW_COLOR |
3370               _NEW_STENCIL |
3371               (GEN_GEN <= 5 ? _NEW_BUFFERS |
3372                               _NEW_DEPTH
3373                             : 0),
3374       .brw = BRW_NEW_BATCH |
3375              BRW_NEW_BLORP |
3376              (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
3377                              BRW_NEW_STATS_WM
3378                            : BRW_NEW_CC_STATE |
3379                              BRW_NEW_STATE_BASE_ADDRESS),
3380    },
3381    .emit = genX(upload_color_calc_state),
3382 };
3383
3384
3385 /* ---------------------------------------------------------------------- */
3386
3387 #if GEN_GEN >= 7
3388 static void
3389 genX(upload_sbe)(struct brw_context *brw)
3390 {
3391    struct gl_context *ctx = &brw->ctx;
3392    /* BRW_NEW_FRAGMENT_PROGRAM */
3393    UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3394    /* BRW_NEW_FS_PROG_DATA */
3395    const struct brw_wm_prog_data *wm_prog_data =
3396       brw_wm_prog_data(brw->wm.base.prog_data);
3397 #if GEN_GEN >= 8
3398    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3399 #else
3400 #define attr_overrides sbe.Attribute
3401 #endif
3402    uint32_t urb_entry_read_length;
3403    uint32_t urb_entry_read_offset;
3404    uint32_t point_sprite_enables;
3405
3406    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3407       sbe.AttributeSwizzleEnable = true;
3408       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3409
3410       /* _NEW_BUFFERS */
3411       bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3412
3413       /* _NEW_POINT
3414        *
3415        * Window coordinates in an FBO are inverted, which means point
3416        * sprite origin must be inverted.
3417        */
3418       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
3419          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3420       else
3421          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3422
3423       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3424        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3425        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3426        * BRW_NEW_VUE_MAP_GEOM_OUT
3427        */
3428       genX(calculate_attr_overrides)(brw,
3429                                      attr_overrides,
3430                                      &point_sprite_enables,
3431                                      &urb_entry_read_length,
3432                                      &urb_entry_read_offset);
3433
3434       /* Typically, the URB entry read length and offset should be programmed
3435        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3436        * stage which produces geometry.  However, we don't know the proper
3437        * value until we call calculate_attr_overrides().
3438        *
3439        * To fit with our existing code, we override the inherited values and
3440        * specify it here directly, as we did on previous generations.
3441        */
3442       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3443       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3444       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3445       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3446
3447 #if GEN_GEN >= 8
3448       sbe.ForceVertexURBEntryReadLength = true;
3449       sbe.ForceVertexURBEntryReadOffset = true;
3450 #endif
3451
3452 #if GEN_GEN >= 9
3453       /* prepare the active component dwords */
3454       const int num_inputs = urb_entry_read_length * 2;
3455       for (int input_index = 0; input_index < num_inputs; input_index++) {
3456          sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
3457       }
3458 #endif
3459    }
3460
3461 #if GEN_GEN >= 8
3462    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3463       for (int i = 0; i < 16; i++)
3464          sbes.Attribute[i] = attr_overrides[i];
3465    }
3466 #endif
3467
3468 #undef attr_overrides
3469 }
3470
3471 static const struct brw_tracked_state genX(sbe_state) = {
3472    .dirty = {
3473       .mesa  = _NEW_BUFFERS |
3474                _NEW_LIGHT |
3475                _NEW_POINT |
3476                _NEW_POLYGON |
3477                _NEW_PROGRAM,
3478       .brw   = BRW_NEW_BLORP |
3479                BRW_NEW_CONTEXT |
3480                BRW_NEW_FRAGMENT_PROGRAM |
3481                BRW_NEW_FS_PROG_DATA |
3482                BRW_NEW_GS_PROG_DATA |
3483                BRW_NEW_TES_PROG_DATA |
3484                BRW_NEW_VUE_MAP_GEOM_OUT |
3485                (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3486                              : 0),
3487    },
3488    .emit = genX(upload_sbe),
3489 };
3490 #endif
3491
3492 /* ---------------------------------------------------------------------- */
3493
3494 #if GEN_GEN >= 7
3495 /**
3496  * Outputs the 3DSTATE_SO_DECL_LIST command.
3497  *
3498  * The data output is a series of 64-bit entries containing a SO_DECL per
3499  * stream.  We only have one stream of rendering coming out of the GS unit, so
3500  * we only emit stream 0 (low 16 bits) SO_DECLs.
3501  */
3502 static void
3503 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3504                                   const struct brw_vue_map *vue_map)
3505 {
3506    struct gl_context *ctx = &brw->ctx;
3507    /* BRW_NEW_TRANSFORM_FEEDBACK */
3508    struct gl_transform_feedback_object *xfb_obj =
3509       ctx->TransformFeedback.CurrentObject;
3510    const struct gl_transform_feedback_info *linked_xfb_info =
3511       xfb_obj->program->sh.LinkedTransformFeedback;
3512    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3513    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3514    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3515    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3516    int max_decls = 0;
3517    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3518
3519    memset(so_decl, 0, sizeof(so_decl));
3520
3521    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3522     * command feels strange -- each dword pair contains a SO_DECL per stream.
3523     */
3524    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3525       const struct gl_transform_feedback_output *output =
3526          &linked_xfb_info->Outputs[i];
3527       const int buffer = output->OutputBuffer;
3528       const int varying = output->OutputRegister;
3529       const unsigned stream_id = output->StreamId;
3530       assert(stream_id < MAX_VERTEX_STREAMS);
3531
3532       buffer_mask[stream_id] |= 1 << buffer;
3533
3534       assert(vue_map->varying_to_slot[varying] >= 0);
3535
3536       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3537        * array.  Instead, it simply increments DstOffset for the following
3538        * input by the number of components that should be skipped.
3539        *
3540        * Our hardware is unusual in that it requires us to program SO_DECLs
3541        * for fake "hole" components, rather than simply taking the offset
3542        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3543        * program as many size = 4 holes as we can, then a final hole to
3544        * accommodate the final 1, 2, or 3 remaining.
3545        */
3546       int skip_components = output->DstOffset - next_offset[buffer];
3547
3548       while (skip_components > 0) {
3549          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3550             .HoleFlag = 1,
3551             .OutputBufferSlot = output->OutputBuffer,
3552             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3553          };
3554          skip_components -= 4;
3555       }
3556
3557       next_offset[buffer] = output->DstOffset + output->NumComponents;
3558
3559       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3560          .OutputBufferSlot = output->OutputBuffer,
3561          .RegisterIndex = vue_map->varying_to_slot[varying],
3562          .ComponentMask =
3563             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3564       };
3565
3566       if (decls[stream_id] > max_decls)
3567          max_decls = decls[stream_id];
3568    }
3569
3570    uint32_t *dw;
3571    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3572                         .StreamtoBufferSelects0 = buffer_mask[0],
3573                         .StreamtoBufferSelects1 = buffer_mask[1],
3574                         .StreamtoBufferSelects2 = buffer_mask[2],
3575                         .StreamtoBufferSelects3 = buffer_mask[3],
3576                         .NumEntries0 = decls[0],
3577                         .NumEntries1 = decls[1],
3578                         .NumEntries2 = decls[2],
3579                         .NumEntries3 = decls[3]);
3580
3581    for (int i = 0; i < max_decls; i++) {
3582       GENX(SO_DECL_ENTRY_pack)(
3583          brw, dw + 2 + i * 2,
3584          &(struct GENX(SO_DECL_ENTRY)) {
3585             .Stream0Decl = so_decl[0][i],
3586             .Stream1Decl = so_decl[1][i],
3587             .Stream2Decl = so_decl[2][i],
3588             .Stream3Decl = so_decl[3][i],
3589          });
3590    }
3591 }
3592
3593 static void
3594 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3595 {
3596    struct gl_context *ctx = &brw->ctx;
3597    /* BRW_NEW_TRANSFORM_FEEDBACK */
3598    struct gl_transform_feedback_object *xfb_obj =
3599       ctx->TransformFeedback.CurrentObject;
3600 #if GEN_GEN < 8
3601    const struct gl_transform_feedback_info *linked_xfb_info =
3602       xfb_obj->program->sh.LinkedTransformFeedback;
3603 #else
3604    struct brw_transform_feedback_object *brw_obj =
3605       (struct brw_transform_feedback_object *) xfb_obj;
3606    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3607 #endif
3608
3609    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3610     * gl_transform_feedback_object.
3611     */
3612    for (int i = 0; i < 4; i++) {
3613       struct intel_buffer_object *bufferobj =
3614          intel_buffer_object(xfb_obj->Buffers[i]);
3615
3616       if (!bufferobj) {
3617          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3618             sob.SOBufferIndex = i;
3619          }
3620          continue;
3621       }
3622
3623       uint32_t start = xfb_obj->Offset[i];
3624       assert(start % 4 == 0);
3625       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3626       struct brw_bo *bo =
3627          intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
3628       assert(end <= bo->size);
3629
3630       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3631          sob.SOBufferIndex = i;
3632
3633          sob.SurfaceBaseAddress = rw_bo(bo, start);
3634 #if GEN_GEN < 8
3635          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3636          sob.SurfaceEndAddress = rw_bo(bo, end);
3637 #else
3638          sob.SOBufferEnable = true;
3639          sob.StreamOffsetWriteEnable = true;
3640          sob.StreamOutputBufferOffsetAddressEnable = true;
3641          sob.SOBufferMOCS = mocs_wb;
3642
3643          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3644          sob.StreamOutputBufferOffsetAddress =
3645             rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3646
3647          if (brw_obj->zero_offsets) {
3648             /* Zero out the offset and write that to offset_bo */
3649             sob.StreamOffset = 0;
3650          } else {
3651             /* Use offset_bo as the "Stream Offset." */
3652             sob.StreamOffset = 0xFFFFFFFF;
3653          }
3654 #endif
3655       }
3656    }
3657
3658 #if GEN_GEN >= 8
3659    brw_obj->zero_offsets = false;
3660 #endif
3661 }
3662
3663 static bool
3664 query_active(struct gl_query_object *q)
3665 {
3666    return q && q->Active;
3667 }
3668
3669 static void
3670 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3671                                const struct brw_vue_map *vue_map)
3672 {
3673    struct gl_context *ctx = &brw->ctx;
3674    /* BRW_NEW_TRANSFORM_FEEDBACK */
3675    struct gl_transform_feedback_object *xfb_obj =
3676       ctx->TransformFeedback.CurrentObject;
3677
3678    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3679       if (active) {
3680          int urb_entry_read_offset = 0;
3681          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3682             urb_entry_read_offset;
3683
3684          sos.SOFunctionEnable = true;
3685          sos.SOStatisticsEnable = true;
3686
3687          /* BRW_NEW_RASTERIZER_DISCARD */
3688          if (ctx->RasterDiscard) {
3689             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3690                sos.RenderingDisable = true;
3691             } else {
3692                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3693                           "query active relies on the clipper.\n");
3694             }
3695          }
3696
3697          /* _NEW_LIGHT */
3698          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3699             sos.ReorderMode = TRAILING;
3700
3701 #if GEN_GEN < 8
3702          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3703          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3704          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3705          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3706 #else
3707          const struct gl_transform_feedback_info *linked_xfb_info =
3708             xfb_obj->program->sh.LinkedTransformFeedback;
3709          /* Set buffer pitches; 0 means unbound. */
3710          if (xfb_obj->Buffers[0])
3711             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3712          if (xfb_obj->Buffers[1])
3713             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3714          if (xfb_obj->Buffers[2])
3715             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3716          if (xfb_obj->Buffers[3])
3717             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3718 #endif
3719
3720          /* We always read the whole vertex.  This could be reduced at some
3721           * point by reading less and offsetting the register index in the
3722           * SO_DECLs.
3723           */
3724          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3725          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3726          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3727          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3728          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3729          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3730          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3731          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3732       }
3733    }
3734 }
3735
3736 static void
3737 genX(upload_sol)(struct brw_context *brw)
3738 {
3739    struct gl_context *ctx = &brw->ctx;
3740    /* BRW_NEW_TRANSFORM_FEEDBACK */
3741    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3742
3743    if (active) {
3744       genX(upload_3dstate_so_buffers)(brw);
3745
3746       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3747       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3748    }
3749
3750    /* Finally, set up the SOL stage.  This command must always follow updates to
3751     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3752     * MMIO register updates (current performed by the kernel at each batch
3753     * emit).
3754     */
3755    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3756 }
3757
3758 static const struct brw_tracked_state genX(sol_state) = {
3759    .dirty = {
3760       .mesa  = _NEW_LIGHT,
3761       .brw   = BRW_NEW_BATCH |
3762                BRW_NEW_BLORP |
3763                BRW_NEW_RASTERIZER_DISCARD |
3764                BRW_NEW_VUE_MAP_GEOM_OUT |
3765                BRW_NEW_TRANSFORM_FEEDBACK,
3766    },
3767    .emit = genX(upload_sol),
3768 };
3769 #endif
3770
3771 /* ---------------------------------------------------------------------- */
3772
3773 #if GEN_GEN >= 7
3774 static void
3775 genX(upload_ps)(struct brw_context *brw)
3776 {
3777    UNUSED const struct gl_context *ctx = &brw->ctx;
3778    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3779
3780    /* BRW_NEW_FS_PROG_DATA */
3781    const struct brw_wm_prog_data *prog_data =
3782       brw_wm_prog_data(brw->wm.base.prog_data);
3783    const struct brw_stage_state *stage_state = &brw->wm.base;
3784
3785 #if GEN_GEN < 8
3786 #endif
3787
3788    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3789       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3790        * incorrect for subspans where some of the pixels are unlit.  We believe
3791        * the bit just didn't take effect in previous generations.
3792        */
3793       ps.VectorMaskEnable = GEN_GEN >= 8;
3794
3795       ps.SamplerCount =
3796          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3797
3798       /* BRW_NEW_FS_PROG_DATA */
3799       ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
3800
3801       if (prog_data->base.use_alt_mode)
3802          ps.FloatingPointMode = Alternate;
3803
3804       /* Haswell requires the sample mask to be set in this packet as well as
3805        * in 3DSTATE_SAMPLE_MASK; the values should match.
3806        */
3807
3808       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3809 #if GEN_IS_HASWELL
3810       ps.SampleMask = genX(determine_sample_mask(brw));
3811 #endif
3812
3813       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
3814        * it implicitly scales for different GT levels (which have some # of
3815        * PSDs).
3816        *
3817        * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
3818        */
3819 #if GEN_GEN >= 9
3820       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3821 #elif GEN_GEN >= 8
3822       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3823 #else
3824       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3825 #endif
3826
3827       if (prog_data->base.nr_params > 0 ||
3828           prog_data->base.ubo_ranges[0].length > 0)
3829          ps.PushConstantEnable = true;
3830
3831 #if GEN_GEN < 8
3832       /* From the IVB PRM, volume 2 part 1, page 287:
3833        * "This bit is inserted in the PS payload header and made available to
3834        * the DataPort (either via the message header or via header bypass) to
3835        * indicate that oMask data (one or two phases) is included in Render
3836        * Target Write messages. If present, the oMask data is used to mask off
3837        * samples."
3838        */
3839       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3840
3841       /* The hardware wedges if you have this bit set but don't turn on any
3842        * dual source blend factors.
3843        *
3844        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3845        */
3846       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3847                                  (ctx->Color.BlendEnabled & 1) &&
3848                                  ctx->Color.Blend[0]._UsesDualSrc;
3849
3850       /* BRW_NEW_FS_PROG_DATA */
3851       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3852 #endif
3853
3854       /* From the documentation for this packet:
3855        * "If the PS kernel does not need the Position XY Offsets to
3856        *  compute a Position Value, then this field should be programmed
3857        *  to POSOFFSET_NONE."
3858        *
3859        * "SW Recommendation: If the PS kernel needs the Position Offsets
3860        *  to compute a Position XY value, this field should match Position
3861        *  ZW Interpolation Mode to ensure a consistent position.xyzw
3862        *  computation."
3863        *
3864        * We only require XY sample offsets. So, this recommendation doesn't
3865        * look useful at the moment. We might need this in future.
3866        */
3867       if (prog_data->uses_pos_offset)
3868          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
3869       else
3870          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
3871
3872       ps._8PixelDispatchEnable = prog_data->dispatch_8;
3873       ps._16PixelDispatchEnable = prog_data->dispatch_16;
3874       ps.DispatchGRFStartRegisterForConstantSetupData0 =
3875          prog_data->base.dispatch_grf_start_reg;
3876       ps.DispatchGRFStartRegisterForConstantSetupData2 =
3877          prog_data->dispatch_grf_start_reg_2;
3878
3879       ps.KernelStartPointer0 = stage_state->prog_offset;
3880       ps.KernelStartPointer2 = stage_state->prog_offset +
3881          prog_data->prog_offset_2;
3882
3883       if (prog_data->base.total_scratch) {
3884          ps.ScratchSpaceBasePointer =
3885             rw_bo(stage_state->scratch_bo,
3886                   ffs(stage_state->per_thread_scratch) - 11);
3887       }
3888    }
3889 }
3890
3891 static const struct brw_tracked_state genX(ps_state) = {
3892    .dirty = {
3893       .mesa  = _NEW_MULTISAMPLE |
3894                (GEN_GEN < 8 ? _NEW_BUFFERS |
3895                               _NEW_COLOR
3896                             : 0),
3897       .brw   = BRW_NEW_BATCH |
3898                BRW_NEW_BLORP |
3899                BRW_NEW_FS_PROG_DATA,
3900    },
3901    .emit = genX(upload_ps),
3902 };
3903 #endif
3904
3905 /* ---------------------------------------------------------------------- */
3906
3907 #if GEN_GEN >= 7
/* Emits 3DSTATE_HS for the tessellation control stage.
 *
 * Without TCS prog_data the packet is emitted with no fields set beyond what
 * brw_batch_emit() initializes; otherwise the thread dispatch fields,
 * instance count and maximum thread count are filled in.
 */
static void
genX(upload_hs_state)(struct brw_context *brw)
{
   /* NOTE(review): INIT_THREAD_DISPATCH_FIELDS is defined outside this view
    * and presumably reads some of these locals by name -- confirm before
    * renaming or removing any of them.
    */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->tcs.base;
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);

   /* BRW_NEW_TCS_PROG_DATA */
   struct brw_tcs_prog_data *tcs_prog_data =
      brw_tcs_prog_data(stage_prog_data);

   if (!tcs_prog_data) {
      /* No TCS prog_data: emit the packet with an empty body. */
      brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
   } else {
      brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
         INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);

         /* Both counts are programmed as (value - 1). */
         hs.InstanceCount = tcs_prog_data->instances - 1;
         hs.IncludeVertexHandles = true;

         hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      }
   }
}
3934
3935 static const struct brw_tracked_state genX(hs_state) = {
3936    .dirty = {
3937       .mesa  = 0,
3938       .brw   = BRW_NEW_BATCH |
3939                BRW_NEW_BLORP |
3940                BRW_NEW_TCS_PROG_DATA |
3941                BRW_NEW_TESS_PROGRAMS,
3942    },
3943    .emit = genX(upload_hs_state),
3944 };
3945
/* Emits 3DSTATE_DS for the tessellation evaluation stage.
 *
 * Without TES prog_data the packet is emitted with no fields set beyond what
 * brw_batch_emit() initializes; otherwise the thread dispatch fields,
 * maximum thread count and W-coordinate enable are filled in, plus the
 * dispatch mode and cull-distance mask on Gen8+.
 */
static void
genX(upload_ds_state)(struct brw_context *brw)
{
   /* NOTE(review): INIT_THREAD_DISPATCH_FIELDS is defined outside this view
    * and presumably reads some of these locals by name -- confirm before
    * renaming or removing any of them.
    */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->tes.base;
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;

   /* BRW_NEW_TES_PROG_DATA */
   const struct brw_tes_prog_data *tes_prog_data =
      brw_tes_prog_data(stage_prog_data);
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);

   if (!tes_prog_data) {
      /* No TES prog_data: emit the packet with an empty body. */
      brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
   } else {
      brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
         INIT_THREAD_DISPATCH_FIELDS(ds, Patch);

        /* The thread count is programmed as (value - 1). */
        ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
        /* Only the triangle domain enables the W coordinate computation. */
        ds.ComputeWCoordinateEnable =
           tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

#if GEN_GEN >= 8
        /* A SIMD8 VUE program is dispatched in single-patch SIMD8 mode. */
        if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
           ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
        ds.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;
#endif
      }
   }
}
3978
3979 static const struct brw_tracked_state genX(ds_state) = {
3980    .dirty = {
3981       .mesa  = 0,
3982       .brw   = BRW_NEW_BATCH |
3983                BRW_NEW_BLORP |
3984                BRW_NEW_TESS_PROGRAMS |
3985                BRW_NEW_TES_PROG_DATA,
3986    },
3987    .emit = genX(upload_ds_state),
3988 };
3989
3990 /* ---------------------------------------------------------------------- */
3991
3992 static void
3993 upload_te_state(struct brw_context *brw)
3994 {
3995    /* BRW_NEW_TESS_PROGRAMS */
3996    bool active = brw->programs[MESA_SHADER_TESS_EVAL];
3997
3998    /* BRW_NEW_TES_PROG_DATA */
3999    const struct brw_tes_prog_data *tes_prog_data =
4000       brw_tes_prog_data(brw->tes.base.prog_data);
4001
4002    if (active) {
4003       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4004          te.Partitioning = tes_prog_data->partitioning;
4005          te.OutputTopology = tes_prog_data->output_topology;
4006          te.TEDomain = tes_prog_data->domain;
4007          te.TEEnable = true;
4008          te.MaximumTessellationFactorOdd = 63.0;
4009          te.MaximumTessellationFactorNotOdd = 64.0;
4010       }
4011    } else {
4012       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4013    }
4014 }
4015
4016 static const struct brw_tracked_state genX(te_state) = {
4017    .dirty = {
4018       .mesa  = 0,
4019       .brw   = BRW_NEW_BLORP |
4020                BRW_NEW_CONTEXT |
4021                BRW_NEW_TES_PROG_DATA |
4022                BRW_NEW_TESS_PROGRAMS,
4023    },
4024    .emit = upload_te_state,
4025 };
4026
4027 /* ---------------------------------------------------------------------- */
4028
4029 static void
4030 genX(upload_tes_push_constants)(struct brw_context *brw)
4031 {
4032    struct brw_stage_state *stage_state = &brw->tes.base;
4033    /* BRW_NEW_TESS_PROGRAMS */
4034    const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4035
4036    /* BRW_NEW_TES_PROG_DATA */
4037    const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4038    gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4039 }
4040
4041 static const struct brw_tracked_state genX(tes_push_constants) = {
4042    .dirty = {
4043       .mesa  = _NEW_PROGRAM_CONSTANTS,
4044       .brw   = BRW_NEW_BATCH |
4045                BRW_NEW_BLORP |
4046                BRW_NEW_TESS_PROGRAMS |
4047                BRW_NEW_TES_PROG_DATA,
4048    },
4049    .emit = genX(upload_tes_push_constants),
4050 };
4051
4052 static void
4053 genX(upload_tcs_push_constants)(struct brw_context *brw)
4054 {
4055    struct brw_stage_state *stage_state = &brw->tcs.base;
4056    /* BRW_NEW_TESS_PROGRAMS */
4057    const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4058
4059    /* BRW_NEW_TCS_PROG_DATA */
4060    const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4061
4062    gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4063 }
4064
4065 static const struct brw_tracked_state genX(tcs_push_constants) = {
4066    .dirty = {
4067       .mesa  = _NEW_PROGRAM_CONSTANTS,
4068       .brw   = BRW_NEW_BATCH |
4069                BRW_NEW_BLORP |
4070                BRW_NEW_DEFAULT_TESS_LEVELS |
4071                BRW_NEW_TESS_PROGRAMS |
4072                BRW_NEW_TCS_PROG_DATA,
4073    },
4074    .emit = genX(upload_tcs_push_constants),
4075 };
4076
4077 #endif
4078
4079 /* ---------------------------------------------------------------------- */
4080
4081 #if GEN_GEN >= 7
4082 static void
4083 genX(upload_cs_push_constants)(struct brw_context *brw)
4084 {
4085    struct brw_stage_state *stage_state = &brw->cs.base;
4086
4087    /* BRW_NEW_COMPUTE_PROGRAM */
4088    const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4089
4090    if (cp) {
4091       /* BRW_NEW_CS_PROG_DATA */
4092       struct brw_cs_prog_data *cs_prog_data =
4093          brw_cs_prog_data(brw->cs.base.prog_data);
4094
4095       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4096       brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4097    }
4098 }
4099
4100 const struct brw_tracked_state genX(cs_push_constants) = {
4101    .dirty = {
4102       .mesa = _NEW_PROGRAM_CONSTANTS,
4103       .brw = BRW_NEW_BATCH |
4104              BRW_NEW_BLORP |
4105              BRW_NEW_COMPUTE_PROGRAM |
4106              BRW_NEW_CS_PROG_DATA,
4107    },
4108    .emit = genX(upload_cs_push_constants),
4109 };
4110
4111 /**
4112  * Creates a new CS constant buffer reflecting the current CS program's
4113  * constants, if needed by the CS program.
4114  */
4115 static void
4116 genX(upload_cs_pull_constants)(struct brw_context *brw)
4117 {
4118    struct brw_stage_state *stage_state = &brw->cs.base;
4119
4120    /* BRW_NEW_COMPUTE_PROGRAM */
4121    struct brw_program *cp =
4122       (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4123
4124    /* BRW_NEW_CS_PROG_DATA */
4125    const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4126
4127    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4128    /* _NEW_PROGRAM_CONSTANTS */
4129    brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4130                              stage_state, prog_data);
4131 }
4132
4133 const struct brw_tracked_state genX(cs_pull_constants) = {
4134    .dirty = {
4135       .mesa = _NEW_PROGRAM_CONSTANTS,
4136       .brw = BRW_NEW_BATCH |
4137              BRW_NEW_BLORP |
4138              BRW_NEW_COMPUTE_PROGRAM |
4139              BRW_NEW_CS_PROG_DATA,
4140    },
4141    .emit = genX(upload_cs_pull_constants),
4142 };
4143
4144 static void
4145 genX(upload_cs_state)(struct brw_context *brw)
4146 {
4147    if (!brw->cs.base.prog_data)
4148       return;
4149
4150    uint32_t offset;
4151    uint32_t *desc = (uint32_t*) brw_state_batch(
4152       brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4153       &offset);
4154
4155    struct brw_stage_state *stage_state = &brw->cs.base;
4156    struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4157    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4158    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4159
4160    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4161       brw_emit_buffer_surface_state(
4162          brw, &stage_state->surf_offset[
4163                  prog_data->binding_table.shader_time_start],
4164          brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4165          brw->shader_time.bo->size, 1,
4166          RELOC_WRITE);
4167    }
4168
4169    uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4170                                     32, &stage_state->bind_bo_offset);
4171
4172    brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4173       if (prog_data->total_scratch) {
4174          uint32_t bo_offset;
4175
4176          if (GEN_GEN >= 8) {
4177             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4178              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4179              */
4180             bo_offset = ffs(stage_state->per_thread_scratch) - 11;
4181          } else if (GEN_IS_HASWELL) {
4182             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4183              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4184              */
4185             bo_offset = ffs(stage_state->per_thread_scratch) - 12;
4186          } else {
4187             /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4188              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4189              */
4190             bo_offset = stage_state->per_thread_scratch / 1024 - 1;
4191          }
4192          vfe.ScratchSpaceBasePointer =
4193             rw_bo(stage_state->scratch_bo, bo_offset);
4194       }
4195
4196       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
4197       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
4198       vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
4199       vfe.ResetGatewayTimer =
4200          Resettingrelativetimerandlatchingtheglobaltimestamp;
4201 #if GEN_GEN < 9
4202       vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4203 #endif
4204 #if GEN_GEN == 7
4205       vfe.GPGPUMode = 1;
4206 #endif
4207
4208       /* We are uploading duplicated copies of push constant uniforms for each
4209        * thread. Although the local id data needs to vary per thread, it won't
4210        * change for other uniform data. Unfortunately this duplication is
4211        * required for gen7. As of Haswell, this duplication can be avoided,
4212        * but this older mechanism with duplicated data continues to work.
4213        *
4214        * FINISHME: As of Haswell, we could make use of the
4215        * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4216        * field to only store one copy of uniform data.
4217        *
4218        * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4219        * which is described in the GPGPU_WALKER command and in the Broadwell
4220        * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4221        * Operations => GPGPU Mode => Indirect Payload Storage.
4222        *
4223        * Note: The constant data is built in brw_upload_cs_push_constants
4224        * below.
4225        */
4226       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
4227
4228       const uint32_t vfe_curbe_allocation =
4229          ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
4230                cs_prog_data->push.cross_thread.regs, 2);
4231       vfe.CURBEAllocationSize = vfe_curbe_allocation;
4232    }
4233
4234    if (cs_prog_data->push.total.size > 0) {
4235       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4236          curbe.CURBETotalDataLength =
4237             ALIGN(cs_prog_data->push.total.size, 64);
4238          curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4239       }
4240    }
4241
4242    /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4243    memcpy(bind, stage_state->surf_offset,
4244           prog_data->binding_table.size_bytes);
4245    const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4246       .KernelStartPointer = brw->cs.base.prog_offset,
4247       .SamplerStatePointer = stage_state->sampler_offset,
4248       .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4249       .BindingTablePointer = stage_state->bind_bo_offset,
4250       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4251       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
4252       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
4253                                                prog_data->total_shared),
4254       .BarrierEnable = cs_prog_data->uses_barrier,
4255 #if GEN_GEN >= 8 || GEN_IS_HASWELL
4256       .CrossThreadConstantDataReadLength =
4257          cs_prog_data->push.cross_thread.regs,
4258 #endif
4259    };
4260
4261    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4262
4263    brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4264       load.InterfaceDescriptorTotalLength =
4265          GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4266       load.InterfaceDescriptorDataStartAddress = offset;
4267    }
4268 }
4269
4270 static const struct brw_tracked_state genX(cs_state) = {
4271    .dirty = {
4272       .mesa = _NEW_PROGRAM_CONSTANTS,
4273       .brw = BRW_NEW_BATCH |
4274              BRW_NEW_BLORP |
4275              BRW_NEW_CS_PROG_DATA |
4276              BRW_NEW_SAMPLER_STATE_TABLE |
4277              BRW_NEW_SURFACES,
4278    },
4279    .emit = genX(upload_cs_state)
4280 };
4281
4282 #endif
4283
4284 /* ---------------------------------------------------------------------- */
4285
4286 #if GEN_GEN >= 8
4287 static void
4288 genX(upload_raster)(struct brw_context *brw)
4289 {
4290    const struct gl_context *ctx = &brw->ctx;
4291
4292    /* _NEW_BUFFERS */
4293    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
4294
4295    /* _NEW_POLYGON */
4296    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4297
4298    /* _NEW_POINT */
4299    const struct gl_point_attrib *point = &ctx->Point;
4300
4301    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4302       if (brw->polygon_front_bit == render_to_fbo)
4303          raster.FrontWinding = CounterClockwise;
4304
4305       if (polygon->CullFlag) {
4306          switch (polygon->CullFaceMode) {
4307          case GL_FRONT:
4308             raster.CullMode = CULLMODE_FRONT;
4309             break;
4310          case GL_BACK:
4311             raster.CullMode = CULLMODE_BACK;
4312             break;
4313          case GL_FRONT_AND_BACK:
4314             raster.CullMode = CULLMODE_BOTH;
4315             break;
4316          default:
4317             unreachable("not reached");
4318          }
4319       } else {
4320          raster.CullMode = CULLMODE_NONE;
4321       }
4322
4323       raster.SmoothPointEnable = point->SmoothFlag;
4324
4325       raster.DXMultisampleRasterizationEnable =
4326          _mesa_is_multisample_enabled(ctx);
4327
4328       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4329       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4330       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4331
4332       switch (polygon->FrontMode) {
4333       case GL_FILL:
4334          raster.FrontFaceFillMode = FILL_MODE_SOLID;
4335          break;
4336       case GL_LINE:
4337          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4338          break;
4339       case GL_POINT:
4340          raster.FrontFaceFillMode = FILL_MODE_POINT;
4341          break;
4342       default:
4343          unreachable("not reached");
4344       }
4345
4346       switch (polygon->BackMode) {
4347       case GL_FILL:
4348          raster.BackFaceFillMode = FILL_MODE_SOLID;
4349          break;
4350       case GL_LINE:
4351          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4352          break;
4353       case GL_POINT:
4354          raster.BackFaceFillMode = FILL_MODE_POINT;
4355          break;
4356       default:
4357          unreachable("not reached");
4358       }
4359
4360       /* _NEW_LINE */
4361       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4362
4363 #if GEN_GEN == 10
4364       /* _NEW_BUFFERS
4365        * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4366        */
4367       const bool multisampled_fbo =
4368          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4369       if (multisampled_fbo)
4370          raster.AntialiasingEnable = false;
4371 #endif
4372
4373       /* _NEW_SCISSOR */
4374       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4375
4376       /* _NEW_TRANSFORM */
4377       if (!ctx->Transform.DepthClamp) {
4378 #if GEN_GEN >= 9
4379          raster.ViewportZFarClipTestEnable = true;
4380          raster.ViewportZNearClipTestEnable = true;
4381 #else
4382          raster.ViewportZClipTestEnable = true;
4383 #endif
4384       }
4385
4386       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4387 #if GEN_GEN >= 9
4388       raster.ConservativeRasterizationEnable =
4389          ctx->IntelConservativeRasterization;
4390 #endif
4391
4392       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4393       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4394
4395       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4396    }
4397 }
4398
4399 static const struct brw_tracked_state genX(raster_state) = {
4400    .dirty = {
4401       .mesa  = _NEW_BUFFERS |
4402                _NEW_LINE |
4403                _NEW_MULTISAMPLE |
4404                _NEW_POINT |
4405                _NEW_POLYGON |
4406                _NEW_SCISSOR |
4407                _NEW_TRANSFORM,
4408       .brw   = BRW_NEW_BLORP |
4409                BRW_NEW_CONTEXT |
4410                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4411    },
4412    .emit = genX(upload_raster),
4413 };
4414 #endif
4415
4416 /* ---------------------------------------------------------------------- */
4417
4418 #if GEN_GEN >= 8
4419 static void
4420 genX(upload_ps_extra)(struct brw_context *brw)
4421 {
4422    UNUSED struct gl_context *ctx = &brw->ctx;
4423
4424    const struct brw_wm_prog_data *prog_data =
4425       brw_wm_prog_data(brw->wm.base.prog_data);
4426
4427    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4428       psx.PixelShaderValid = true;
4429       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4430       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4431       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4432       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4433       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4434       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4435
4436       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4437       if (prog_data->uses_sample_mask) {
4438 #if GEN_GEN >= 9
4439          if (prog_data->post_depth_coverage)
4440             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4441          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4442             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4443          else
4444             psx.InputCoverageMaskState = ICMS_NORMAL;
4445 #else
4446          psx.PixelShaderUsesInputCoverageMask = true;
4447 #endif
4448       }
4449
4450       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4451 #if GEN_GEN >= 9
4452       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4453       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4454 #endif
4455
4456       /* The stricter cross-primitive coherency guarantees that the hardware
4457        * gives us with the "Accesses UAV" bit set for at least one shader stage
4458        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4459        * are redundant within the current image, atomic counter and SSBO GL
4460        * APIs, which all have very loose ordering and coherency requirements
4461        * and generally rely on the application to insert explicit barriers when
4462        * a shader invocation is expected to see the memory writes performed by
4463        * the invocations of some previous primitive.  Regardless of the value
4464        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4465        * cause an in most cases useless DC flush when the lowermost stage with
4466        * the bit set finishes execution.
4467        *
4468        * It would be nice to disable it, but in some cases we can't because on
4469        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4470        * signal (which could be set independently from the coherency mechanism
4471        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4472        * determine whether the hardware skips execution of the fragment shader
4473        * or not via the ThreadDispatchEnable signal.  However if we know that
4474        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4475        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4476        * difference so we may just disable it here.
4477        *
4478        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4479        * take into account KillPixels when no depth or stencil writes are
4480        * enabled.  In order for occlusion queries to work correctly with no
4481        * attachments, we need to force-enable here.
4482        *
4483        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4484        * _NEW_COLOR
4485        */
4486       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4487           !brw_color_buffer_write_enabled(brw))
4488          psx.PixelShaderHasUAV = true;
4489    }
4490 }
4491
4492 const struct brw_tracked_state genX(ps_extra) = {
4493    .dirty = {
4494       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4495       .brw   = BRW_NEW_BLORP |
4496                BRW_NEW_CONTEXT |
4497                BRW_NEW_FRAGMENT_PROGRAM |
4498                BRW_NEW_FS_PROG_DATA |
4499                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4500    },
4501    .emit = genX(upload_ps_extra),
4502 };
4503 #endif
4504
4505 /* ---------------------------------------------------------------------- */
4506
4507 #if GEN_GEN >= 8
4508 static void
4509 genX(upload_ps_blend)(struct brw_context *brw)
4510 {
4511    struct gl_context *ctx = &brw->ctx;
4512
4513    /* _NEW_BUFFERS */
4514    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4515    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4516
4517    /* _NEW_COLOR */
4518    struct gl_colorbuffer_attrib *color = &ctx->Color;
4519
4520    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4521       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4522       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4523
4524       bool alpha_to_one = false;
4525
4526       if (!buffer0_is_integer) {
4527          /* _NEW_MULTISAMPLE */
4528
4529          if (_mesa_is_multisample_enabled(ctx)) {
4530             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4531             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4532          }
4533
4534          pb.AlphaTestEnable = color->AlphaEnabled;
4535       }
4536
4537       /* Used for implementing the following bit of GL_EXT_texture_integer:
4538        * "Per-fragment operations that require floating-point color
4539        *  components, including multisample alpha operations, alpha test,
4540        *  blending, and dithering, have no effect when the corresponding
4541        *  colors are written to an integer color buffer."
4542        *
4543        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4544        * "If drawbuffer zero is not NONE and the buffer it references has an
4545        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4546        *  operations are skipped."
4547        */
4548       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4549          GLenum eqRGB = color->Blend[0].EquationRGB;
4550          GLenum eqA = color->Blend[0].EquationA;
4551          GLenum srcRGB = color->Blend[0].SrcRGB;
4552          GLenum dstRGB = color->Blend[0].DstRGB;
4553          GLenum srcA = color->Blend[0].SrcA;
4554          GLenum dstA = color->Blend[0].DstA;
4555
4556          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4557             srcRGB = dstRGB = GL_ONE;
4558
4559          if (eqA == GL_MIN || eqA == GL_MAX)
4560             srcA = dstA = GL_ONE;
4561
4562          /* Due to hardware limitations, the destination may have information
4563           * in an alpha channel even when the format specifies no alpha
4564           * channel. In order to avoid getting any incorrect blending due to
4565           * that alpha channel, coerce the blend factors to values that will
4566           * not read the alpha channel, but will instead use the correct
4567           * implicit value for alpha.
4568           */
4569          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4570                                             GL_TEXTURE_ALPHA_TYPE)) {
4571             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4572             srcA = brw_fix_xRGB_alpha(srcA);
4573             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4574             dstA = brw_fix_xRGB_alpha(dstA);
4575          }
4576
4577          /* Alpha to One doesn't work with Dual Color Blending.  Override
4578           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4579           */
4580          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4581             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4582             srcA = fix_dual_blend_alpha_to_one(srcA);
4583             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4584             dstA = fix_dual_blend_alpha_to_one(dstA);
4585          }
4586
4587          pb.ColorBufferBlendEnable = true;
4588          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4589          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4590          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4591          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4592
4593          pb.IndependentAlphaBlendEnable =
4594             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4595       }
4596    }
4597 }
4598
4599 static const struct brw_tracked_state genX(ps_blend) = {
4600    .dirty = {
4601       .mesa = _NEW_BUFFERS |
4602               _NEW_COLOR |
4603               _NEW_MULTISAMPLE,
4604       .brw = BRW_NEW_BLORP |
4605              BRW_NEW_CONTEXT |
4606              BRW_NEW_FRAGMENT_PROGRAM,
4607    },
4608    .emit = genX(upload_ps_blend)
4609 };
4610 #endif
4611
4612 /* ---------------------------------------------------------------------- */
4613
4614 #if GEN_GEN >= 8
4615 static void
4616 genX(emit_vf_topology)(struct brw_context *brw)
4617 {
4618    brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4619       vftopo.PrimitiveTopologyType = brw->primitive;
4620    }
4621 }
4622
4623 static const struct brw_tracked_state genX(vf_topology) = {
4624    .dirty = {
4625       .mesa = 0,
4626       .brw = BRW_NEW_BLORP |
4627              BRW_NEW_PRIMITIVE,
4628    },
4629    .emit = genX(emit_vf_topology),
4630 };
4631 #endif
4632
4633 /* ---------------------------------------------------------------------- */
4634
4635 #if GEN_GEN >= 7
4636 static void
4637 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4638                                 struct brw_bo *bo,
4639                                 uint32_t offset_in_bytes,
4640                                 uint32_t report_id)
4641 {
4642    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4643       mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4644       mi_rpc.ReportID = report_id;
4645    }
4646 }
4647 #endif
4648
4649 /* ---------------------------------------------------------------------- */
4650
4651 /**
4652  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4653  */
4654 static void
4655 genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
4656                                      struct brw_stage_state *stage_state)
4657 {
4658 #if GEN_GEN >= 7
4659    static const uint16_t packet_headers[] = {
4660       [MESA_SHADER_VERTEX] = 43,
4661       [MESA_SHADER_TESS_CTRL] = 44,
4662       [MESA_SHADER_TESS_EVAL] = 45,
4663       [MESA_SHADER_GEOMETRY] = 46,
4664       [MESA_SHADER_FRAGMENT] = 47,
4665    };
4666
4667    /* Ivybridge requires a workaround flush before VS packets. */
4668    if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
4669        stage_state->stage == MESA_SHADER_VERTEX) {
4670       gen7_emit_vs_workaround_flush(brw);
4671    }
4672
4673    brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4674       ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4675       ptr.PointertoVSSamplerState = stage_state->sampler_offset;
4676    }
4677 #endif
4678 }
4679
4680 UNUSED static bool
4681 has_component(mesa_format format, int i)
4682 {
4683    if (_mesa_is_format_color_format(format))
4684       return _mesa_format_has_color_component(format, i);
4685
4686    /* depth and stencil have only one component */
4687    return i == 0;
4688 }
4689
4690 /**
4691  * Upload SAMPLER_BORDER_COLOR_STATE.
4692  */
4693 static void
4694 genX(upload_default_color)(struct brw_context *brw,
4695                            const struct gl_sampler_object *sampler,
4696                            mesa_format format, GLenum base_format,
4697                            bool is_integer_format, bool is_stencil_sampling,
4698                            uint32_t *sdc_offset)
4699 {
4700    union gl_color_union color;
4701
4702    switch (base_format) {
4703    case GL_DEPTH_COMPONENT:
4704       /* GL specs that border color for depth textures is taken from the
4705        * R channel, while the hardware uses A.  Spam R into all the
4706        * channels for safety.
4707        */
4708       color.ui[0] = sampler->BorderColor.ui[0];
4709       color.ui[1] = sampler->BorderColor.ui[0];
4710       color.ui[2] = sampler->BorderColor.ui[0];
4711       color.ui[3] = sampler->BorderColor.ui[0];
4712       break;
4713    case GL_ALPHA:
4714       color.ui[0] = 0u;
4715       color.ui[1] = 0u;
4716       color.ui[2] = 0u;
4717       color.ui[3] = sampler->BorderColor.ui[3];
4718       break;
4719    case GL_INTENSITY:
4720       color.ui[0] = sampler->BorderColor.ui[0];
4721       color.ui[1] = sampler->BorderColor.ui[0];
4722       color.ui[2] = sampler->BorderColor.ui[0];
4723       color.ui[3] = sampler->BorderColor.ui[0];
4724       break;
4725    case GL_LUMINANCE:
4726       color.ui[0] = sampler->BorderColor.ui[0];
4727       color.ui[1] = sampler->BorderColor.ui[0];
4728       color.ui[2] = sampler->BorderColor.ui[0];
4729       color.ui[3] = float_as_int(1.0);
4730       break;
4731    case GL_LUMINANCE_ALPHA:
4732       color.ui[0] = sampler->BorderColor.ui[0];
4733       color.ui[1] = sampler->BorderColor.ui[0];
4734       color.ui[2] = sampler->BorderColor.ui[0];
4735       color.ui[3] = sampler->BorderColor.ui[3];
4736       break;
4737    default:
4738       color.ui[0] = sampler->BorderColor.ui[0];
4739       color.ui[1] = sampler->BorderColor.ui[1];
4740       color.ui[2] = sampler->BorderColor.ui[2];
4741       color.ui[3] = sampler->BorderColor.ui[3];
4742       break;
4743    }
4744
4745    /* In some cases we use an RGBA surface format for GL RGB textures,
4746     * where we've initialized the A channel to 1.0.  We also have to set
4747     * the border color alpha to 1.0 in that case.
4748     */
4749    if (base_format == GL_RGB)
4750       color.ui[3] = float_as_int(1.0);
4751
4752    int alignment = 32;
4753    if (GEN_GEN >= 8) {
4754       alignment = 64;
4755    } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
4756       alignment = 512;
4757    }
4758
4759    uint32_t *sdc = brw_state_batch(
4760       brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
4761       alignment, sdc_offset);
4762
4763    struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
4764
4765 #define ASSIGN(dst, src) \
4766    do {                  \
4767       dst = src;         \
4768    } while (0)
4769
4770 #define ASSIGNu16(dst, src) \
4771    do {                     \
4772       dst = (uint16_t)src;  \
4773    } while (0)
4774
4775 #define ASSIGNu8(dst, src) \
4776    do {                    \
4777       dst = (uint8_t)src;  \
4778    } while (0)
4779
4780 #define BORDER_COLOR_ATTR(macro, _color_type, src)              \
4781    macro(state.BorderColor ## _color_type ## Red, src[0]);   \
4782    macro(state.BorderColor ## _color_type ## Green, src[1]);   \
4783    macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
4784    macro(state.BorderColor ## _color_type ## Alpha, src[3]);
4785
4786 #if GEN_GEN >= 8
4787    /* On Broadwell, the border color is represented as four 32-bit floats,
4788     * integers, or unsigned values, interpreted according to the surface
4789     * format.  This matches the sampler->BorderColor union exactly; just
4790     * memcpy the values.
4791     */
4792    BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
4793 #elif GEN_IS_HASWELL
4794    if (is_integer_format || is_stencil_sampling) {
4795       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
4796       const int bits_per_channel =
4797          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
4798
4799       /* From the Haswell PRM, "Command Reference: Structures", Page 36:
4800        * "If any color channel is missing from the surface format,
4801        *  corresponding border color should be programmed as zero and if
4802        *  alpha channel is missing, corresponding Alpha border color should
4803        *  be programmed as 1."
4804        */
4805       unsigned c[4] = { 0, 0, 0, 1 };
4806       for (int i = 0; i < 4; i++) {
4807          if (has_component(format, i))
4808             c[i] = color.ui[i];
4809       }
4810
4811       switch (bits_per_channel) {
4812       case 8:
4813          /* Copy RGBA in order. */
4814          BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
4815          break;
4816       case 10:
4817          /* R10G10B10A2_UINT is treated like a 16-bit format. */
4818       case 16:
4819          BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
4820          break;
4821       case 32:
4822          if (base_format == GL_RG) {
4823             /* Careful inspection of the tables reveals that for RG32 formats,
4824              * the green channel needs to go where blue normally belongs.
4825              */
4826             state.BorderColor32bitRed = c[0];
4827             state.BorderColor32bitBlue = c[1];
4828             state.BorderColor32bitAlpha = 1;
4829          } else {
4830             /* Copy RGBA in order. */
4831             BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
4832          }
4833          break;
4834       default:
4835          assert(!"Invalid number of bits per channel in integer format.");
4836          break;
4837       }
4838    } else {
4839       BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4840    }
4841 #elif GEN_GEN == 5 || GEN_GEN == 6
4842    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
4843    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
4844    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
4845
4846 #define MESA_FLOAT_TO_HALF(dst, src) \
4847    dst = _mesa_float_to_half(src);
4848
4849    BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
4850
4851 #undef MESA_FLOAT_TO_HALF
4852
4853    state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
4854    state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
4855    state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
4856    state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
4857
4858    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4859 #elif GEN_GEN == 4
4860    BORDER_COLOR_ATTR(ASSIGN, , color.f);
4861 #else
4862    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4863 #endif
4864
4865 #undef ASSIGN
4866 #undef BORDER_COLOR_ATTR
4867
4868    GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
4869 }
4870
4871 static uint32_t
4872 translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
4873 {
4874    switch (wrap) {
4875    case GL_REPEAT:
4876       return TCM_WRAP;
4877    case GL_CLAMP:
4878 #if GEN_GEN >= 8
4879       /* GL_CLAMP is the weird mode where coordinates are clamped to
4880        * [0.0, 1.0], so linear filtering of coordinates outside of
4881        * [0.0, 1.0] give you half edge texel value and half border
4882        * color.
4883        *
4884        * Gen8+ supports this natively.
4885        */
4886       return TCM_HALF_BORDER;
4887 #else
4888       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
4889        * and set clamp_border here, which gets the result desired.
4890        * We just use clamp(_to_edge) for nearest, because for nearest
4891        * clamping to 1.0 gives border color instead of the desired
4892        * edge texels.
4893        */
4894       if (using_nearest)
4895          return TCM_CLAMP;
4896       else
4897          return TCM_CLAMP_BORDER;
4898 #endif
4899    case GL_CLAMP_TO_EDGE:
4900       return TCM_CLAMP;
4901    case GL_CLAMP_TO_BORDER:
4902       return TCM_CLAMP_BORDER;
4903    case GL_MIRRORED_REPEAT:
4904       return TCM_MIRROR;
4905    case GL_MIRROR_CLAMP_TO_EDGE:
4906       return TCM_MIRROR_ONCE;
4907    default:
4908       return TCM_WRAP;
4909    }
4910 }
4911
4912 /**
4913  * Return true if the given wrap mode requires the border color to exist.
4914  */
4915 static bool
4916 wrap_mode_needs_border_color(unsigned wrap_mode)
4917 {
4918 #if GEN_GEN >= 8
4919    return wrap_mode == TCM_CLAMP_BORDER ||
4920           wrap_mode == TCM_HALF_BORDER;
4921 #else
4922    return wrap_mode == TCM_CLAMP_BORDER;
4923 #endif
4924 }
4925
4926 /**
4927  * Sets the sampler state for a single unit based off of the sampler key
4928  * entry.
4929  */
4930 static void
4931 genX(update_sampler_state)(struct brw_context *brw,
4932                            GLenum target, bool tex_cube_map_seamless,
4933                            GLfloat tex_unit_lod_bias,
4934                            mesa_format format, GLenum base_format,
4935                            const struct gl_texture_object *texObj,
4936                            const struct gl_sampler_object *sampler,
4937                            uint32_t *sampler_state,
4938                            uint32_t batch_offset_for_sampler_state)
4939 {
4940    struct GENX(SAMPLER_STATE) samp_st = { 0 };
4941
4942    /* Select min and mip filters. */
4943    switch (sampler->MinFilter) {
4944    case GL_NEAREST:
4945       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4946       samp_st.MipModeFilter = MIPFILTER_NONE;
4947       break;
4948    case GL_LINEAR:
4949       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4950       samp_st.MipModeFilter = MIPFILTER_NONE;
4951       break;
4952    case GL_NEAREST_MIPMAP_NEAREST:
4953       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4954       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4955       break;
4956    case GL_LINEAR_MIPMAP_NEAREST:
4957       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4958       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4959       break;
4960    case GL_NEAREST_MIPMAP_LINEAR:
4961       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4962       samp_st.MipModeFilter = MIPFILTER_LINEAR;
4963       break;
4964    case GL_LINEAR_MIPMAP_LINEAR:
4965       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4966       samp_st.MipModeFilter = MIPFILTER_LINEAR;
4967       break;
4968    default:
4969       unreachable("not reached");
4970    }
4971
4972    /* Select mag filter. */
4973    samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
4974       MAPFILTER_LINEAR : MAPFILTER_NEAREST;
4975
4976    /* Enable anisotropic filtering if desired. */
4977    samp_st.MaximumAnisotropy = RATIO21;
4978
4979    if (sampler->MaxAnisotropy > 1.0f) {
4980       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
4981          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
4982       if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
4983          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
4984
4985       if (sampler->MaxAnisotropy > 2.0f) {
4986          samp_st.MaximumAnisotropy =
4987             MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
4988       }
4989    }
4990
4991    /* Set address rounding bits if not using nearest filtering. */
4992    if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
4993       samp_st.UAddressMinFilterRoundingEnable = true;
4994       samp_st.VAddressMinFilterRoundingEnable = true;
4995       samp_st.RAddressMinFilterRoundingEnable = true;
4996    }
4997
4998    if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
4999       samp_st.UAddressMagFilterRoundingEnable = true;
5000       samp_st.VAddressMagFilterRoundingEnable = true;
5001       samp_st.RAddressMagFilterRoundingEnable = true;
5002    }
5003
5004    bool either_nearest =
5005       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5006    unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
5007    unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
5008    unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
5009
5010    if (target == GL_TEXTURE_CUBE_MAP ||
5011        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5012       /* Cube maps must use the same wrap mode for all three coordinate
5013        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5014        *
5015        * Ivybridge and Baytrail seem to have problems with CUBE mode and
5016        * integer formats.  Fall back to CLAMP for now.
5017        */
5018       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5019           !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5020          wrap_s = TCM_CUBE;
5021          wrap_t = TCM_CUBE;
5022          wrap_r = TCM_CUBE;
5023       } else {
5024          wrap_s = TCM_CLAMP;
5025          wrap_t = TCM_CLAMP;
5026          wrap_r = TCM_CLAMP;
5027       }
5028    } else if (target == GL_TEXTURE_1D) {
5029       /* There's a bug in 1D texture sampling - it actually pays
5030        * attention to the wrap_t value, though it should not.
5031        * Override the wrap_t value here to GL_REPEAT to keep
5032        * any nonexistent border pixels from floating in.
5033        */
5034       wrap_t = TCM_WRAP;
5035    }
5036
5037    samp_st.TCXAddressControlMode = wrap_s;
5038    samp_st.TCYAddressControlMode = wrap_t;
5039    samp_st.TCZAddressControlMode = wrap_r;
5040
5041    samp_st.ShadowFunction =
5042       sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5043       intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5044
5045 #if GEN_GEN >= 7
5046    /* Set shadow function. */
5047    samp_st.AnisotropicAlgorithm =
5048       samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5049       EWAApproximation : LEGACY;
5050 #endif
5051
5052 #if GEN_GEN >= 6
5053    samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5054 #endif
5055
5056    const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5057    samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5058    samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5059    samp_st.TextureLODBias =
5060       CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5061
5062 #if GEN_GEN == 6
5063    samp_st.BaseMipLevel =
5064       CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5065    samp_st.MinandMagStateNotEqual =
5066       samp_st.MinModeFilter != samp_st.MagModeFilter;
5067 #endif
5068
5069    /* Upload the border color if necessary.  If not, just point it at
5070     * offset 0 (the start of the batch) - the color should be ignored,
5071     * but that address won't fault in case something reads it anyway.
5072     */
5073    uint32_t border_color_offset = 0;
5074    if (wrap_mode_needs_border_color(wrap_s) ||
5075        wrap_mode_needs_border_color(wrap_t) ||
5076        wrap_mode_needs_border_color(wrap_r)) {
5077       genX(upload_default_color)(brw, sampler, format, base_format,
5078                                  texObj->_IsIntegerFormat,
5079                                  texObj->StencilSampling,
5080                                  &border_color_offset);
5081    }
5082 #if GEN_GEN < 6
5083       samp_st.BorderColorPointer =
5084          ro_bo(brw->batch.state.bo, border_color_offset);
5085 #else
5086       samp_st.BorderColorPointer = border_color_offset;
5087 #endif
5088
5089 #if GEN_GEN >= 8
5090    samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5091 #else
5092    samp_st.LODPreClampEnable = true;
5093 #endif
5094
5095    GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5096 }
5097
5098 static void
5099 update_sampler_state(struct brw_context *brw,
5100                      int unit,
5101                      uint32_t *sampler_state,
5102                      uint32_t batch_offset_for_sampler_state)
5103 {
5104    struct gl_context *ctx = &brw->ctx;
5105    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5106    const struct gl_texture_object *texObj = texUnit->_Current;
5107    const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5108
5109    /* These don't use samplers at all. */
5110    if (texObj->Target == GL_TEXTURE_BUFFER)
5111       return;
5112
5113    struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5114    genX(update_sampler_state)(brw, texObj->Target,
5115                               ctx->Texture.CubeMapSeamless,
5116                               texUnit->LodBias,
5117                               firstImage->TexFormat, firstImage->_BaseFormat,
5118                               texObj, sampler,
5119                               sampler_state, batch_offset_for_sampler_state);
5120 }
5121
5122 static void
5123 genX(upload_sampler_state_table)(struct brw_context *brw,
5124                                  struct gl_program *prog,
5125                                  struct brw_stage_state *stage_state)
5126 {
5127    struct gl_context *ctx = &brw->ctx;
5128    uint32_t sampler_count = stage_state->sampler_count;
5129
5130    GLbitfield SamplersUsed = prog->SamplersUsed;
5131
5132    if (sampler_count == 0)
5133       return;
5134
5135    /* SAMPLER_STATE is 4 DWords on all platforms. */
5136    const int dwords = GENX(SAMPLER_STATE_length);
5137    const int size_in_bytes = dwords * sizeof(uint32_t);
5138
5139    uint32_t *sampler_state = brw_state_batch(brw,
5140                                              sampler_count * size_in_bytes,
5141                                              32, &stage_state->sampler_offset);
5142    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5143
5144    uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
5145
5146    for (unsigned s = 0; s < sampler_count; s++) {
5147       if (SamplersUsed & (1 << s)) {
5148          const unsigned unit = prog->SamplerUnits[s];
5149          if (ctx->Texture.Unit[unit]._Current) {
5150             update_sampler_state(brw, unit, sampler_state,
5151                                  batch_offset_for_sampler_state);
5152          }
5153       }
5154
5155       sampler_state += dwords;
5156       batch_offset_for_sampler_state += size_in_bytes;
5157    }
5158
5159    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5160       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5161       genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5162    } else {
5163       /* Flag that the sampler state table pointer has changed; later atoms
5164        * will handle it.
5165        */
5166       brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5167    }
5168 }
5169
5170 static void
5171 genX(upload_fs_samplers)(struct brw_context *brw)
5172 {
5173    /* BRW_NEW_FRAGMENT_PROGRAM */
5174    struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5175    genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5176 }
5177
5178 static const struct brw_tracked_state genX(fs_samplers) = {
5179    .dirty = {
5180       .mesa = _NEW_TEXTURE,
5181       .brw = BRW_NEW_BATCH |
5182              BRW_NEW_BLORP |
5183              BRW_NEW_FRAGMENT_PROGRAM,
5184    },
5185    .emit = genX(upload_fs_samplers),
5186 };
5187
5188 static void
5189 genX(upload_vs_samplers)(struct brw_context *brw)
5190 {
5191    /* BRW_NEW_VERTEX_PROGRAM */
5192    struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5193    genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5194 }
5195
5196 static const struct brw_tracked_state genX(vs_samplers) = {
5197    .dirty = {
5198       .mesa = _NEW_TEXTURE,
5199       .brw = BRW_NEW_BATCH |
5200              BRW_NEW_BLORP |
5201              BRW_NEW_VERTEX_PROGRAM,
5202    },
5203    .emit = genX(upload_vs_samplers),
5204 };
5205
5206 #if GEN_GEN >= 6
5207 static void
5208 genX(upload_gs_samplers)(struct brw_context *brw)
5209 {
5210    /* BRW_NEW_GEOMETRY_PROGRAM */
5211    struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
5212    if (!gs)
5213       return;
5214
5215    genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
5216 }
5217
5218
5219 static const struct brw_tracked_state genX(gs_samplers) = {
5220    .dirty = {
5221       .mesa = _NEW_TEXTURE,
5222       .brw = BRW_NEW_BATCH |
5223              BRW_NEW_BLORP |
5224              BRW_NEW_GEOMETRY_PROGRAM,
5225    },
5226    .emit = genX(upload_gs_samplers),
5227 };
5228 #endif
5229
5230 #if GEN_GEN >= 7
5231 static void
5232 genX(upload_tcs_samplers)(struct brw_context *brw)
5233 {
5234    /* BRW_NEW_TESS_PROGRAMS */
5235    struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
5236    if (!tcs)
5237       return;
5238
5239    genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
5240 }
5241
5242 static const struct brw_tracked_state genX(tcs_samplers) = {
5243    .dirty = {
5244       .mesa = _NEW_TEXTURE,
5245       .brw = BRW_NEW_BATCH |
5246              BRW_NEW_BLORP |
5247              BRW_NEW_TESS_PROGRAMS,
5248    },
5249    .emit = genX(upload_tcs_samplers),
5250 };
5251 #endif
5252
5253 #if GEN_GEN >= 7
5254 static void
5255 genX(upload_tes_samplers)(struct brw_context *brw)
5256 {
5257    /* BRW_NEW_TESS_PROGRAMS */
5258    struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
5259    if (!tes)
5260       return;
5261
5262    genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
5263 }
5264
5265 static const struct brw_tracked_state genX(tes_samplers) = {
5266    .dirty = {
5267       .mesa = _NEW_TEXTURE,
5268       .brw = BRW_NEW_BATCH |
5269              BRW_NEW_BLORP |
5270              BRW_NEW_TESS_PROGRAMS,
5271    },
5272    .emit = genX(upload_tes_samplers),
5273 };
5274 #endif
5275
5276 #if GEN_GEN >= 7
5277 static void
5278 genX(upload_cs_samplers)(struct brw_context *brw)
5279 {
5280    /* BRW_NEW_COMPUTE_PROGRAM */
5281    struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
5282    if (!cs)
5283       return;
5284
5285    genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
5286 }
5287
5288 const struct brw_tracked_state genX(cs_samplers) = {
5289    .dirty = {
5290       .mesa = _NEW_TEXTURE,
5291       .brw = BRW_NEW_BATCH |
5292              BRW_NEW_BLORP |
5293              BRW_NEW_COMPUTE_PROGRAM,
5294    },
5295    .emit = genX(upload_cs_samplers),
5296 };
5297 #endif
5298
5299 /* ---------------------------------------------------------------------- */
5300
5301 #if GEN_GEN <= 5
5302
5303 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5304 {
5305    struct gl_context *ctx = &brw->ctx;
5306
5307    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5308       blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5309       blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5310       blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5311       blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5312    }
5313 }
5314
5315 static const struct brw_tracked_state genX(blend_constant_color) = {
5316    .dirty = {
5317       .mesa = _NEW_COLOR,
5318       .brw = BRW_NEW_CONTEXT |
5319              BRW_NEW_BLORP,
5320    },
5321    .emit = genX(upload_blend_constant_color)
5322 };
5323 #endif
5324
5325 /* ---------------------------------------------------------------------- */
5326
/**
 * Install this hardware generation's tracked-state atom lists into \p brw.
 *
 * Exactly one render_atoms[] array is compiled in, selected by GEN_GEN, and
 * copied with brw_copy_pipeline_atoms() for BRW_RENDER_PIPELINE.  On Gen7+
 * a compute_atoms[] array is copied for BRW_COMPUTE_PIPELINE as well, and
 * the emit_mi_report_perf_count vtbl hook is set.
 *
 * The position of entries within each array is significant; the inline
 * comments record the ordering constraints known for individual atoms.
 */
void
genX(init_atoms)(struct brw_context *brw)
{
#if GEN_GEN < 6
   /* Render atoms for Gen4 and Gen5. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Once all the programs are done, we know how large urb entry
       * sizes need to be and can decide if we need to change the urb
       * layout.
       */
      &brw_curbe_offsets,
      &brw_recalculate_urb_fence,

      &genX(cc_vp),
      &genX(color_calc_state),

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_wm_pull_constants,
      &brw_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &brw_vs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),

      /* These set up state for brw_psp_urb_cbs */
      &genX(wm_state),
      &genX(sf_clip_viewport),
      &genX(sf_state),
      &genX(vs_state), /* always required, enabled or not */
      &genX(clip_state),
      &genX(gs_state),

      /* Command packets:
       */
      &brw_binding_table_pointers,
      &genX(blend_constant_color),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &brw_psp_urb_cbs,

      &genX(drawing_rect),
      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

      &brw_constant_buffer
   };
#elif GEN_GEN == 6
   /* Render atoms for Gen6. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(sf_clip_viewport),

      /* Command packets: */

      &genX(cc_vp),

      &gen6_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_state */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &gen6_sol_surface,
      &brw_vs_binding_table,
      &gen6_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(gs_samplers),
      &gen6_sampler_state,
      &genX(multisample_state),

      &genX(vs_state),
      &genX(gs_state),
      &genX(clip_state),
      &genX(sf_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gen6_binding_table_pointers,

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),
   };
#elif GEN_GEN == 7
   /* Render atoms for Gen7; cut_index is appended only on Haswell. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Command packets: */

      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(wm_state),
      &genX(ps_state),

      &genX(scissor_state),

      &gen7_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

#if GEN_IS_HASWELL
      &genX(cut_index),
#endif
   };
#elif GEN_GEN >= 8
   /* Render atoms for Gen8 and later. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),
      &genX(color_calc_state),

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(raster_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(ps_blend),
      &genX(ps_extra),
      &genX(ps_state),
      &genX(depth_stencil_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gen7_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &genX(vf_topology),

      &brw_indices,
      &genX(index_buffer),
      &genX(vertices),

      &genX(cut_index),
      &gen8_pma_fix,
   };
#endif

   /* Compile-time check that the list fits in the context's atom array. */
   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
                           render_atoms, ARRAY_SIZE(render_atoms));

#if GEN_GEN >= 7
   /* Compute pipeline atoms; only built for Gen7 and later. */
   static const struct brw_tracked_state *compute_atoms[] =
   {
      &gen7_l3_state,
      &brw_cs_image_surfaces,
      &genX(cs_push_constants),
      &genX(cs_pull_constants),
      &brw_cs_ubo_surfaces,
      &brw_cs_texture_surfaces,
      &brw_cs_work_groups_surface,
      &genX(cs_samplers),
      &genX(cs_state),
   };

   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
                           compute_atoms, ARRAY_SIZE(compute_atoms));

   /* Hook up this generation's MI_REPORT_PERF_COUNT emitter. */
   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
#endif
}