src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "dev/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #include "brw_draw.h"
  38 #include "brw_multisample_state.h"
  39 #include "brw_state.h"
  40 #include "brw_wm.h"
  41 #include "brw_util.h"
  42
  43 #include "intel_batchbuffer.h"
  44 #include "intel_buffer_objects.h"
  45 #include "intel_fbo.h"
  46
  47 #include "main/enums.h"
  48 #include "main/fbobject.h"
  49 #include "main/framebuffer.h"
  50 #include "main/glformats.h"
  51 #include "main/samplerobj.h"
  52 #include "main/shaderapi.h"
  53 #include "main/stencil.h"
  54 #include "main/transformfeedback.h"
  55 #include "main/varray.h"
  56 #include "main/viewport.h"
  57 #include "util/half_float.h"
  58
  59 UNUSED static void *
  60 emit_dwords(struct brw_context *brw, unsigned n)
  61 {
  62    intel_batchbuffer_begin(brw, n);
  63    uint32_t *map = brw->batch.map_next;
  64    brw->batch.map_next += n;
  65    intel_batchbuffer_advance(brw);
  66    return map;
  67 }
  68
  69 struct brw_address {
  70    struct brw_bo *bo;
  71    unsigned reloc_flags;
  72    uint32_t offset;
  73 };
  74
  75 #define __gen_address_type struct brw_address
  76 #define __gen_user_data struct brw_context
  77
  78 static uint64_t
  79 __gen_combine_address(struct brw_context *brw, void *location,
  80                       struct brw_address address, uint32_t delta)
  81 {
  82    struct intel_batchbuffer *batch = &brw->batch;
  83    uint32_t offset;
  84
  85    if (address.bo == NULL) {
  86       return address.offset + delta;
  87    } else {
  88       if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
  89          offset = (char *) location - (char *) brw->batch.state.map;
  90          return brw_state_reloc(batch, offset, address.bo,
  91                                 address.offset + delta,
  92                                 address.reloc_flags);
  93       }
  94
  95       assert(!brw_ptr_in_state_buffer(batch, location));
  96
  97       offset = (char *) location - (char *) brw->batch.batch.map;
  98       return brw_batch_reloc(batch, offset, address.bo,
  99                              address.offset + delta,
 100                              address.reloc_flags);
 101    }
 102 }
 103
 104 UNUSED static struct brw_address
 105 rw_bo(struct brw_bo *bo, uint32_t offset)
 106 {
 107    return (struct brw_address) {
 108             .bo = bo,
 109             .offset = offset,
 110             .reloc_flags = RELOC_WRITE,
 111    };
 112 }
 113
 114 static struct brw_address
 115 ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120    };
 121 }
 122
 123 static struct brw_address
 124 rw_32_bo(struct brw_bo *bo, uint32_t offset)
 125 {
 126    return (struct brw_address) {
 127             .bo = bo,
 128             .offset = offset,
 129             .reloc_flags = RELOC_WRITE | RELOC_32BIT,
 130    };
 131 }
 132
 133 static struct brw_address
 134 ro_32_bo(struct brw_bo *bo, uint32_t offset)
 135 {
 136    return (struct brw_address) {
 137             .bo = bo,
 138             .offset = offset,
 139             .reloc_flags = RELOC_32BIT,
 140    };
 141 }
 142
 143 UNUSED static struct brw_address
 144 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 145 {
 146    return (struct brw_address) {
 147             .bo = bo,
 148             .offset = offset,
 149             .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
 150    };
 151 }
 152
 153 #if GEN_GEN == 4
 154 static struct brw_address
 155 KSP(struct brw_context *brw, uint32_t offset)
 156 {
 157    return ro_bo(brw->cache.bo, offset);
 158 }
 159 #else
 160 static uint32_t
 161 KSP(UNUSED struct brw_context *brw, uint32_t offset)
 162 {
 163    return offset;
 164 }
 165 #endif
 166
 167 #include "genxml/genX_pack.h"
 168
 169 #define _brw_cmd_length(cmd) cmd ## _length
 170 #define _brw_cmd_length_bias(cmd) cmd ## _length_bias
 171 #define _brw_cmd_header(cmd) cmd ## _header
 172 #define _brw_cmd_pack(cmd) cmd ## _pack
 173
 174 #define brw_batch_emit(brw, cmd, name)                  \
 175    for (struct cmd name = { _brw_cmd_header(cmd) },     \
 176         *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
 177         __builtin_expect(_dst != NULL, 1);              \
 178         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
 179         _dst = NULL)
 180
 181 #define brw_batch_emitn(brw, cmd, n, ...) ({           \
 182       uint32_t *_dw = emit_dwords(brw, n);             \
 183       struct cmd template = {                          \
 184          _brw_cmd_header(cmd),                         \
 185          .DWordLength = n - _brw_cmd_length_bias(cmd), \
 186          __VA_ARGS__                                   \
 187       };                                               \
 188       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
 189       _dw + 1; /* Array starts at dw[1] */             \
 190    })
 191
 192 #define brw_state_emit(brw, cmd, align, offset, name)              \
 193    for (struct cmd name = {},                                      \
 194         *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
 195                                 align, offset);                    \
 196         __builtin_expect(_dst != NULL, 1);                         \
 197         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
 198         _dst = NULL)
 199
 200 #if GEN_GEN >= 7
 201 MAYBE_UNUSED static void
 202 emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
 203 {
 204    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
 205       lrm.RegisterAddress  = reg;
 206       lrm.MemoryAddress    = addr;
 207    }
 208 }
 209 #endif
 210
 211 MAYBE_UNUSED static void
 212 emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
 213 {
 214    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
 215       lri.RegisterOffset   = reg;
 216       lri.DataDWord        = imm;
 217    }
 218 }
 219
 220 #if GEN_IS_HASWELL || GEN_GEN >= 8
 221 MAYBE_UNUSED static void
 222 emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src)
 223 {
 224    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) {
 225       lrr.SourceRegisterAddress        = src;
 226       lrr.DestinationRegisterAddress   = dst;
 227    }
 228 }
 229 #endif
 230
 231 /**
 232  * Polygon stipple packet
 233  */
 234 static void
 235 genX(upload_polygon_stipple)(struct brw_context *brw)
 236 {
 237    struct gl_context *ctx = &brw->ctx;
 238
 239    /* _NEW_POLYGON */
 240    if (!ctx->Polygon.StippleFlag)
 241       return;
 242
 243    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 244       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 245        * row first.  If we're rendering to a window (i.e. the
 246        * default frame buffer object, 0), then we need to invert
 247        * it to match our pixel layout.  But if we're rendering
 248        * to a FBO (i.e. any named frame buffer object), we *don't*
 249        * need to invert - we already match the layout.
 250        */
 251       if (ctx->DrawBuffer->FlipY) {
 252          for (unsigned i = 0; i < 32; i++)
 253             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 254       } else {
 255          for (unsigned i = 0; i < 32; i++)
 256             poly.PatternRow[i] = ctx->PolygonStipple[i];
 257       }
 258    }
 259 }
 260
 261 static const struct brw_tracked_state genX(polygon_stipple) = {
 262    .dirty = {
 263       .mesa = _NEW_POLYGON |
 264               _NEW_POLYGONSTIPPLE,
 265       .brw = BRW_NEW_CONTEXT,
 266    },
 267    .emit = genX(upload_polygon_stipple),
 268 };
 269
 270 /**
 271  * Polygon stipple offset packet
 272  */
 273 static void
 274 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 275 {
 276    struct gl_context *ctx = &brw->ctx;
 277
 278    /* _NEW_POLYGON */
 279    if (!ctx->Polygon.StippleFlag)
 280       return;
 281
 282    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 283       /* _NEW_BUFFERS
 284        *
 285        * If we're drawing to a system window we have to invert the Y axis
 286        * in order to match the OpenGL pixel coordinate system, and our
 287        * offset must be matched to the window position.  If we're drawing
 288        * to a user-created FBO then our native pixel coordinate system
 289        * works just fine, and there's no window system to worry about.
 290        */
 291       if (ctx->DrawBuffer->FlipY) {
 292          poly.PolygonStippleYOffset =
 293             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 294       }
 295    }
 296 }
 297
 298 static const struct brw_tracked_state genX(polygon_stipple_offset) = {
 299    .dirty = {
 300       .mesa = _NEW_BUFFERS |
 301               _NEW_POLYGON,
 302       .brw = BRW_NEW_CONTEXT,
 303    },
 304    .emit = genX(upload_polygon_stipple_offset),
 305 };
 306
 307 /**
 308  * Line stipple packet
 309  */
 310 static void
 311 genX(upload_line_stipple)(struct brw_context *brw)
 312 {
 313    struct gl_context *ctx = &brw->ctx;
 314
 315    if (!ctx->Line.StippleFlag)
 316       return;
 317
 318    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 319       line.LineStipplePattern = ctx->Line.StipplePattern;
 320
 321       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 322       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 323    }
 324 }
 325
 326 static const struct brw_tracked_state genX(line_stipple) = {
 327    .dirty = {
 328       .mesa = _NEW_LINE,
 329       .brw = BRW_NEW_CONTEXT,
 330    },
 331    .emit = genX(upload_line_stipple),
 332 };
 333
 334 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 335 static void
 336 genX(upload_drawing_rect)(struct brw_context *brw)
 337 {
 338    struct gl_context *ctx = &brw->ctx;
 339    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 340    const unsigned int fb_width = _mesa_geometric_width(fb);
 341    const unsigned int fb_height = _mesa_geometric_height(fb);
 342
 343    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 344       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 345       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 346    }
 347 }
 348
 349 static const struct brw_tracked_state genX(drawing_rect) = {
 350    .dirty = {
 351       .mesa = _NEW_BUFFERS,
 352       .brw = BRW_NEW_BLORP |
 353              BRW_NEW_CONTEXT,
 354    },
 355    .emit = genX(upload_drawing_rect),
 356 };
 357
 358 static uint32_t *
 359 genX(emit_vertex_buffer_state)(struct brw_context *brw,
 360                                uint32_t *dw,
 361                                unsigned buffer_nr,
 362                                struct brw_bo *bo,
 363                                unsigned start_offset,
 364                                MAYBE_UNUSED unsigned end_offset,
 365                                unsigned stride,
 366                                MAYBE_UNUSED unsigned step_rate)
 367 {
 368    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
 369       .VertexBufferIndex = buffer_nr,
 370       .BufferPitch = stride,
 371
 372       /* The VF cache designers apparently cut corners, and made the cache
 373        * only consider the bottom 32 bits of memory addresses.  If you happen
 374        * to have two vertex buffers which get placed exactly 4 GiB apart and
 375        * use them in back-to-back draw calls, you can get collisions.  To work
 376        * around this problem, we restrict vertex buffers to the low 32 bits of
 377        * the address space.
 378        */
 379       .BufferStartingAddress = ro_32_bo(bo, start_offset),
 380 #if GEN_GEN >= 8
 381       .BufferSize = end_offset - start_offset,
 382 #endif
 383
 384 #if GEN_GEN >= 7
 385       .AddressModifyEnable = true,
 386 #endif
 387
 388 #if GEN_GEN < 8
 389       .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
 390       .InstanceDataStepRate = step_rate,
 391 #if GEN_GEN >= 5
 392       .EndAddress = ro_bo(bo, end_offset - 1),
 393 #endif
 394 #endif
 395
 396 #if GEN_GEN == 11
 397       .VertexBufferMOCS = ICL_MOCS_WB,
 398 #elif GEN_GEN == 10
 399       .VertexBufferMOCS = CNL_MOCS_WB,
 400 #elif GEN_GEN == 9
 401       .VertexBufferMOCS = SKL_MOCS_WB,
 402 #elif GEN_GEN == 8
 403       .VertexBufferMOCS = BDW_MOCS_WB,
 404 #elif GEN_GEN == 7
 405       .VertexBufferMOCS = GEN7_MOCS_L3,
 406 #endif
 407    };
 408
 409    GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
 410    return dw + GENX(VERTEX_BUFFER_STATE_length);
 411 }
 412
 413 UNUSED static bool
 414 is_passthru_format(uint32_t format)
 415 {
 416    switch (format) {
 417    case ISL_FORMAT_R64_PASSTHRU:
 418    case ISL_FORMAT_R64G64_PASSTHRU:
 419    case ISL_FORMAT_R64G64B64_PASSTHRU:
 420    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 421       return true;
 422    default:
 423       return false;
 424    }
 425 }
 426
 427 UNUSED static int
 428 uploads_needed(uint32_t format,
 429                bool is_dual_slot)
 430 {
 431    if (!is_passthru_format(format))
 432       return 1;
 433
 434    if (is_dual_slot)
 435       return 2;
 436
 437    switch (format) {
 438    case ISL_FORMAT_R64_PASSTHRU:
 439    case ISL_FORMAT_R64G64_PASSTHRU:
 440       return 1;
 441    case ISL_FORMAT_R64G64B64_PASSTHRU:
 442    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 443       return 2;
 444    default:
 445       unreachable("not reached");
 446    }
 447 }
 448
 449 /*
 450  * Returns the format that we are finally going to use when upload a vertex
 451  * element. It will only change if we are using *64*PASSTHRU formats, as for
 452  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 453  *
 454  * @upload points in which upload we are. Valid values are [0,1]
 455  */
 456 static uint32_t
 457 downsize_format_if_needed(uint32_t format,
 458                           int upload)
 459 {
 460    assert(upload == 0 || upload == 1);
 461
 462    if (!is_passthru_format(format))
 463       return format;
 464
 465    /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
 466     * 1 means that we have been forced to do 2 uploads for a size <= 2. This
 467     * happens with gen < 8 and dvec3 or dvec4 vertex shader input
 468     * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
 469     * flagging that we want to fill with zeroes this second forced upload.
 470     */
 471    switch (format) {
 472    case ISL_FORMAT_R64_PASSTHRU:
 473       return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
 474                          : ISL_FORMAT_R32_FLOAT;
 475    case ISL_FORMAT_R64G64_PASSTHRU:
 476       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 477                          : ISL_FORMAT_R32_FLOAT;
 478    case ISL_FORMAT_R64G64B64_PASSTHRU:
 479       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 480                          : ISL_FORMAT_R32G32_FLOAT;
 481    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 482       return ISL_FORMAT_R32G32B32A32_FLOAT;
 483    default:
 484       unreachable("not reached");
 485    }
 486 }
 487
 488 /*
 489  * Returns the number of componentes associated with a format that is used on
 490  * a 64 to 32 format split. See downsize_format()
 491  */
 492 static int
 493 upload_format_size(uint32_t upload_format)
 494 {
 495    switch (upload_format) {
 496    case ISL_FORMAT_R32_FLOAT:
 497
 498       /* downsized_format has returned this one in order to flag that we are
 499        * performing a second upload which we want to have filled with
 500        * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
 501        * vertex shader input variables.
 502        */
 503
 504       return 0;
 505    case ISL_FORMAT_R32G32_FLOAT:
 506       return 2;
 507    case ISL_FORMAT_R32G32B32A32_FLOAT:
 508       return 4;
 509    default:
 510       unreachable("not reached");
 511    }
 512 }
 513
 514 static UNUSED uint16_t
 515 pinned_bo_high_bits(struct brw_bo *bo)
 516 {
 517    return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
 518 }
 519
 520 /* The VF cache designers apparently cut corners, and made the cache key's
 521  * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
 522  * of the address.  If you happen to have two vertex buffers which get placed
 523  * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
 524  * collisions.  (These collisions can happen within a single batch.)
 525  *
 526  * In the soft-pin world, we'd like to assign addresses up front, and never
 527  * move buffers.  So, we need to do a VF cache invalidate if the buffer for
 528  * a particular VB slot has different [48:32] address bits than the last one.
 529  *
 530  * In the relocation world, we have no idea what the addresses will be, so
 531  * we can't apply this workaround.  Instead, we tell the kernel to move it
 532  * to the low 4GB regardless.
 533  */
 534 static void
 535 vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
 536 {
 537 #if GEN_GEN >= 8
 538    bool need_invalidate = false;
 539    unsigned i;
 540
 541    for (i = 0; i < brw->vb.nr_buffers; i++) {
 542       uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
 543
 544       if (high_bits != brw->vb.last_bo_high_bits[i]) {
 545          need_invalidate = true;
 546          brw->vb.last_bo_high_bits[i] = high_bits;
 547       }
 548    }
 549
 550    /* Don't bother with draw parameter buffers - those are generated by
 551     * the driver so we can select a consistent memory zone.
 552     */
 553
 554    if (need_invalidate) {
 555       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
 556    }
 557 #endif
 558 }
 559
 560 static void
 561 vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
 562 {
 563 #if GEN_GEN >= 8
 564    uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
 565
 566    if (high_bits != brw->ib.last_bo_high_bits) {
 567       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
 568       brw->ib.last_bo_high_bits = high_bits;
 569    }
 570 #endif
 571 }
 572
 573 static void
 574 genX(emit_vertices)(struct brw_context *brw)
 575 {
 576    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 577    uint32_t *dw;
 578
 579    brw_prepare_vertices(brw);
 580    brw_prepare_shader_draw_parameters(brw);
 581
 582 #if GEN_GEN < 6
 583    brw_emit_query_begin(brw);
 584 #endif
 585
 586    const struct brw_vs_prog_data *vs_prog_data =
 587       brw_vs_prog_data(brw->vs.base.prog_data);
 588
 589 #if GEN_GEN >= 8
 590    struct gl_context *ctx = &brw->ctx;
 591    const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
 592                                 ctx->Polygon.BackMode != GL_FILL);
 593
 594    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
 595       unsigned vue = brw->vb.nr_enabled;
 596
 597       /* The element for the edge flags must always be last, so we have to
 598        * insert the SGVS before it in that case.
 599        */
 600       if (uses_edge_flag) {
 601          assert(vue > 0);
 602          vue--;
 603       }
 604
 605       WARN_ONCE(vue >= 33,
 606                 "Trying to insert VID/IID past 33rd vertex element, "
 607                 "need to reorder the vertex attrbutes.");
 608
 609       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
 610          if (vs_prog_data->uses_vertexid) {
 611             vfs.VertexIDEnable = true;
 612             vfs.VertexIDComponentNumber = 2;
 613             vfs.VertexIDElementOffset = vue;
 614          }
 615
 616          if (vs_prog_data->uses_instanceid) {
 617             vfs.InstanceIDEnable = true;
 618             vfs.InstanceIDComponentNumber = 3;
 619             vfs.InstanceIDElementOffset = vue;
 620          }
 621       }
 622
 623       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 624          vfi.InstancingEnable = true;
 625          vfi.VertexElementIndex = vue;
 626       }
 627    } else {
 628       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
 629    }
 630 #endif
 631
 632    const bool uses_draw_params =
 633       vs_prog_data->uses_firstvertex ||
 634       vs_prog_data->uses_baseinstance;
 635
 636    const bool uses_derived_draw_params =
 637       vs_prog_data->uses_drawid ||
 638       vs_prog_data->uses_is_indexed_draw;
 639
 640    const bool needs_sgvs_element = (uses_draw_params ||
 641                                     vs_prog_data->uses_instanceid ||
 642                                     vs_prog_data->uses_vertexid);
 643
 644    unsigned nr_elements =
 645       brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
 646
 647 #if GEN_GEN < 8
 648    /* If any of the formats of vb.enabled needs more that one upload, we need
 649     * to add it to nr_elements
 650     */
 651    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
 652       struct brw_vertex_element *input = brw->vb.enabled[i];
 653       const struct gl_array_attributes *glattrib = input->glattrib;
 654       uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 655
 656       if (uploads_needed(format, input->is_dual_slot) > 1)
 657          nr_elements++;
 658    }
 659 #endif
 660
 661    /* If the VS doesn't read any inputs (calculating vertex position from
 662     * a state variable for some reason, for example), emit a single pad
 663     * VERTEX_ELEMENT struct and bail.
 664     *
 665     * The stale VB state stays in place, but they don't do anything unless
 666     * a VE loads from them.
 667     */
 668    if (nr_elements == 0) {
 669       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 670                            1 + GENX(VERTEX_ELEMENT_STATE_length));
 671       struct GENX(VERTEX_ELEMENT_STATE) elem = {
 672          .Valid = true,
 673          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
 674          .Component0Control = VFCOMP_STORE_0,
 675          .Component1Control = VFCOMP_STORE_0,
 676          .Component2Control = VFCOMP_STORE_0,
 677          .Component3Control = VFCOMP_STORE_1_FP,
 678       };
 679       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
 680       return;
 681    }
 682
 683    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
 684    const unsigned nr_buffers = brw->vb.nr_buffers +
 685       uses_draw_params + uses_derived_draw_params;
 686
 687    vf_invalidate_for_vb_48bit_transitions(brw);
 688
 689    if (nr_buffers) {
 690       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 691
 692       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
 693                            1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
 694
 695       for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
 696          const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
 697          /* Prior to Haswell and Bay Trail we have to use 4-component formats
 698           * to fake 3-component ones.  In particular, we do this for
 699           * half-float and 8 and 16-bit integer formats.  This means that the
 700           * vertex element may poke over the end of the buffer by 2 bytes.
 701           */
 702          const unsigned padding =
 703             (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
 704          const unsigned end = buffer->offset + buffer->size + padding;
 705          dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
 706                                              buffer->offset,
 707                                              end,
 708                                              buffer->stride,
 709                                              buffer->step_rate);
 710       }
 711
 712       if (uses_draw_params) {
 713          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
 714                                              brw->draw.draw_params_bo,
 715                                              brw->draw.draw_params_offset,
 716                                              brw->draw.draw_params_bo->size,
 717                                              0 /* stride */,
 718                                              0 /* step rate */);
 719       }
 720
 721       if (uses_derived_draw_params) {
 722          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
 723                                              brw->draw.derived_draw_params_bo,
 724                                              brw->draw.derived_draw_params_offset,
 725                                              brw->draw.derived_draw_params_bo->size,
 726                                              0 /* stride */,
 727                                              0 /* step rate */);
 728       }
 729    }
 730
 731    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
 732     * presumably for VertexID/InstanceID.
 733     */
 734 #if GEN_GEN >= 6
 735    assert(nr_elements <= 34);
 736    const struct brw_vertex_element *gen6_edgeflag_input = NULL;
 737 #else
 738    assert(nr_elements <= 18);
 739 #endif
 740
 741    dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 742                         1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
 743    unsigned i;
 744    for (i = 0; i < brw->vb.nr_enabled; i++) {
 745       const struct brw_vertex_element *input = brw->vb.enabled[i];
 746       const struct gl_array_attributes *glattrib = input->glattrib;
 747       uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 748       uint32_t comp0 = VFCOMP_STORE_SRC;
 749       uint32_t comp1 = VFCOMP_STORE_SRC;
 750       uint32_t comp2 = VFCOMP_STORE_SRC;
 751       uint32_t comp3 = VFCOMP_STORE_SRC;
 752       const unsigned num_uploads = GEN_GEN < 8 ?
 753          uploads_needed(format, input->is_dual_slot) : 1;
 754
 755 #if GEN_GEN >= 8
 756       /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
 757        * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
 758        * element which has edge flag enabled."
 759        */
 760       assert(!(is_passthru_format(format) && uses_edge_flag));
 761 #endif
 762
 763       /* The gen4 driver expects edgeflag to come in as a float, and passes
 764        * that float on to the tests in the clipper.  Mesa's current vertex
 765        * attribute value for EdgeFlag is stored as a float, which works out.
 766        * glEdgeFlagPointer, on the other hand, gives us an unnormalized
 767        * integer ubyte.  Just rewrite that to convert to a float.
 768        *
 769        * Gen6+ passes edgeflag as sideband along with the vertex, instead
 770        * of in the VUE.  We have to upload it sideband as the last vertex
 771        * element according to the B-Spec.
 772        */
 773 #if GEN_GEN >= 6
 774       if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
 775          gen6_edgeflag_input = input;
 776          continue;
 777       }
 778 #endif
 779
 780       for (unsigned c = 0; c < num_uploads; c++) {
 781          const uint32_t upload_format = GEN_GEN >= 8 ? format :
 782             downsize_format_if_needed(format, c);
 783          /* If we need more that one upload, the offset stride would be 128
 784           * bits (16 bytes), as for previous uploads we are using the full
 785           * entry. */
 786          const unsigned offset = input->offset + c * 16;
 787
 788          const struct gl_array_attributes *glattrib = input->glattrib;
 789          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
 790             upload_format_size(upload_format) : glattrib->Format.Size;
 791
 792          switch (size) {
 793             case 0: comp0 = VFCOMP_STORE_0;
 794             case 1: comp1 = VFCOMP_STORE_0;
 795             case 2: comp2 = VFCOMP_STORE_0;
 796             case 3:
 797                if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
 798                   comp3 = VFCOMP_STORE_0;
 799                } else if (glattrib->Format.Integer) {
 800                   comp3 = VFCOMP_STORE_1_INT;
 801                } else {
 802                   comp3 = VFCOMP_STORE_1_FP;
 803                }
 804
 805                break;
 806          }
 807
 808 #if GEN_GEN >= 8
 809          /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
 810           *
 811           *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
 812           *     formats, 64-bit components are stored in the URB without any
 813           *     conversion. In this case, vertex elements must be written as 128
 814           *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
 815           *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
 816           *     component into the URB, Component 1 must be specified as
 817           *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
 818           *     order to output a 128-bit vertex element, or Components 1-3 must
 819           *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
 820           *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
 821           *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
 822           *     vertex element."
 823           */
 824          if (glattrib->Format.Doubles && !input->is_dual_slot) {
 825             /* Store vertex elements which correspond to double and dvec2 vertex
 826              * shader inputs as 128-bit vertex elements, instead of 256-bits.
 827              */
 828             comp2 = VFCOMP_NOSTORE;
 829             comp3 = VFCOMP_NOSTORE;
 830          }
 831 #endif
 832
 833          struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 834             .VertexBufferIndex = input->buffer,
 835             .Valid = true,
 836             .SourceElementFormat = upload_format,
 837             .SourceElementOffset = offset,
 838             .Component0Control = comp0,
 839             .Component1Control = comp1,
 840             .Component2Control = comp2,
 841             .Component3Control = comp3,
 842 #if GEN_GEN < 5
 843             .DestinationElementOffset = i * 4,
 844 #endif
 845          };
 846
 847          GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 848          dw += GENX(VERTEX_ELEMENT_STATE_length);
 849       }
 850    }
 851
 852    if (needs_sgvs_element) {
 853       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 854          .Valid = true,
 855          .Component0Control = VFCOMP_STORE_0,
 856          .Component1Control = VFCOMP_STORE_0,
 857          .Component2Control = VFCOMP_STORE_0,
 858          .Component3Control = VFCOMP_STORE_0,
 859 #if GEN_GEN < 5
 860          .DestinationElementOffset = i * 4,
 861 #endif
 862       };
 863
 864 #if GEN_GEN >= 8
 865       if (uses_draw_params) {
 866          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 867          elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 868          elem_state.Component0Control = VFCOMP_STORE_SRC;
 869          elem_state.Component1Control = VFCOMP_STORE_SRC;
 870       }
 871 #else
 872       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 873       elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 874       if (uses_draw_params) {
 875          elem_state.Component0Control = VFCOMP_STORE_SRC;
 876          elem_state.Component1Control = VFCOMP_STORE_SRC;
 877       }
 878
 879       if (vs_prog_data->uses_vertexid)
 880          elem_state.Component2Control = VFCOMP_STORE_VID;
 881
 882       if (vs_prog_data->uses_instanceid)
 883          elem_state.Component3Control = VFCOMP_STORE_IID;
 884 #endif
 885
 886       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 887       dw += GENX(VERTEX_ELEMENT_STATE_length);
 888    }
 889
 890    if (uses_derived_draw_params) {
 891       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 892          .Valid = true,
 893          .VertexBufferIndex = brw->vb.nr_buffers + 1,
 894          .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
 895          .Component0Control = VFCOMP_STORE_SRC,
 896          .Component1Control = VFCOMP_STORE_SRC,
 897          .Component2Control = VFCOMP_STORE_0,
 898          .Component3Control = VFCOMP_STORE_0,
 899 #if GEN_GEN < 5
 900          .DestinationElementOffset = i * 4,
 901 #endif
 902       };
 903
 904       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 905       dw += GENX(VERTEX_ELEMENT_STATE_length);
 906    }
 907
 908 #if GEN_GEN >= 6
 909    if (gen6_edgeflag_input) {
 910       const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
 911       const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 912
 913       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 914          .Valid = true,
 915          .VertexBufferIndex = gen6_edgeflag_input->buffer,
 916          .EdgeFlagEnable = true,
 917          .SourceElementFormat = format,
 918          .SourceElementOffset = gen6_edgeflag_input->offset,
 919          .Component0Control = VFCOMP_STORE_SRC,
 920          .Component1Control = VFCOMP_STORE_0,
 921          .Component2Control = VFCOMP_STORE_0,
 922          .Component3Control = VFCOMP_STORE_0,
 923       };
 924
 925       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 926       dw += GENX(VERTEX_ELEMENT_STATE_length);
 927    }
 928 #endif
 929
 930 #if GEN_GEN >= 8
 931    for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
 932       const struct brw_vertex_element *input = brw->vb.enabled[i];
 933       const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
 934       unsigned element_index;
 935
 936       /* The edge flag element is reordered to be the last one in the code
 937        * above so we need to compensate for that in the element indices used
 938        * below.
 939        */
 940       if (input == gen6_edgeflag_input)
 941          element_index = nr_elements - 1;
 942       else
 943          element_index = j++;
 944
 945       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 946          vfi.VertexElementIndex = element_index;
 947          vfi.InstancingEnable = buffer->step_rate != 0;
 948          vfi.InstanceDataStepRate = buffer->step_rate;
 949       }
 950    }
 951
 952    if (vs_prog_data->uses_drawid) {
 953       const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
 954
 955       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 956          vfi.VertexElementIndex = element;
 957       }
 958    }
 959 #endif
 960 }
 961
 962 static const struct brw_tracked_state genX(vertices) = {
 963    .dirty = {
 964       .mesa = _NEW_POLYGON,
 965       .brw = BRW_NEW_BATCH |
 966              BRW_NEW_BLORP |
 967              BRW_NEW_VERTEX_PROGRAM |
 968              BRW_NEW_VERTICES |
 969              BRW_NEW_VS_PROG_DATA,
 970    },
 971    .emit = genX(emit_vertices),
 972 };
 973
 974 static void
 975 genX(emit_index_buffer)(struct brw_context *brw)
 976 {
 977    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 978
 979    if (index_buffer == NULL)
 980       return;
 981
 982    vf_invalidate_for_ib_48bit_transition(brw);
 983
 984    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 985 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 986       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 987 #endif
 988       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 989
 990       /* The VF cache designers apparently cut corners, and made the cache
 991        * only consider the bottom 32 bits of memory addresses.  If you happen
 992        * to have two index buffers which get placed exactly 4 GiB apart and
 993        * use them in back-to-back draw calls, you can get collisions.  To work
 994        * around this problem, we restrict index buffers to the low 32 bits of
 995        * the address space.
 996        */
 997       ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
 998 #if GEN_GEN >= 8
 999       ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
1000       ib.BufferSize = brw->ib.size;
1001 #else
1002       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
1003 #endif
1004    }
1005 }
1006
1007 static const struct brw_tracked_state genX(index_buffer) = {
1008    .dirty = {
1009       .mesa = 0,
1010       .brw = BRW_NEW_BATCH |
1011              BRW_NEW_BLORP |
1012              BRW_NEW_INDEX_BUFFER,
1013    },
1014    .emit = genX(emit_index_buffer),
1015 };
1016
1017 #if GEN_IS_HASWELL || GEN_GEN >= 8
1018 static void
1019 genX(upload_cut_index)(struct brw_context *brw)
1020 {
1021    const struct gl_context *ctx = &brw->ctx;
1022
1023    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
1024       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
1025          vf.IndexedDrawCutIndexEnable = true;
1026          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
1027       }
1028    }
1029 }
1030
1031 const struct brw_tracked_state genX(cut_index) = {
1032    .dirty = {
1033       .mesa  = _NEW_TRANSFORM,
1034       .brw   = BRW_NEW_INDEX_BUFFER,
1035    },
1036    .emit = genX(upload_cut_index),
1037 };
1038 #endif
1039
1040 #if GEN_GEN >= 6
1041 /**
1042  * Determine the appropriate attribute override value to store into the
1043  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
1044  * override value contains two pieces of information: the location of the
1045  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
1046  * flag indicating whether to "swizzle" the attribute based on the direction
1047  * the triangle is facing.
1048  *
1049  * If an attribute is "swizzled", then the given VUE location is used for
1050  * front-facing triangles, and the VUE location that immediately follows is
1051  * used for back-facing triangles.  We use this to implement the mapping from
1052  * gl_FrontColor/gl_BackColor to gl_Color.
1053  *
1054  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
1055  * being instructed to begin reading attribute data.  It can be set to a
1056  * nonzero value to prevent the SF unit from wasting time reading elements of
1057  * the VUE that are not needed by the fragment shader.  It is measured in
1058  * 256-bit increments.
1059  */
1060 static void
1061 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
1062                         const struct brw_vue_map *vue_map,
1063                         int urb_entry_read_offset, int fs_attr,
1064                         bool two_side_color, uint32_t *max_source_attr)
1065 {
1066    /* Find the VUE slot for this attribute. */
1067    int slot = vue_map->varying_to_slot[fs_attr];
1068
1069    /* Viewport and Layer are stored in the VUE header.  We need to override
1070     * them to zero if earlier stages didn't write them, as GL requires that
1071     * they read back as zero when not explicitly set.
1072     */
1073    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
1074       attr->ComponentOverrideX = true;
1075       attr->ComponentOverrideW = true;
1076       attr->ConstantSource = CONST_0000;
1077
1078       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
1079          attr->ComponentOverrideY = true;
1080       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
1081          attr->ComponentOverrideZ = true;
1082
1083       return;
1084    }
1085
1086    /* If there was only a back color written but not front, use back
1087     * as the color instead of undefined
1088     */
1089    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
1090       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
1091    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
1092       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
1093
1094    if (slot == -1) {
1095       /* This attribute does not exist in the VUE--that means that the vertex
1096        * shader did not write to it.  This means that either:
1097        *
1098        * (a) This attribute is a texture coordinate, and it is going to be
1099        * replaced with point coordinates (as a consequence of a call to
1100        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
1101        * hardware will ignore whatever attribute override we supply.
1102        *
1103        * (b) This attribute is read by the fragment shader but not written by
1104        * the vertex shader, so its value is undefined.  Therefore the
1105        * attribute override we supply doesn't matter.
1106        *
1107        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
1108        * previous shader stage.
1109        *
1110        * Note that we don't have to worry about the cases where the attribute
1111        * is gl_PointCoord or is undergoing point sprite coordinate
1112        * replacement, because in those cases, this function isn't called.
1113        *
1114        * In case (c), we need to program the attribute overrides so that the
1115        * primitive ID will be stored in this slot.  In every other case, the
1116        * attribute override we supply doesn't matter.  So just go ahead and
1117        * program primitive ID in every case.
1118        */
1119       attr->ComponentOverrideW = true;
1120       attr->ComponentOverrideX = true;
1121       attr->ComponentOverrideY = true;
1122       attr->ComponentOverrideZ = true;
1123       attr->ConstantSource = PRIM_ID;
1124       return;
1125    }
1126
1127    /* Compute the location of the attribute relative to urb_entry_read_offset.
1128     * Each increment of urb_entry_read_offset represents a 256-bit value, so
1129     * it counts for two 128-bit VUE slots.
1130     */
1131    int source_attr = slot - 2 * urb_entry_read_offset;
1132    assert(source_attr >= 0 && source_attr < 32);
1133
1134    /* If we are doing two-sided color, and the VUE slot following this one
1135     * represents a back-facing color, then we need to instruct the SF unit to
1136     * do back-facing swizzling.
1137     */
1138    bool swizzling = two_side_color &&
1139       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1140         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1141        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1142         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1143
1144    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1145    if (*max_source_attr < source_attr + swizzling)
1146       *max_source_attr = source_attr + swizzling;
1147
1148    attr->SourceAttribute = source_attr;
1149    if (swizzling)
1150       attr->SwizzleSelect = INPUTATTR_FACING;
1151 }
1152
1153
1154 static void
1155 genX(calculate_attr_overrides)(const struct brw_context *brw,
1156                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1157                                uint32_t *point_sprite_enables,
1158                                uint32_t *urb_entry_read_length,
1159                                uint32_t *urb_entry_read_offset)
1160 {
1161    const struct gl_context *ctx = &brw->ctx;
1162
1163    /* _NEW_POINT */
1164    const struct gl_point_attrib *point = &ctx->Point;
1165
1166    /* BRW_NEW_FRAGMENT_PROGRAM */
1167    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1168
1169    /* BRW_NEW_FS_PROG_DATA */
1170    const struct brw_wm_prog_data *wm_prog_data =
1171       brw_wm_prog_data(brw->wm.base.prog_data);
1172    uint32_t max_source_attr = 0;
1173
1174    *point_sprite_enables = 0;
1175
1176    int first_slot =
1177       brw_compute_first_urb_slot_required(fp->info.inputs_read,
1178                                           &brw->vue_map_geom_out);
1179
1180    /* Each URB offset packs two varying slots */
1181    assert(first_slot % 2 == 0);
1182    *urb_entry_read_offset = first_slot / 2;
1183
1184    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1185     * description of dw10 Point Sprite Texture Coordinate Enable:
1186     *
1187     * "This field must be programmed to zero when non-point primitives
1188     * are rendered."
1189     *
1190     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1191     * must be programmed to zero when rendering non-point primitives, but
1192     * the IvyBridge PRM does, and if we don't, we get garbage.
1193     *
1194     * This is not required on Haswell, as the hardware ignores this state
1195     * when drawing non-points -- although we do still need to be careful to
1196     * correctly set the attr overrides.
1197     *
1198     * _NEW_POLYGON
1199     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1200     */
1201    bool drawing_points = brw_is_drawing_points(brw);
1202
1203    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1204       int input_index = wm_prog_data->urb_setup[attr];
1205
1206       if (input_index < 0)
1207          continue;
1208
1209       /* _NEW_POINT */
1210       bool point_sprite = false;
1211       if (drawing_points) {
1212          if (point->PointSprite &&
1213              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1214              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1215             point_sprite = true;
1216          }
1217
1218          if (attr == VARYING_SLOT_PNTC)
1219             point_sprite = true;
1220
1221          if (point_sprite)
1222             *point_sprite_enables |= (1 << input_index);
1223       }
1224
1225       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1226       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1227
1228       if (!point_sprite) {
1229          genX(get_attr_override)(&attribute,
1230                                  &brw->vue_map_geom_out,
1231                                  *urb_entry_read_offset, attr,
1232                                  _mesa_vertex_program_two_side_enabled(ctx),
1233                                  &max_source_attr);
1234       }
1235
1236       /* The hardware can only do the overrides on 16 overrides at a
1237        * time, and the other up to 16 have to be lined up so that the
1238        * input index = the output index.  We'll need to do some
1239        * tweaking to make sure that's the case.
1240        */
1241       if (input_index < 16)
1242          attr_overrides[input_index] = attribute;
1243       else
1244          assert(attribute.SourceAttribute == input_index);
1245    }
1246
1247    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1248     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1249     *
1250     * "This field should be set to the minimum length required to read the
1251     *  maximum source attribute.  The maximum source attribute is indicated
1252     *  by the maximum value of the enabled Attribute # Source Attribute if
1253     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1254     *  enable is not set.
1255     *  read_length = ceiling((max_source_attr + 1) / 2)
1256     *
1257     *  [errata] Corruption/Hang possible if length programmed larger than
1258     *  recommended"
1259     *
1260     * Similar text exists for Ivy Bridge.
1261     */
1262    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1263 }
1264 #endif
1265
1266 /* ---------------------------------------------------------------------- */
1267
1268 #if GEN_GEN >= 8
1269 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
1270 #elif GEN_GEN >= 6
1271 typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
1272 #else
1273 typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
1274 #endif
1275
1276 static inline void
1277 set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1278 {
1279    struct gl_context *ctx = &brw->ctx;
1280
1281    /* _NEW_BUFFERS */
1282    struct intel_renderbuffer *depth_irb =
1283       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1284
1285    /* _NEW_DEPTH */
1286    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1287
1288    /* _NEW_STENCIL */
1289    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1290    const int b = stencil->_BackFace;
1291
1292    if (depth->Test && depth_irb) {
1293       ds->DepthTestEnable = true;
1294       ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1295       ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1296    }
1297
1298    if (brw->stencil_enabled) {
1299       ds->StencilTestEnable = true;
1300       ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1301       ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1302
1303       ds->StencilTestFunction =
1304          intel_translate_compare_func(stencil->Function[0]);
1305       ds->StencilFailOp =
1306          intel_translate_stencil_op(stencil->FailFunc[0]);
1307       ds->StencilPassDepthPassOp =
1308          intel_translate_stencil_op(stencil->ZPassFunc[0]);
1309       ds->StencilPassDepthFailOp =
1310          intel_translate_stencil_op(stencil->ZFailFunc[0]);
1311
1312       ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1313
1314       if (brw->stencil_two_sided) {
1315          ds->DoubleSidedStencilEnable = true;
1316          ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1317          ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1318
1319          ds->BackfaceStencilTestFunction =
1320             intel_translate_compare_func(stencil->Function[b]);
1321          ds->BackfaceStencilFailOp =
1322             intel_translate_stencil_op(stencil->FailFunc[b]);
1323          ds->BackfaceStencilPassDepthPassOp =
1324             intel_translate_stencil_op(stencil->ZPassFunc[b]);
1325          ds->BackfaceStencilPassDepthFailOp =
1326             intel_translate_stencil_op(stencil->ZFailFunc[b]);
1327       }
1328
1329 #if GEN_GEN <= 5 || GEN_GEN >= 9
1330       ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1331       ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1332 #endif
1333    }
1334 }
1335
1336 #if GEN_GEN >= 6
1337 static void
1338 genX(upload_depth_stencil_state)(struct brw_context *brw)
1339 {
1340 #if GEN_GEN >= 8
1341    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1342       set_depth_stencil_bits(brw, &wmds);
1343    }
1344 #else
1345    uint32_t ds_offset;
1346    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1347       set_depth_stencil_bits(brw, &ds);
1348    }
1349
1350    /* Now upload a pointer to the indirect state */
1351 #if GEN_GEN == 6
1352    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1353       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1354       ptr.DEPTH_STENCIL_STATEChange = true;
1355    }
1356 #else
1357    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1358       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1359    }
1360 #endif
1361 #endif
1362 }
1363
1364 static const struct brw_tracked_state genX(depth_stencil_state) = {
1365    .dirty = {
1366       .mesa = _NEW_BUFFERS |
1367               _NEW_DEPTH |
1368               _NEW_STENCIL,
1369       .brw  = BRW_NEW_BLORP |
1370               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1371                             : BRW_NEW_BATCH |
1372                               BRW_NEW_STATE_BASE_ADDRESS),
1373    },
1374    .emit = genX(upload_depth_stencil_state),
1375 };
1376 #endif
1377
1378 /* ---------------------------------------------------------------------- */
1379
1380 #if GEN_GEN <= 5
1381
1382 static void
1383 genX(upload_clip_state)(struct brw_context *brw)
1384 {
1385    struct gl_context *ctx = &brw->ctx;
1386
1387    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1388    brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1389       clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1390       clip.GRFRegisterCount =
1391          DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1392       clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1393       clip.SingleProgramFlow = true;
1394       clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1395       clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1396
1397       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1398       clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1399       clip.DispatchGRFStartRegisterForURBData = 1;
1400       clip.VertexURBEntryReadOffset = 0;
1401
1402       /* BRW_NEW_URB_FENCE */
1403       clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1404       clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1405
1406       if (brw->urb.nr_clip_entries >= 10) {
1407          /* Half of the URB entries go to each thread, and it has to be an
1408           * even number.
1409           */
1410          assert(brw->urb.nr_clip_entries % 2 == 0);
1411
1412          /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1413           * only 2 threads can output VUEs at a time.
1414           */
1415          clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
1416       } else {
1417          assert(brw->urb.nr_clip_entries >= 5);
1418          clip.MaximumNumberofThreads = 1 - 1;
1419       }
1420
1421       clip.VertexPositionSpace = VPOS_NDCSPACE;
1422       clip.UserClipFlagsMustClipEnable = true;
1423       clip.GuardbandClipTestEnable = true;
1424
1425       clip.ClipperViewportStatePointer =
1426          ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1427
1428       clip.ScreenSpaceViewportXMin = -1;
1429       clip.ScreenSpaceViewportXMax = 1;
1430       clip.ScreenSpaceViewportYMin = -1;
1431       clip.ScreenSpaceViewportYMax = 1;
1432
1433       clip.ViewportXYClipTestEnable = true;
1434       clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1435                                        ctx->Transform.DepthClampFar);
1436
1437       /* _NEW_TRANSFORM */
1438       if (GEN_GEN == 5 || GEN_IS_G4X) {
1439          clip.UserClipDistanceClipTestEnableBitmask =
1440             ctx->Transform.ClipPlanesEnabled;
1441       } else {
1442          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1443           * workaround.
1444           */
1445          clip.UserClipDistanceClipTestEnableBitmask =
1446             (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1447       }
1448
1449       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1450          clip.APIMode = APIMODE_D3D;
1451       else
1452          clip.APIMode = APIMODE_OGL;
1453
1454       clip.GuardbandClipTestEnable = true;
1455
1456       clip.ClipMode = brw->clip.prog_data->clip_mode;
1457
1458 #if GEN_IS_G4X
1459       clip.NegativeWClipTestEnable = true;
1460 #endif
1461    }
1462 }
1463
1464 const struct brw_tracked_state genX(clip_state) = {
1465    .dirty = {
1466       .mesa  = _NEW_TRANSFORM |
1467                _NEW_VIEWPORT,
1468       .brw   = BRW_NEW_BATCH |
1469                BRW_NEW_BLORP |
1470                BRW_NEW_CLIP_PROG_DATA |
1471                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1472                BRW_NEW_PROGRAM_CACHE |
1473                BRW_NEW_URB_FENCE,
1474    },
1475    .emit = genX(upload_clip_state),
1476 };
1477
1478 #else
1479
1480 static void
1481 genX(upload_clip_state)(struct brw_context *brw)
1482 {
1483    struct gl_context *ctx = &brw->ctx;
1484
1485    /* _NEW_BUFFERS */
1486    struct gl_framebuffer *fb = ctx->DrawBuffer;
1487
1488    /* BRW_NEW_FS_PROG_DATA */
1489    struct brw_wm_prog_data *wm_prog_data =
1490       brw_wm_prog_data(brw->wm.base.prog_data);
1491
1492    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1493       clip.StatisticsEnable = !brw->meta_in_progress;
1494
1495       if (wm_prog_data->barycentric_interp_modes &
1496           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1497          clip.NonPerspectiveBarycentricEnable = true;
1498
1499 #if GEN_GEN >= 7
1500       clip.EarlyCullEnable = true;
1501 #endif
1502
1503 #if GEN_GEN == 7
1504       clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
1505
1506       if (ctx->Polygon.CullFlag) {
1507          switch (ctx->Polygon.CullFaceMode) {
1508          case GL_FRONT:
1509             clip.CullMode = CULLMODE_FRONT;
1510             break;
1511          case GL_BACK:
1512             clip.CullMode = CULLMODE_BACK;
1513             break;
1514          case GL_FRONT_AND_BACK:
1515             clip.CullMode = CULLMODE_BOTH;
1516             break;
1517          default:
1518             unreachable("Should not get here: invalid CullFlag");
1519          }
1520       } else {
1521          clip.CullMode = CULLMODE_NONE;
1522       }
1523 #endif
1524
1525 #if GEN_GEN < 8
1526       clip.UserClipDistanceCullTestEnableBitmask =
1527          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1528
1529       clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1530                                        ctx->Transform.DepthClampFar);
1531 #endif
1532
1533       /* _NEW_LIGHT */
1534       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1535          clip.TriangleStripListProvokingVertexSelect = 0;
1536          clip.TriangleFanProvokingVertexSelect = 1;
1537          clip.LineStripListProvokingVertexSelect = 0;
1538       } else {
1539          clip.TriangleStripListProvokingVertexSelect = 2;
1540          clip.TriangleFanProvokingVertexSelect = 2;
1541          clip.LineStripListProvokingVertexSelect = 1;
1542       }
1543
1544       /* _NEW_TRANSFORM */
1545       clip.UserClipDistanceClipTestEnableBitmask =
1546          ctx->Transform.ClipPlanesEnabled;
1547
1548 #if GEN_GEN >= 8
1549       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1550 #endif
1551
1552       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1553          clip.APIMode = APIMODE_D3D;
1554       else
1555          clip.APIMode = APIMODE_OGL;
1556
1557       clip.GuardbandClipTestEnable = true;
1558
1559       /* BRW_NEW_VIEWPORT_COUNT */
1560       const unsigned viewport_count = brw->clip.viewport_count;
1561
1562       if (ctx->RasterDiscard) {
1563          clip.ClipMode = CLIPMODE_REJECT_ALL;
1564 #if GEN_GEN == 6
1565          perf_debug("Rasterizer discard is currently implemented via the "
1566                     "clipper; having the GS not write primitives would "
1567                     "likely be faster.\n");
1568 #endif
1569       } else {
1570          clip.ClipMode = CLIPMODE_NORMAL;
1571       }
1572
1573       clip.ClipEnable = true;
1574
1575       /* _NEW_POLYGON,
1576        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1577        */
1578       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1579          clip.ViewportXYClipTestEnable = true;
1580
1581       clip.MinimumPointWidth = 0.125;
1582       clip.MaximumPointWidth = 255.875;
1583       clip.MaximumVPIndex = viewport_count - 1;
1584       if (_mesa_geometric_layers(fb) == 0)
1585          clip.ForceZeroRTAIndexEnable = true;
1586    }
1587 }
1588
1589 static const struct brw_tracked_state genX(clip_state) = {
1590    .dirty = {
1591       .mesa  = _NEW_BUFFERS |
1592                _NEW_LIGHT |
1593                _NEW_POLYGON |
1594                _NEW_TRANSFORM,
1595       .brw   = BRW_NEW_BLORP |
1596                BRW_NEW_CONTEXT |
1597                BRW_NEW_FS_PROG_DATA |
1598                BRW_NEW_GS_PROG_DATA |
1599                BRW_NEW_VS_PROG_DATA |
1600                BRW_NEW_META_IN_PROGRESS |
1601                BRW_NEW_PRIMITIVE |
1602                BRW_NEW_RASTERIZER_DISCARD |
1603                BRW_NEW_TES_PROG_DATA |
1604                BRW_NEW_VIEWPORT_COUNT,
1605    },
1606    .emit = genX(upload_clip_state),
1607 };
1608 #endif
1609
1610 /* ---------------------------------------------------------------------- */
1611
1612 static void
1613 genX(upload_sf)(struct brw_context *brw)
1614 {
1615    struct gl_context *ctx = &brw->ctx;
1616    float point_size;
1617
1618 #if GEN_GEN <= 7
1619    /* _NEW_BUFFERS */
1620    bool flip_y = ctx->DrawBuffer->FlipY;
1621    UNUSED const bool multisampled_fbo =
1622       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1623 #endif
1624
1625 #if GEN_GEN < 6
1626    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1627
1628    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1629
1630    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1631       sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
1632       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1633       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1634       sf.DispatchGRFStartRegisterForURBData = 3;
1635       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1636       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1637       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1638       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1639
1640       /* STATE_PREFETCH command description describes this state as being
1641        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1642        * domain.
1643        */
1644       sf.SetupViewportStateOffset =
1645          ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
1646
1647       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1648
1649       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1650       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1651
1652       sf.MaximumNumberofThreads =
1653          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1654
1655       sf.SpritePointEnable = ctx->Point.PointSprite;
1656
1657       sf.DestinationOriginHorizontalBias = 0.5;
1658       sf.DestinationOriginVerticalBias = 0.5;
1659 #else
1660    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1661       sf.StatisticsEnable = true;
1662 #endif
1663       sf.ViewportTransformEnable = true;
1664
1665 #if GEN_GEN == 7
1666       /* _NEW_BUFFERS */
1667       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1668 #endif
1669
1670 #if GEN_GEN <= 7
1671       /* _NEW_POLYGON */
1672       sf.FrontWinding = brw->polygon_front_bit != flip_y;
1673 #if GEN_GEN >= 6
1674       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1675       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1676       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1677
1678       switch (ctx->Polygon.FrontMode) {
1679          case GL_FILL:
1680             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1681             break;
1682          case GL_LINE:
1683             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1684             break;
1685          case GL_POINT:
1686             sf.FrontFaceFillMode = FILL_MODE_POINT;
1687             break;
1688          default:
1689             unreachable("not reached");
1690       }
1691
1692       switch (ctx->Polygon.BackMode) {
1693          case GL_FILL:
1694             sf.BackFaceFillMode = FILL_MODE_SOLID;
1695             break;
1696          case GL_LINE:
1697             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1698             break;
1699          case GL_POINT:
1700             sf.BackFaceFillMode = FILL_MODE_POINT;
1701             break;
1702          default:
1703             unreachable("not reached");
1704       }
1705
1706       if (multisampled_fbo && ctx->Multisample.Enabled)
1707          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1708
1709       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1710       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1711       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1712 #endif
1713
1714       sf.ScissorRectangleEnable = true;
1715
1716       if (ctx->Polygon.CullFlag) {
1717          switch (ctx->Polygon.CullFaceMode) {
1718             case GL_FRONT:
1719                sf.CullMode = CULLMODE_FRONT;
1720                break;
1721             case GL_BACK:
1722                sf.CullMode = CULLMODE_BACK;
1723                break;
1724             case GL_FRONT_AND_BACK:
1725                sf.CullMode = CULLMODE_BOTH;
1726                break;
1727             default:
1728                unreachable("not reached");
1729          }
1730       } else {
1731          sf.CullMode = CULLMODE_NONE;
1732       }
1733
1734 #if GEN_IS_HASWELL
1735       sf.LineStippleEnable = ctx->Line.StippleFlag;
1736 #endif
1737
1738 #endif
1739
1740       /* _NEW_LINE */
1741 #if GEN_GEN == 8
1742       const struct gen_device_info *devinfo = &brw->screen->devinfo;
1743
1744       if (devinfo->is_cherryview)
1745          sf.CHVLineWidth = brw_get_line_width(brw);
1746       else
1747          sf.LineWidth = brw_get_line_width(brw);
1748 #else
1749       sf.LineWidth = brw_get_line_width(brw);
1750 #endif
1751
1752       if (ctx->Line.SmoothFlag) {
1753          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1754 #if GEN_GEN <= 7
1755          sf.AntiAliasingEnable = true;
1756 #endif
1757       }
1758
1759       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1760       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1761       /* Clamp to the hardware limits */
1762       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1763
1764       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1765       if (use_state_point_size(brw))
1766          sf.PointWidthSource = State;
1767
1768 #if GEN_GEN >= 8
1769       /* _NEW_POINT | _NEW_MULTISAMPLE */
1770       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1771           !ctx->Point.PointSprite)
1772          sf.SmoothPointEnable = true;
1773 #endif
1774
1775 #if GEN_GEN == 10
1776       /* _NEW_BUFFERS
1777        * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
1778        */
1779       const bool multisampled_fbo =
1780          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1781       if (multisampled_fbo)
1782          sf.SmoothPointEnable = false;
1783 #endif
1784
1785 #if GEN_IS_G4X || GEN_GEN >= 5
1786       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1787 #endif
1788
1789       /* _NEW_LIGHT */
1790       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1791          sf.TriangleStripListProvokingVertexSelect = 2;
1792          sf.TriangleFanProvokingVertexSelect = 2;
1793          sf.LineStripListProvokingVertexSelect = 1;
1794       } else {
1795          sf.TriangleFanProvokingVertexSelect = 1;
1796       }
1797
1798 #if GEN_GEN == 6
1799       /* BRW_NEW_FS_PROG_DATA */
1800       const struct brw_wm_prog_data *wm_prog_data =
1801          brw_wm_prog_data(brw->wm.base.prog_data);
1802
1803       sf.AttributeSwizzleEnable = true;
1804       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1805
1806       /*
1807        * Window coordinates in an FBO are inverted, which means point
1808        * sprite origin must be inverted, too.
1809        */
1810       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
1811          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1812       } else {
1813          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1814       }
1815
1816       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1817        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1818        */
1819       uint32_t urb_entry_read_length;
1820       uint32_t urb_entry_read_offset;
1821       uint32_t point_sprite_enables;
1822       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1823                                      &urb_entry_read_length,
1824                                      &urb_entry_read_offset);
1825       sf.VertexURBEntryReadLength = urb_entry_read_length;
1826       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1827       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1828       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1829 #endif
1830    }
1831 }
1832
1833 static const struct brw_tracked_state genX(sf_state) = {
1834    /* Pairs genX(upload_sf) with its Mesa and driver dirty flags; the
1835     * conditional terms are chosen at compile time from GEN_GEN.
1836     */
1837    .emit = genX(upload_sf),
1838    .dirty = {
1839       .mesa  = (GEN_GEN <= 7 || GEN_GEN == 10 ? _NEW_BUFFERS : 0) |
1840                (GEN_GEN <= 7 ? _NEW_POLYGON : 0) |
1841                (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
1842                _NEW_LIGHT | _NEW_LINE | _NEW_POINT | _NEW_PROGRAM,
1843       .brw   = (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
1844                (GEN_GEN == 6 ? BRW_NEW_FRAGMENT_PROGRAM |
1845                                BRW_NEW_FS_PROG_DATA
1846                              : 0) |
1847                (GEN_GEN == 6 || GEN_GEN == 7 ?
1848                                BRW_NEW_GS_PROG_DATA |
1849                                BRW_NEW_PRIMITIVE |
1850                                BRW_NEW_TES_PROG_DATA
1851                              : 0) |
1852                (GEN_GEN <= 5 ? BRW_NEW_BATCH |
1853                                BRW_NEW_PROGRAM_CACHE |
1854                                BRW_NEW_SF_PROG_DATA |
1855                                BRW_NEW_SF_VP |
1856                                BRW_NEW_URB_FENCE
1857                              : 0) |
1858                BRW_NEW_BLORP |
1859                BRW_NEW_VUE_MAP_GEOM_OUT,
1860    },
1861 };
1862
1863 /* ---------------------------------------------------------------------- */
1864
1865 static bool
1866 brw_color_buffer_write_enabled(struct brw_context *brw)
1867 {
1868    /* True when some bound color draw buffer is fed by a fragment program
1869     * output (FRAG_RESULT_COLOR or its own FRAG_RESULT_DATA0 + n) and
1870     * GET_COLORMASK() for that buffer is non-zero.
1871     */
1872    struct gl_context *ctx = &brw->ctx;
1873    /* BRW_NEW_FRAGMENT_PROGRAM */
1874    const struct gl_program *frag_prog = brw->programs[MESA_SHADER_FRAGMENT];
1875
1876    /* _NEW_BUFFERS */
1877    for (unsigned buf = 0; buf < ctx->DrawBuffer->_NumColorDrawBuffers; buf++) {
1878       const uint64_t written = frag_prog->info.outputs_written;
1879       const uint64_t feeds_buf = BITFIELD64_BIT(FRAG_RESULT_COLOR) |
1880                                  BITFIELD64_BIT(FRAG_RESULT_DATA0 + buf);
1881       /* _NEW_COLOR */
1882       if (ctx->DrawBuffer->_ColorDrawBuffers[buf] && (written & feeds_buf) &&
1883           GET_COLORMASK(ctx->Color.ColorMask, buf))
1884          return true;
1885    }
1886    return false;
1887 }
1888
1889 static void
1890 genX(upload_wm)(struct brw_context *brw)
1891 {
        /* Programs the WM unit for the generation selected by GEN_GEN:
         *
         *  - GEN_GEN >= 6: 3DSTATE_WM is emitted with brw_batch_emit().  Gen6
         *    additionally emits 3DSTATE_CONSTANT_PS immediately before it.
         *  - GEN_GEN <= 5: BRW_NEW_GEN4_UNIT_STATE is flagged in
         *    ctx->NewDriverState and WM_STATE is emitted with brw_state_emit()
         *    at stage_state->state_offset.  Afterwards
         *    3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP is emitted whenever the polygon
         *    offset clamp differs from the value cached in brw->wm.offset_clamp.
         *
         * The short _NEW_* / BRW_NEW_* comments below tag the state read on the
         * lines that follow them; the matching flags are listed in
         * genX(wm_state).
         */
1892    struct gl_context *ctx = &brw->ctx;
1893
1894    /* BRW_NEW_FS_PROG_DATA */
1895    const struct brw_wm_prog_data *wm_prog_data =
1896       brw_wm_prog_data(brw->wm.base.prog_data);
1897
        /* The locals below are only used under some GEN_GEN values, hence
         * UNUSED.
         */
1898    UNUSED bool writes_depth =
1899       wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
1900    UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
1901    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
1902
1903 #if GEN_GEN == 6
1904    /* We can't fold this into gen6_upload_wm_push_constants(), because
1905     * according to the SNB PRM, vol 2 part 1 section 7.2.2
1906     * (3DSTATE_CONSTANT_PS [DevSNB]):
1907     *
1908     *     "[DevSNB]: This packet must be followed by WM_STATE."
1909     */
1910    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
1911       if (wm_prog_data->base.nr_params != 0) {
1912          wmcp.Buffer0Valid = true;
1913          /* Pointer to the WM constant buffer.  Covered by the set of
1914           * state flags from gen6_upload_wm_push_constants.
1915           */
1916          wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
1917          wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1918       }
1919    }
1920 #endif
1921
        /* Both branches open the same brace-delimited block that fills `wm`;
         * only the emit helper differs.
         */
1922 #if GEN_GEN >= 6
1923    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
1924 #else
1925    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1926    brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
1927 #endif
1928
1929 #if GEN_GEN <= 6
1930       wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
1931       wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
1932       wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
1933 #endif
1934
1935 #if GEN_GEN == 4
1936       /* On gen4, we only have one shader kernel */
1937       if (brw_wm_state_has_ksp(wm, 0)) {
1938          assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
1939          wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
1940          wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
1941          wm.DispatchGRFStartRegisterForConstantSetupData0 =
1942             brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
1943       }
1944 #elif GEN_GEN == 5
1945       /* On gen5, we have multiple shader kernels but only one GRF start
1946        * register for all kernels
1947        */
1948       wm.KernelStartPointer0 = stage_state->prog_offset +
1949                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
1950       wm.KernelStartPointer1 = stage_state->prog_offset +
1951                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
1952       wm.KernelStartPointer2 = stage_state->prog_offset +
1953                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1954
1955       wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
1956       wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
1957       wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
1958
1959       wm.DispatchGRFStartRegisterForConstantSetupData0 =
1960          wm_prog_data->base.dispatch_grf_start_reg;
1961
1962       /* Dispatch GRF Start should be the same for all shaders on gen5 */
1963       if (brw_wm_state_has_ksp(wm, 1)) {
1964          assert(wm_prog_data->base.dispatch_grf_start_reg ==
1965                 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
1966       }
1967       if (brw_wm_state_has_ksp(wm, 2)) {
1968          assert(wm_prog_data->base.dispatch_grf_start_reg ==
1969                 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
1970       }
1971 #elif GEN_GEN == 6
1972       /* On gen6, we have multiple shader kernels and we no longer specify a
1973        * register count for each one.
1974        */
1975       wm.KernelStartPointer0 = stage_state->prog_offset +
1976                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
1977       wm.KernelStartPointer1 = stage_state->prog_offset +
1978                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
1979       wm.KernelStartPointer2 = stage_state->prog_offset +
1980                                brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1981
1982       wm.DispatchGRFStartRegisterForConstantSetupData0 =
1983          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
1984       wm.DispatchGRFStartRegisterForConstantSetupData1 =
1985          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
1986       wm.DispatchGRFStartRegisterForConstantSetupData2 =
1987          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
1988 #endif
1989
1990 #if GEN_GEN <= 5
1991       wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
1992       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1993       wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
1994       wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
1995       wm.SetupURBEntryReadOffset = 0;
1996       wm.EarlyDepthTestEnable = true;
1997 #endif
1998
           /* Note that the two antialiasing region widths are swapped between
            * the GEN_GEN >= 6 branch and the older one.
            */
1999 #if GEN_GEN >= 6
2000       wm.LineAntialiasingRegionWidth = _10pixels;
2001       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2002
2003       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2004       wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
2005 #else
2006       if (stage_state->sampler_count)
2007          wm.SamplerStatePointer =
2008             ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2009
2010       wm.LineAntialiasingRegionWidth = _05pixels;
2011       wm.LineEndCapAntialiasingRegionWidth = _10pixels;
2012
2013       /* _NEW_POLYGON */
2014       if (ctx->Polygon.OffsetFill) {
2015          wm.GlobalDepthOffsetEnable = true;
2016          /* Something weird going on with legacy_global_depth_bias,
2017           * offset_constant, scaling and MRD.  This value passes glean
2018           * but gives some odd results elsewhere (eg. the
2019           * quad-offset-units test).
2020           */
2021          wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
2022
2023          /* This is the only value that passes glean:
2024          */
2025          wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
2026       }
2027
2028       wm.DepthCoefficientURBReadOffset = 1;
2029 #endif
2030
2031       /* BRW_NEW_STATS_WM */
2032       wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
2033
2034 #if GEN_GEN < 7
2035       if (wm_prog_data->base.use_alt_mode)
2036          wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2037
           /* NOTE(review): this statement sits inside `#if GEN_GEN < 7`, so the
            * GEN_GEN == 11 arm of the condition can never be true here; only
            * Gen5 gets the forced 0.
            */
2038       /* WA_1606682166 */
2039       wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
2040          0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
2041
2042       wm.BindingTableEntryCount =
2043          wm_prog_data->base.binding_table.size_bytes / 4;
2044       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
2045
2046 #if GEN_GEN == 6
2047       wm.DualSourceBlendEnable =
2048          wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
2049          ctx->Color.Blend[0]._UsesDualSrc;
2050       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
2051       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
2052
2053       /* From the SNB PRM, volume 2 part 1, page 281:
2054        * "If the PS kernel does not need the Position XY Offsets
2055        * to compute a Position XY value, then this field should be
2056        * programmed to POSOFFSET_NONE."
2057        *
2058        * "SW Recommendation: If the PS kernel needs the Position Offsets
2059        * to compute a Position XY value, this field should match Position
2060        * ZW Interpolation Mode to ensure a consistent position.xyzw
2061        * computation."
2062        * We only require XY sample offsets. So, this recommendation doesn't
2063        * look useful at the moment. We might need this in future.
2064        */
2065       if (wm_prog_data->uses_pos_offset)
2066          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
2067       else
2068          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
2069 #endif
2070
2071       if (wm_prog_data->base.total_scratch) {
2072          wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
              /* ffs() returns the 1-based index of the lowest set bit.
               * NOTE(review): assuming per_thread_scratch is a power of two of
               * at least 1KB this evaluates to log2(size) - 10 -- confirm the
               * field encoding against the PRM.
               */
2073          wm.PerThreadScratchSpace =
2074             ffs(stage_state->per_thread_scratch) - 11;
2075       }
2076
2077       wm.PixelShaderComputedDepth = writes_depth;
2078 #endif
2079
2080       /* _NEW_LINE */
2081       wm.LineStippleEnable = ctx->Line.StippleFlag;
2082
2083       /* _NEW_POLYGON */
2084       wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
2085
2086 #if GEN_GEN < 8
2087
2088 #if GEN_GEN >= 6
2089       wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
2090
2091       /* _NEW_BUFFERS */
2092       const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
2093
2094       if (multisampled_fbo) {
2095          /* _NEW_MULTISAMPLE */
2096          if (ctx->Multisample.Enabled)
2097             wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
2098          else
2099             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
2100
2101          if (wm_prog_data->persample_dispatch)
2102             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2103          else
2104             wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
2105       } else {
              /* At most one sample per pixel: the dispatch mode is set to
               * per-sample regardless of the program.
               */
2106          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
2107          wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2108       }
2109 #endif
2110       wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
2111       if (wm_prog_data->uses_kill ||
2112           _mesa_is_alpha_test_enabled(ctx) ||
2113           _mesa_is_alpha_to_coverage_enabled(ctx) ||
2114           (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
2115          wm.PixelShaderKillsPixel = true;
2116       }
2117
           /* Threads are dispatched only when the shader has an observable
            * effect; this reads wm.PixelShaderKillsPixel as set just above.
            */
2118       /* _NEW_BUFFERS | _NEW_COLOR */
2119       if (brw_color_buffer_write_enabled(brw) || writes_depth ||
2120           wm.PixelShaderKillsPixel ||
2121           (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
2122          wm.ThreadDispatchEnable = true;
2123       }
2124
2125 #if GEN_GEN >= 7
2126       wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
2127       wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2128 #endif
2129
2130       /* The "UAV access enable" bits are unnecessary on HSW because they only
2131        * seem to have an effect on the HW-assisted coherency mechanism which we
2132        * don't need, and the rasterization-related UAV_ONLY flag and the
2133        * DISPATCH_ENABLE bit can be set independently from it.
2134        * C.f. gen8_upload_ps_extra().
2135        *
2136        * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
2137        * _NEW_COLOR
2138        */
2139 #if GEN_IS_HASWELL
2140       if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
2141           wm_prog_data->has_side_effects)
2142          wm.PSUAVonly = ON;
2143 #endif
2144 #endif
2145
2146 #if GEN_GEN >= 7
2147       /* BRW_NEW_FS_PROG_DATA */
2148       if (wm_prog_data->early_fragment_tests)
2149          wm.EarlyDepthStencilControl = EDSC_PREPS;
2150       else if (wm_prog_data->has_side_effects)
2151          wm.EarlyDepthStencilControl = EDSC_PSEXEC;
2152 #endif
2153    }
2154
2155 #if GEN_GEN <= 5
        /* brw->wm.offset_clamp remembers the last clamp written here, so the
         * packet is only emitted again when the GL value changes.
         */
2156    if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
2157       brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
2158          clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
2159       }
2160
2161       brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
2162    }
2163 #endif
2164 }
2165
2166 static const struct brw_tracked_state genX(wm_state) = {
2167    /* Pairs genX(upload_wm) with its Mesa and driver dirty flags; the
2168     * conditional terms are chosen at compile time from GEN_GEN. */
2169    .emit = genX(upload_wm),
2170    .dirty = {
2171       .mesa  = (GEN_GEN <= 5 ? _NEW_POLYGONSTIPPLE : 0) |
2172                (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2173                (GEN_GEN >= 6 && GEN_GEN <= 7 ? _NEW_MULTISAMPLE : 0) |
2174                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_COLOR : 0) |
2175                _NEW_POLYGON |
2176                _NEW_LINE,
2177       .brw   = (GEN_GEN <= 6 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT) |
2178                (GEN_GEN <= 5 ? BRW_NEW_STATS_WM |
2179                                BRW_NEW_SAMPLER_STATE_TABLE |
2180                                BRW_NEW_PROGRAM_CACHE |
2181                                BRW_NEW_FRAGMENT_PROGRAM |
2182                                BRW_NEW_PUSH_CONSTANT_ALLOCATION
2183                              : 0) |
2184                BRW_NEW_FS_PROG_DATA |
2185                BRW_NEW_BLORP,
2186    },
2187 };
2188
2189 /* ---------------------------------------------------------------------- */
2190
2191 /* We restrict scratch buffers to the bottom 32 bits of the address space
2192  * by using rw_32_bo().
2193  *
2194  * General State Base Address is a bit broken.  If the address + size as
2195  * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
2196  * all accesses to the buffer as being out of bounds and returns zero.
2197  */
2198
     /* INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)
      *
      * Fills the thread-dispatch fields of the packet struct `pkt`: kernel
      * start pointer, sampler count, binding table entry count, floating point
      * mode, scratch space (only when stage_prog_data->total_scratch is
      * non-zero), dispatch GRF start register, URB entry read length/offset,
      * and finally sets StatisticsEnable and Enable to true.
      *
      * `prefix` is token-pasted in front of URBEntryReadLength and
      * URBEntryReadOffset; genX(upload_vs_state) passes `Vertex`.
      *
      * The expansion is a plain sequence of statements (it is not wrapped in
      * do { } while (0)), so it must be used as a statement inside a block.  It
      * refers to `brw`, `stage_state`, `stage_prog_data` and `vue_prog_data`
      * by name, which therefore have to be in scope at the expansion site.
      *
      * When GEN_GEN == 11, SamplerCount and BindingTableEntryCount are both
      * forced to 0 (see the workaround notes inside the macro).
      */
2199 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
2200    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
2201    /* WA_1606682166 */                                                    \
2202    pkt.SamplerCount       =                                               \
2203       GEN_GEN == 11 ?                                                     \
2204       0 :                                                                 \
2205       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
2206    /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
2207     * disable prefetching of binding tables in A0 and B0 steppings.       \
2208     * TODO: Revisit this WA on C0 stepping.                               \
2209     */                                                                    \
2210    pkt.BindingTableEntryCount =                                           \
2211       GEN_GEN == 11 ?                                                     \
2212       0 :                                                                 \
2213       stage_prog_data->binding_table.size_bytes / 4;                      \
2214    pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
2215                                                                           \
2216    if (stage_prog_data->total_scratch) {                                  \
2217       pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
2218       pkt.PerThreadScratchSpace =                                         \
2219          ffs(stage_state->per_thread_scratch) - 11;                       \
2220    }                                                                      \
2221                                                                           \
2222    pkt.DispatchGRFStartRegisterForURBData =                               \
2223       stage_prog_data->dispatch_grf_start_reg;                            \
2224    pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
2225    pkt.prefix##URBEntryReadOffset = 0;                                    \
2226                                                                           \
2227    pkt.StatisticsEnable = true;                                           \
2228    pkt.Enable           = true;
2229
2230 static void
2231 genX(upload_vs_state)(struct brw_context *brw)
2232 {
        /* Programs the vertex shader unit for the generation selected by
         * GEN_GEN:
         *
         *  - GEN_GEN >= 6: 3DSTATE_VS is emitted with brw_batch_emit().  Gen6
         *    emits 3DSTATE_CONSTANT_VS before it and a PIPE_CONTROL flush after
         *    it; on Ivybridge gen7_emit_vs_workaround_flush() is called first.
         *  - GEN_GEN <= 5: BRW_NEW_GEN4_UNIT_STATE is flagged in
         *    ctx->NewDriverState and VS_STATE is emitted with brw_state_emit()
         *    at stage_state->state_offset.
         *
         * The common fields come from INIT_THREAD_DISPATCH_FIELDS(); several
         * per-generation assignments further down deliberately overwrite what
         * the macro stored.
         */
2233    UNUSED struct gl_context *ctx = &brw->ctx;
2234    const struct gen_device_info *devinfo = &brw->screen->devinfo;
2235    struct brw_stage_state *stage_state = &brw->vs.base;
2236
2237    /* BRW_NEW_VS_PROG_DATA */
2238    const struct brw_vue_prog_data *vue_prog_data =
2239       brw_vue_prog_data(brw->vs.base.prog_data);
2240    const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
2241
        /* Only SIMD8 and 4x2 dual-object dispatch are accepted here, and from
         * GEN_GEN 11 on only SIMD8.
         */
2242    assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
2243           vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
2244    assert(GEN_GEN < 11 ||
2245           vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
2246
2247 #if GEN_GEN == 6
2248    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
2249     * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
2250     *
2251     *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
2252     *   command that causes the VS Function Enable to toggle. Pipeline
2253     *   flush can be executed by sending a PIPE_CONTROL command with CS
2254     *   stall bit set and a post sync operation.
2255     *
2256     * We've already done such a flush at the start of state upload, so we
2257     * don't need to do another one here.
2258     */
2259    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
2260       if (stage_state->push_const_size != 0) {
2261          cvs.Buffer0Valid = true;
2262          cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
2263          cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2264       }
2265    }
2266 #endif
2267
2268    if (GEN_GEN == 7 && devinfo->is_ivybridge)
2269       gen7_emit_vs_workaround_flush(brw);
2270
        /* Both branches open the same brace-delimited block that fills `vs`;
         * only the emit helper differs.
         */
2271 #if GEN_GEN >= 6
2272    brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
2273 #else
2274    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2275    brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
2276 #endif
2277       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
2278
2279       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
2280
2281 #if GEN_GEN < 6
2282       vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
2283       vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
2284       vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
2285
2286       vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
2287       vs.URBEntryAllocationSize = brw->urb.vsize - 1;
2288
           /* Replaces the value stored a few lines above: here the thread count
            * is additionally limited to half the number of VS URB entries (and
            * is at least one thread).
            */
2289       vs.MaximumNumberofThreads =
2290          CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
2291
           /* Overrides the `true` stored by INIT_THREAD_DISPATCH_FIELDS(). */
2292       vs.StatisticsEnable = false;
2293       vs.SamplerStatePointer =
2294          ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2295 #endif
2296
2297 #if GEN_GEN == 5
2298       /* Force single program flow on Ironlake.  We cannot reliably get
2299        * all applications working without it.  See:
2300        * https://bugs.freedesktop.org/show_bug.cgi?id=29172
2301        *
2302        * The most notable and reliably failing application is the Humus
2303        * demo "CelShading"
2304        */
2305       vs.SingleProgramFlow = true;
           /* Overrides the count computed by INIT_THREAD_DISPATCH_FIELDS(). */
2306       vs.SamplerCount = 0; /* hardware requirement */
2307 #endif
2308
2309 #if GEN_GEN >= 8
2310       vs.SIMD8DispatchEnable =
2311          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
2312
2313       vs.UserClipDistanceCullTestEnableBitmask =
2314          vue_prog_data->cull_distance_mask;
2315 #endif
2316    }
2317
2318 #if GEN_GEN == 6
2319    /* Based on my reading of the simulator, the VS constants don't get
2320     * pulled into the VS FF unit until an appropriate pipeline flush
2321     * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
2322     * references to them into a little FIFO.  The flushes are common,
2323     * but don't reliably happen between this and a 3DPRIMITIVE, causing
2324     * the primitive to use the wrong constants.  Then the FIFO
2325     * containing the constant setup gets added to again on the next
2326     * constants change, and eventually when a flush does happen the
2327     * unit is overwhelmed by constant changes and dies.
2328     *
2329     * To avoid this, send a PIPE_CONTROL down the line that will
2330     * update the unit immediately loading the constants.  The flush
2331     * type bits here were those set by the STATE_BASE_ADDRESS whose
2332     * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
2333     * bug reports that led to this workaround, and may be more than
2334     * what is strictly required to avoid the issue.
2335     */
2336    brw_emit_pipe_control_flush(brw,
2337                                PIPE_CONTROL_DEPTH_STALL |
2338                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
2339                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
2340 #endif
2341 }
2342
2343 static const struct brw_tracked_state genX(vs_state) = {
2344    /* Pairs genX(upload_vs_state) with its Mesa and driver dirty flags. */
2345    .emit = genX(upload_vs_state),
2346    .dirty = {
2347       .mesa  = (GEN_GEN == 6 ? _NEW_TRANSFORM | _NEW_PROGRAM_CONSTANTS : 0),
2348       .brw   = (GEN_GEN <= 5 ? BRW_NEW_URB_FENCE |
2349                                BRW_NEW_SAMPLER_STATE_TABLE |
2350                                BRW_NEW_PROGRAM_CACHE |
2351                                BRW_NEW_PUSH_CONSTANT_ALLOCATION
2352                              : 0) |
2353                (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2354                BRW_NEW_VS_PROG_DATA |
2355                BRW_NEW_CONTEXT |
2356                BRW_NEW_BLORP | BRW_NEW_BATCH,
2357    },
2358 };
2359
2360 /* ---------------------------------------------------------------------- */
2361
2362 static void
2363 genX(upload_cc_viewport)(struct brw_context *brw)
2364 {
2365    /* Packs one CC_VIEWPORT (minimum/maximum depth) per viewport into the
2366     * space returned by brw_state_batch(), then hands the resulting offset
2367     * to the hardware packet or driver field selected by GEN_GEN.
2368     */
2369    struct gl_context *ctx = &brw->ctx;
2370    /* BRW_NEW_VIEWPORT_COUNT */
2371    const unsigned num_viewports = brw->clip.viewport_count;
2372    uint32_t cc_vp_offset;
2373    uint32_t *cc_map =
2374       brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * num_viewports,
2375                       32, &cc_vp_offset);
2376    struct GENX(CC_VIEWPORT) ccv;
2377
2378    /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2379    for (unsigned v = 0; v < num_viewports; v++) {
2380       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[v];
2381       const bool clamp_near = ctx->Transform.DepthClampNear;
2382       const bool clamp_far = ctx->Transform.DepthClampFar;
2383
2384       if (clamp_near)
2385          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2386       else
2387          ccv.MinimumDepth = 0.0;
2388
2389       /* Without the far clamp the maximum is 1.0, or 0.0 if near is on. */
2390       if (clamp_far)
2391          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2392       else
2393          ccv.MaximumDepth = clamp_near ? 0.0 : 1.0;
2394       GENX(CC_VIEWPORT_pack)(NULL, cc_map + v * GENX(CC_VIEWPORT_length), &ccv);
2395    }
2396 #if GEN_GEN >= 7
2397    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptrs) {
2398       ptrs.CCViewportPointer = cc_vp_offset;
2399    }
2400 #elif GEN_GEN == 6
2401    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), ptrs) {
2402       ptrs.PointertoCC_VIEWPORT = cc_vp_offset;
2403       ptrs.CCViewportStateChange = 1;
2404    }
2405 #else
2406    ctx->NewDriverState |= BRW_NEW_CC_VP;
2407    brw->cc.vp_offset = cc_vp_offset;
2408 #endif
2409 }
2410
2411 const struct brw_tracked_state genX(cc_vp) = {
2412    /* Pairs genX(upload_cc_viewport) with its Mesa and driver dirty flags. */
2413    .emit = genX(upload_cc_viewport),
2414    .dirty = {
2415       .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
2416       .brw = BRW_NEW_VIEWPORT_COUNT |
2417              BRW_NEW_BLORP |
2418              BRW_NEW_BATCH,
2419    },
2420 };
2421
2422 /* ---------------------------------------------------------------------- */
2423
2424 static void
2425 set_scissor_bits(const struct gl_context *ctx, int i,
2426                  bool flip_y, unsigned fb_width, unsigned fb_height,
2427                  struct GENX(SCISSOR_RECT) *sc)
2428 {
2429    int bbox[4];
2430
2431    bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2432    bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2433    bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
2434    bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2435    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2436
2437    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2438       /* If the scissor was out of bounds and got clamped to 0 width/height
2439        * at the bounds, the subtraction of 1 from maximums could produce a
2440        * negative number and thus not clip anything.  Instead, just provide
2441        * a min > max scissor inside the bounds, which produces the expected
2442        * no rendering.
2443        */
2444       sc->ScissorRectangleXMin = 1;
2445       sc->ScissorRectangleXMax = 0;
2446       sc->ScissorRectangleYMin = 1;
2447       sc->ScissorRectangleYMax = 0;
2448    } else if (!flip_y) {
2449       /* texmemory: Y=0=bottom */
2450       sc->ScissorRectangleXMin = bbox[0];
2451       sc->ScissorRectangleXMax = bbox[1] - 1;
2452       sc->ScissorRectangleYMin = bbox[2];
2453       sc->ScissorRectangleYMax = bbox[3] - 1;
2454    } else {
2455       /* memory: Y=0=top */
2456       sc->ScissorRectangleXMin = bbox[0];
2457       sc->ScissorRectangleXMax = bbox[1] - 1;
2458       sc->ScissorRectangleYMin = fb_height - bbox[3];
2459       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2460    }
2461 }
2462
2463 #if GEN_GEN >= 6
2464 static void
2465 genX(upload_scissor_state)(struct brw_context *brw)
2466 {
2467    struct gl_context *ctx = &brw->ctx;
2468    const bool flip_y = ctx->DrawBuffer->FlipY;
2469    struct GENX(SCISSOR_RECT) scissor;
2470    uint32_t scissor_state_offset;
2471    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2472    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2473    uint32_t *scissor_map;
2474
2475    /* BRW_NEW_VIEWPORT_COUNT */
2476    const unsigned viewport_count = brw->clip.viewport_count;
2477
2478    scissor_map = brw_state_batch(
2479       brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2480       32, &scissor_state_offset);
2481
2482    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2483
2484    /* The scissor only needs to handle the intersection of drawable and
2485     * scissor rect.  Clipping to the boundaries of static shared buffers
2486     * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2487     *
2488     * Note that the hardware's coordinates are inclusive, while Mesa's min is
2489     * inclusive but max is exclusive.
2490     */
2491    for (unsigned i = 0; i < viewport_count; i++) {
2492       set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
2493       GENX(SCISSOR_RECT_pack)(
2494          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2495    }
2496
2497    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2498       ptr.ScissorRectPointer = scissor_state_offset;
2499    }
2500 }
2501
2502 static const struct brw_tracked_state genX(scissor_state) = {
2503    .dirty = {
2504       .mesa = _NEW_BUFFERS |
2505               _NEW_SCISSOR |
2506               _NEW_VIEWPORT,
2507       .brw = BRW_NEW_BATCH |
2508              BRW_NEW_BLORP |
2509              BRW_NEW_VIEWPORT_COUNT,
2510    },
2511    .emit = genX(upload_scissor_state),
2512 };
2513 #endif
2514
2515 /* ---------------------------------------------------------------------- */
2516
2517 static void
2518 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2519                              float m00, float m11, float m30, float m31,
2520                              float *xmin, float *xmax,
2521                              float *ymin, float *ymax)
2522 {
2523    /* According to the "Vertex X,Y Clamping and Quantization" section of the
2524     * Strips and Fans documentation:
2525     *
2526     * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2527     *  fixed-point "guardband" range supported by the rasterization hardware"
2528     *
2529     * and
2530     *
2531     * "In almost all circumstances, if an object’s vertices are actually
2532     *  modified by this clamping (i.e., had X or Y coordinates outside of
2533     *  the guardband extent the rendered object will not match the intended
2534     *  result.  Therefore software should take steps to ensure that this does
2535     *  not happen - e.g., by clipping objects such that they do not exceed
2536     *  these limits after the Drawing Rectangle is applied."
2537     *
2538     * I believe the fundamental restriction is that the rasterizer (in
2539     * the SF/WM stages) have a limit on the number of pixels that can be
2540     * rasterized.  We need to ensure any coordinates beyond the rasterizer
2541     * limit are handled by the clipper.  So effectively that limit becomes
2542     * the clipper's guardband size.
2543     *
2544     * It goes on to say:
2545     *
2546     * "In addition, in order to be correctly rendered, objects must have a
2547     *  screenspace bounding box not exceeding 8K in the X or Y direction.
2548     *  This additional restriction must also be comprehended by software,
2549     *  i.e., enforced by use of clipping."
2550     *
2551     * This makes no sense.  Gen7+ hardware supports 16K render targets,
2552     * and you definitely need to be able to draw polygons that fill the
2553     * surface.  Our assumption is that the rasterizer was limited to 8K
2554     * on Sandybridge, which only supports 8K surfaces, and it was actually
2555     * increased to 16K on Ivybridge and later.
2556     *
2557     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2558     */
2559    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2560
2561    /* Workaround: prevent gpu hangs on SandyBridge
2562     * by disabling guardband clipping for odd dimensions.
2563     */
2564    if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
2565       *xmin = -1.0f;
2566       *xmax =  1.0f;
2567       *ymin = -1.0f;
2568       *ymax =  1.0f;
2569       return;
2570    }
2571
2572    if (m00 != 0 && m11 != 0) {
2573       /* First, we compute the screen-space render area */
2574       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2575       const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2576       const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2577       const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2578
2579       /* We want the guardband to be centered on that */
2580       const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2581       const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2582       const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2583       const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2584
2585       /* Now we need it in native device coordinates */
2586       const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2587       const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2588       const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2589       const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2590
2591       /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2592        * flipped upside-down.  X should be fine though.
2593        */
2594       assert(ndc_gb_xmin <= ndc_gb_xmax);
2595       *xmin = ndc_gb_xmin;
2596       *xmax = ndc_gb_xmax;
2597       *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2598       *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2599    } else {
2600       /* The viewport scales to 0, so nothing will be rendered. */
2601       *xmin = 0.0f;
2602       *xmax = 0.0f;
2603       *ymin = 0.0f;
2604       *ymax = 0.0f;
2605    }
2606 }
2607
2608 static void
2609 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2610 {
2611    struct gl_context *ctx = &brw->ctx;
2612    float y_scale, y_bias;
2613
2614    /* BRW_NEW_VIEWPORT_COUNT */
2615    const unsigned viewport_count = brw->clip.viewport_count;
2616
2617    /* _NEW_BUFFERS */
2618    const bool flip_y = ctx->DrawBuffer->FlipY;
2619    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2620    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2621
2622 #if GEN_GEN >= 7
2623 #define clv sfv
2624    struct GENX(SF_CLIP_VIEWPORT) sfv;
2625    uint32_t sf_clip_vp_offset;
2626    uint32_t *sf_clip_map =
2627       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2628                       64, &sf_clip_vp_offset);
2629 #else
2630    struct GENX(SF_VIEWPORT) sfv;
2631    struct GENX(CLIP_VIEWPORT) clv;
2632    uint32_t sf_vp_offset, clip_vp_offset;
2633    uint32_t *sf_map =
2634       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2635                       32, &sf_vp_offset);
2636    uint32_t *clip_map =
2637       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2638                       32, &clip_vp_offset);
2639 #endif
2640
2641    /* _NEW_BUFFERS */
2642    if (flip_y) {
2643       y_scale = -1.0;
2644       y_bias = (float)fb_height;
2645    } else {
2646       y_scale = 1.0;
2647       y_bias = 0;
2648    }
2649
2650    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2651       /* _NEW_VIEWPORT: Guardband Clipping */
2652       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2653       _mesa_get_viewport_xform(ctx, i, scale, translate);
2654
2655       sfv.ViewportMatrixElementm00 = scale[0];
2656       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2657       sfv.ViewportMatrixElementm22 = scale[2],
2658       sfv.ViewportMatrixElementm30 = translate[0],
2659       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2660       sfv.ViewportMatrixElementm32 = translate[2],
2661       brw_calculate_guardband_size(fb_width, fb_height,
2662                                    sfv.ViewportMatrixElementm00,
2663                                    sfv.ViewportMatrixElementm11,
2664                                    sfv.ViewportMatrixElementm30,
2665                                    sfv.ViewportMatrixElementm31,
2666                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2667
2668
2669       clv.XMinClipGuardband = gb_xmin;
2670       clv.XMaxClipGuardband = gb_xmax;
2671       clv.YMinClipGuardband = gb_ymin;
2672       clv.YMaxClipGuardband = gb_ymax;
2673
2674 #if GEN_GEN < 6
2675       set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
2676                        &sfv.ScissorRectangle);
2677 #elif GEN_GEN >= 8
2678       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2679        * The hardware will take the intersection of the drawing rectangle,
2680        * scissor rectangle, and the viewport extents.  However, emitting
2681        * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2682        * pipeline stall so we're better off just being a little more clever
2683        * with our viewport so we can emit it once at context creation time.
2684        */
2685       const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2686       const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2687       const float viewport_Xmax =
2688          MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2689       const float viewport_Ymax =
2690          MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2691
2692       if (flip_y) {
2693          sfv.XMinViewPort = viewport_Xmin;
2694          sfv.XMaxViewPort = viewport_Xmax - 1;
2695          sfv.YMinViewPort = fb_height - viewport_Ymax;
2696          sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2697       } else {
2698          sfv.XMinViewPort = viewport_Xmin;
2699          sfv.XMaxViewPort = viewport_Xmax - 1;
2700          sfv.YMinViewPort = viewport_Ymin;
2701          sfv.YMaxViewPort = viewport_Ymax - 1;
2702       }
2703 #endif
2704
2705 #if GEN_GEN >= 7
2706       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2707       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2708 #else
2709       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2710       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2711       sf_map += GENX(SF_VIEWPORT_length);
2712       clip_map += GENX(CLIP_VIEWPORT_length);
2713 #endif
2714    }
2715
2716 #if GEN_GEN >= 7
2717    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2718       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2719    }
2720 #elif GEN_GEN == 6
2721    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2722       vp.SFViewportStateChange = 1;
2723       vp.CLIPViewportStateChange = 1;
2724       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2725       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2726    }
2727 #else
2728    brw->sf.vp_offset = sf_vp_offset;
2729    brw->clip.vp_offset = clip_vp_offset;
2730    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2731 #endif
2732 }
2733
2734 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2735    .dirty = {
2736       .mesa = _NEW_BUFFERS |
2737               _NEW_VIEWPORT |
2738               (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2739       .brw = BRW_NEW_BATCH |
2740              BRW_NEW_BLORP |
2741              BRW_NEW_VIEWPORT_COUNT,
2742    },
2743    .emit = genX(upload_sf_clip_viewport),
2744 };
2745
2746 /* ---------------------------------------------------------------------- */
2747
2748 static void
2749 genX(upload_gs_state)(struct brw_context *brw)
2750 {
2751    UNUSED struct gl_context *ctx = &brw->ctx;
2752    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
2753    const struct brw_stage_state *stage_state = &brw->gs.base;
2754    const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
2755    /* BRW_NEW_GEOMETRY_PROGRAM */
2756    bool active = GEN_GEN >= 6 && gs_prog;
2757
2758    /* BRW_NEW_GS_PROG_DATA */
2759    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2760    UNUSED const struct brw_vue_prog_data *vue_prog_data =
2761       brw_vue_prog_data(stage_prog_data);
2762 #if GEN_GEN >= 7
2763    const struct brw_gs_prog_data *gs_prog_data =
2764       brw_gs_prog_data(stage_prog_data);
2765 #endif
2766
2767 #if GEN_GEN == 6
2768    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2769       if (active && stage_state->push_const_size != 0) {
2770          cgs.Buffer0Valid = true;
2771          cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
2772          cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2773       }
2774    }
2775 #endif
2776
2777 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2778    /**
2779     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2780     * Geometry > Geometry Shader > State:
2781     *
2782     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2783     *     whole fixed function pipeline when the GS enable changes value in
2784     *     the 3DSTATE_GS."
2785     *
2786     * The hardware architects have clarified that in this context "flush the
2787     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2788     * Stall" bit set.
2789     */
2790    if (devinfo->gt == 2 && brw->gs.enabled != active)
2791       gen7_emit_cs_stall_flush(brw);
2792 #endif
2793
2794 #if GEN_GEN >= 6
2795    brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2796 #else
2797    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2798    brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
2799 #endif
2800
2801 #if GEN_GEN >= 6
2802       if (active) {
2803          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2804
2805 #if GEN_GEN >= 7
2806          gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2807          gs.OutputTopology = gs_prog_data->output_topology;
2808          gs.ControlDataHeaderSize =
2809             gs_prog_data->control_data_header_size_hwords;
2810
2811          gs.InstanceControl = gs_prog_data->invocations - 1;
2812          gs.DispatchMode = vue_prog_data->dispatch_mode;
2813
2814          gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2815
2816          gs.ControlDataFormat = gs_prog_data->control_data_format;
2817 #endif
2818
2819          /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2820           * Ivy Bridge and Haswell.
2821           *
2822           * On Ivy Bridge, setting this bit causes the vertices of a triangle
2823           * strip to be delivered to the geometry shader in an order that does
2824           * not strictly follow the OpenGL spec, but preserves triangle
2825           * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
2826           * the geometry shader sees triangles:
2827           *
2828           * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2829           *
2830           * (Clearing the bit is even worse, because it fails to preserve
2831           * orientation).
2832           *
2833           * Triangle strips with adjacency always ordered in a way that preserves
2834           * triangle orientation but does not strictly follow the OpenGL spec,
2835           * regardless of the setting of this bit.
2836           *
2837           * On Haswell, both triangle strips and triangle strips with adjacency
2838           * are always ordered in a way that preserves triangle orientation.
2839           * Setting this bit causes the ordering to strictly follow the OpenGL
2840           * spec.
2841           *
2842           * So in either case we want to set the bit.  Unfortunately on Ivy
2843           * Bridge this will get the order close to correct but not perfect.
2844           */
2845          gs.ReorderMode = TRAILING;
2846          gs.MaximumNumberofThreads =
2847             GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2848                          : (devinfo->max_gs_threads - 1);
2849
2850 #if GEN_GEN < 7
2851          gs.SOStatisticsEnable = true;
2852          if (gs_prog->info.has_transform_feedback_varyings)
2853             gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
2854
2855          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
2856           * was previously done for gen6.
2857           *
2858           * TODO: test with both disabled to see if the HW is behaving
2859           * as expected, like in gen7.
2860           */
2861          gs.SingleProgramFlow = true;
2862          gs.VectorMaskEnable = true;
2863 #endif
2864
2865 #if GEN_GEN >= 8
2866          gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2867
2868          if (gs_prog_data->static_vertex_count != -1) {
2869             gs.StaticOutput = true;
2870             gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2871          }
2872          gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2873
2874          gs.UserClipDistanceCullTestEnableBitmask =
2875             vue_prog_data->cull_distance_mask;
2876
2877          const int urb_entry_write_offset = 1;
2878          const uint32_t urb_entry_output_length =
2879             DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2880             urb_entry_write_offset;
2881
2882          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2883          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2884 #endif
2885       }
2886 #endif
2887
2888 #if GEN_GEN <= 6
2889       if (!active && brw->ff_gs.prog_active) {
2890          /* In gen6, transform feedback for the VS stage is done with an
2891           * ad-hoc GS program. This function provides the needed 3DSTATE_GS
2892           * for this.
2893           */
2894          gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
2895          gs.SingleProgramFlow = true;
2896          gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
2897          gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
2898
2899 #if GEN_GEN <= 5
2900          gs.GRFRegisterCount =
2901             DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
2902          /* BRW_NEW_URB_FENCE */
2903          gs.NumberofURBEntries = brw->urb.nr_gs_entries;
2904          gs.URBEntryAllocationSize = brw->urb.vsize - 1;
2905          gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
2906          gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2907 #else
2908          gs.Enable = true;
2909          gs.VectorMaskEnable = true;
2910          gs.SVBIPayloadEnable = true;
2911          gs.SVBIPostIncrementEnable = true;
2912          gs.SVBIPostIncrementValue =
2913             brw->ff_gs.prog_data->svbi_postincrement_value;
2914          gs.SOStatisticsEnable = true;
2915          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2916 #endif
2917       }
2918 #endif
2919       if (!active && !brw->ff_gs.prog_active) {
2920 #if GEN_GEN < 8
2921          gs.DispatchGRFStartRegisterForURBData = 1;
2922 #if GEN_GEN >= 7
2923          gs.IncludeVertexHandles = true;
2924 #endif
2925 #endif
2926       }
2927
2928 #if GEN_GEN >= 6
2929       gs.StatisticsEnable = true;
2930 #endif
2931 #if GEN_GEN == 5 || GEN_GEN == 6
2932       gs.RenderingEnabled = true;
2933 #endif
2934 #if GEN_GEN <= 5
2935       gs.MaximumVPIndex = brw->clip.viewport_count - 1;
2936 #endif
2937    }
2938
2939 #if GEN_GEN == 6
2940    brw->gs.enabled = active;
2941 #endif
2942 }
2943
2944 static const struct brw_tracked_state genX(gs_state) = {
2945    .dirty = {
2946       .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2947       .brw   = BRW_NEW_BATCH |
2948                BRW_NEW_BLORP |
2949                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2950                                BRW_NEW_PROGRAM_CACHE |
2951                                BRW_NEW_URB_FENCE |
2952                                BRW_NEW_VIEWPORT_COUNT
2953                              : 0) |
2954                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
2955                                BRW_NEW_GEOMETRY_PROGRAM |
2956                                BRW_NEW_GS_PROG_DATA
2957                              : 0) |
2958                (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2959    },
2960    .emit = genX(upload_gs_state),
2961 };
2962
2963 /* ---------------------------------------------------------------------- */
2964
2965 UNUSED static GLenum
2966 fix_dual_blend_alpha_to_one(GLenum function)
2967 {
2968    switch (function) {
2969    case GL_SRC1_ALPHA:
2970       return GL_ONE;
2971
2972    case GL_ONE_MINUS_SRC1_ALPHA:
2973       return GL_ZERO;
2974    }
2975
2976    return function;
2977 }
2978
2979 #define blend_factor(x) brw_translate_blend_factor(x)
2980 #define blend_eqn(x) brw_translate_blend_equation(x)
2981
2982 /**
2983  * Modify blend function to force destination alpha to 1.0
2984  *
2985  * If \c function specifies a blend function that uses destination alpha,
2986  * replace it with a function that hard-wires destination alpha to 1.0.  This
2987  * is used when rendering to xRGB targets.
2988  */
2989 static GLenum
2990 brw_fix_xRGB_alpha(GLenum function)
2991 {
2992    switch (function) {
2993    case GL_DST_ALPHA:
2994       return GL_ONE;
2995
2996    case GL_ONE_MINUS_DST_ALPHA:
2997    case GL_SRC_ALPHA_SATURATE:
2998       return GL_ZERO;
2999    }
3000
3001    return function;
3002 }
3003
3004 #if GEN_GEN >= 6
3005 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
3006 #else
3007 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
3008 #endif
3009
3010 UNUSED static bool
3011 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
3012                      bool alpha_to_one)
3013 {
3014    struct gl_context *ctx = &brw->ctx;
3015
3016    /* _NEW_BUFFERS */
3017    const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
3018
3019    bool independent_alpha_blend = false;
3020
3021    /* Used for implementing the following bit of GL_EXT_texture_integer:
3022     * "Per-fragment operations that require floating-point color
3023     *  components, including multisample alpha operations, alpha test,
3024     *  blending, and dithering, have no effect when the corresponding
3025     *  colors are written to an integer color buffer."
3026     */
3027    const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
3028
3029    const unsigned blend_enabled = GEN_GEN >= 6 ?
3030       ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
3031
3032    /* _NEW_COLOR */
3033    if (ctx->Color.ColorLogicOpEnabled) {
3034       GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
3035          : GL_UNSIGNED_NORMALIZED;
3036       WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
3037                 rb_type != GL_UNSIGNED_NORMALIZED &&
3038                 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
3039                 "renderbuffer\n",
3040                 _mesa_enum_to_string(ctx->Color.LogicOp),
3041                 _mesa_enum_to_string(rb_type));
3042       if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
3043          entry->LogicOpEnable = true;
3044          entry->LogicOpFunction = ctx->Color._LogicOp;
3045       }
3046    } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
3047               && (GEN_GEN <= 5 || !integer)) {
3048       GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
3049       GLenum eqA = ctx->Color.Blend[i].EquationA;
3050       GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
3051       GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
3052       GLenum srcA = ctx->Color.Blend[i].SrcA;
3053       GLenum dstA = ctx->Color.Blend[i].DstA;
3054
3055       if (eqRGB == GL_MIN || eqRGB == GL_MAX)
3056          srcRGB = dstRGB = GL_ONE;
3057
3058       if (eqA == GL_MIN || eqA == GL_MAX)
3059          srcA = dstA = GL_ONE;
3060
3061       /* Due to hardware limitations, the destination may have information
3062        * in an alpha channel even when the format specifies no alpha
3063        * channel. In order to avoid getting any incorrect blending due to
3064        * that alpha channel, coerce the blend factors to values that will
3065        * not read the alpha channel, but will instead use the correct
3066        * implicit value for alpha.
3067        */
3068       if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
3069                                                GL_TEXTURE_ALPHA_TYPE)) {
3070          srcRGB = brw_fix_xRGB_alpha(srcRGB);
3071          srcA = brw_fix_xRGB_alpha(srcA);
3072          dstRGB = brw_fix_xRGB_alpha(dstRGB);
3073          dstA = brw_fix_xRGB_alpha(dstA);
3074       }
3075
3076       /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
3077        * "If Dual Source Blending is enabled, this bit must be disabled."
3078        *
3079        * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
3080        * and leave it enabled anyway.
3081        */
3082       if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
3083          srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
3084          srcA = fix_dual_blend_alpha_to_one(srcA);
3085          dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
3086          dstA = fix_dual_blend_alpha_to_one(dstA);
3087       }
3088
3089       /* BRW_NEW_FS_PROG_DATA */
3090       const struct brw_wm_prog_data *wm_prog_data =
3091          brw_wm_prog_data(brw->wm.base.prog_data);
3092
3093       /* The Dual Source Blending documentation says:
3094        *
3095        * "If SRC1 is included in a src/dst blend factor and
3096        * a DualSource RT Write message is not used, results
3097        * are UNDEFINED. (This reflects the same restriction in DX APIs,
3098        * where undefined results are produced if “o1” is not written
3099        * by a PS – there are no default values defined).
3100        * If SRC1 is not included in a src/dst blend factor,
3101        * dual source blending must be disabled."
3102        *
3103        * There is no way to gracefully fix this undefined situation
3104        * so we just disable the blending to prevent possible issues.
3105        */
3106       entry->ColorBufferBlendEnable =
3107          !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
3108
3109       entry->DestinationBlendFactor = blend_factor(dstRGB);
3110       entry->SourceBlendFactor = blend_factor(srcRGB);
3111       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
3112       entry->SourceAlphaBlendFactor = blend_factor(srcA);
3113       entry->ColorBlendFunction = blend_eqn(eqRGB);
3114       entry->AlphaBlendFunction = blend_eqn(eqA);
3115
3116       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
3117          independent_alpha_blend = true;
3118    }
3119
3120    return independent_alpha_blend;
3121 }
3122
3123 #if GEN_GEN >= 6
3124 static void
3125 genX(upload_blend_state)(struct brw_context *brw)
3126 {
3127    struct gl_context *ctx = &brw->ctx;
3128    int size;
3129
3130    /* We need at least one BLEND_STATE written, because we might do
3131     * thread dispatch even if _NumColorDrawBuffers is 0 (for example
3132     * for computed depth or alpha test), which will do an FB write
3133     * with render target 0, which will reference BLEND_STATE[0] for
3134     * alpha test enable.
3135     */
3136    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
3137    if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
3138       nr_draw_buffers = 1;
3139
3140    size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
3141 #if GEN_GEN >= 8
3142    size += GENX(BLEND_STATE_length) * 4;
3143 #endif
3144
3145    uint32_t *blend_map;
3146    blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
3147
3148 #if GEN_GEN >= 8
3149    struct GENX(BLEND_STATE) blend = { 0 };
3150    {
3151 #else
3152    for (int i = 0; i < nr_draw_buffers; i++) {
3153       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3154 #define blend entry
3155 #endif
3156       /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
3157        * "If drawbuffer zero is not NONE and the buffer it references has an
3158        * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
3159        * operations are skipped."
3160        */
3161       if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
3162          /* _NEW_MULTISAMPLE */
3163          if (_mesa_is_multisample_enabled(ctx)) {
3164             if (ctx->Multisample.SampleAlphaToCoverage) {
3165                blend.AlphaToCoverageEnable = true;
3166                blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
3167             }
3168             if (ctx->Multisample.SampleAlphaToOne)
3169                blend.AlphaToOneEnable = true;
3170          }
3171
3172          /* _NEW_COLOR */
3173          if (ctx->Color.AlphaEnabled) {
3174             blend.AlphaTestEnable = true;
3175             blend.AlphaTestFunction =
3176                intel_translate_compare_func(ctx->Color.AlphaFunc);
3177          }
3178
3179          if (ctx->Color.DitherFlag) {
3180             blend.ColorDitherEnable = true;
3181          }
3182       }
3183
3184 #if GEN_GEN >= 8
3185       for (int i = 0; i < nr_draw_buffers; i++) {
3186          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3187 #else
3188       {
3189 #endif
3190          blend.IndependentAlphaBlendEnable =
3191             set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
3192             blend.IndependentAlphaBlendEnable;
3193
3194          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
3195           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
3196           *
3197           * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
3198           * clamping in the fragment shader.  For its clamping of
3199           * blending, the spec says:
3200           *
3201           *     "RESOLVED: For fixed-point color buffers, the inputs and
3202           *      the result of the blending equation are clamped.  For
3203           *      floating-point color buffers, no clamping occurs."
3204           *
3205           * So, generally, we want clamping to the render target's range.
3206           * And, good news, the hardware tables for both pre- and
3207           * post-blend color clamping are either ignored, or any are
3208           * allowed, or clamping is required but RT range clamping is a
3209           * valid option.
3210           */
3211          entry.PreBlendColorClampEnable = true;
3212          entry.PostBlendColorClampEnable = true;
3213          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
3214
3215          entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
3216          entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
3217          entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
3218          entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);
3219
3220 #if GEN_GEN >= 8
3221          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
3222 #else
3223          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
3224 #endif
3225       }
3226    }
3227
3228 #if GEN_GEN >= 8
3229    GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
3230 #endif
3231
3232 #if GEN_GEN < 7
3233    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3234       ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
3235       ptr.BLEND_STATEChange = true;
3236    }
3237 #else
3238    brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
3239       ptr.BlendStatePointer = brw->cc.blend_state_offset;
3240 #if GEN_GEN >= 8
3241       ptr.BlendStatePointerValid = true;
3242 #endif
3243    }
3244 #endif
3245 }
3246
/* Tracked-state entry for the blend state: names genX(upload_blend_state)
 * as the emit callback and lists the Mesa (_NEW_*) and driver (BRW_NEW_*)
 * dirty bits associated with it.
 *
 * NOTE(review): the code that walks brw_tracked_state entries and tests
 * these masks is outside this chunk.
 */
static const struct brw_tracked_state genX(blend_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_COLOR |
              _NEW_MULTISAMPLE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_FS_PROG_DATA |
             BRW_NEW_STATE_BASE_ADDRESS,
   },
   .emit = genX(upload_blend_state),
};
3259 #endif
3260
3261 /* ---------------------------------------------------------------------- */
3262
3263 #if GEN_GEN >= 7
/* Per-stage value written into pkt._3DCommandSubOpcode by
 * genX(upload_push_constant_packets), which reuses the 3DSTATE_CONSTANT_VS
 * packet layout for every stage.  Indexed by MESA_SHADER_* stage.
 *
 * The emit loop in this file only runs up to MESA_SHADER_FRAGMENT, so the
 * compute entry is never read there.
 */
UNUSED static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]                      = 21,
   [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
   [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
   [MESA_SHADER_GEOMETRY]                    = 22,
   [MESA_SHADER_FRAGMENT]                    = 23,
   [MESA_SHADER_COMPUTE]                     = 0,
};
3272
3273 static void
3274 genX(upload_push_constant_packets)(struct brw_context *brw)
3275 {
3276    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3277    struct gl_context *ctx = &brw->ctx;
3278
3279    UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
3280
3281    struct brw_stage_state *stage_states[] = {
3282       &brw->vs.base,
3283       &brw->tcs.base,
3284       &brw->tes.base,
3285       &brw->gs.base,
3286       &brw->wm.base,
3287    };
3288
3289    if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
3290        stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
3291       gen7_emit_vs_workaround_flush(brw);
3292
3293    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
3294       struct brw_stage_state *stage_state = stage_states[stage];
3295       UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
3296
3297       if (!stage_state->push_constants_dirty)
3298          continue;
3299
3300       brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
3301          pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
3302          if (stage_state->prog_data) {
3303 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3304             /* The Skylake PRM contains the following restriction:
3305              *
3306              *    "The driver must ensure The following case does not occur
3307              *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3308              *     buffer 3 read length equal to zero committed followed by a
3309              *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3310              *     zero committed."
3311              *
3312              * To avoid this, we program the buffers in the highest slots.
3313              * This way, slot 0 is only used if slot 3 is also used.
3314              */
3315             int n = 3;
3316
3317             for (int i = 3; i >= 0; i--) {
3318                const struct brw_ubo_range *range =
3319                   &stage_state->prog_data->ubo_ranges[i];
3320
3321                if (range->length == 0)
3322                   continue;
3323
3324                const struct gl_uniform_block *block =
3325                   prog->sh.UniformBlocks[range->block];
3326                const struct gl_buffer_binding *binding =
3327                   &ctx->UniformBufferBindings[block->Binding];
3328
3329                if (binding->BufferObject == ctx->Shared->NullBufferObj) {
3330                   static unsigned msg_id = 0;
3331                   _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
3332                                  MESA_DEBUG_TYPE_UNDEFINED,
3333                                  MESA_DEBUG_SEVERITY_HIGH,
3334                                  "UBO %d unbound, %s shader uniform data "
3335                                  "will be undefined.",
3336                                  range->block,
3337                                  _mesa_shader_stage_to_string(stage));
3338                   continue;
3339                }
3340
3341                assert(binding->Offset % 32 == 0);
3342
3343                struct brw_bo *bo = intel_bufferobj_buffer(brw,
3344                   intel_buffer_object(binding->BufferObject),
3345                   binding->Offset, range->length * 32, false);
3346
3347                pkt.ConstantBody.ReadLength[n] = range->length;
3348                pkt.ConstantBody.Buffer[n] =
3349                   ro_bo(bo, range->start * 32 + binding->Offset);
3350                n--;
3351             }
3352
3353             if (stage_state->push_const_size > 0) {
3354                assert(n >= 0);
3355                pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
3356                pkt.ConstantBody.Buffer[n] =
3357                   ro_bo(stage_state->push_const_bo,
3358                         stage_state->push_const_offset);
3359             }
3360 #else
3361             pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
3362             pkt.ConstantBody.Buffer[0].offset =
3363                stage_state->push_const_offset | mocs;
3364 #endif
3365          }
3366       }
3367
3368       stage_state->push_constants_dirty = false;
3369       brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
3370    }
3371 }
3372
/* Tracked-state entry for the 3DSTATE_CONSTANT_* packets.  Its only dirty
 * bit is BRW_NEW_DRAW_CALL; the emit function itself checks each stage's
 * push_constants_dirty flag before emitting anything.
 *
 * Unlike the neighbouring entries this one is not declared static.
 */
const struct brw_tracked_state genX(push_constant_packets) = {
   .dirty = {
      .mesa  = 0,
      .brw   = BRW_NEW_DRAW_CALL,
   },
   .emit = genX(upload_push_constant_packets),
};
3380 #endif
3381
3382 #if GEN_GEN >= 6
3383 static void
3384 genX(upload_vs_push_constants)(struct brw_context *brw)
3385 {
3386    struct brw_stage_state *stage_state = &brw->vs.base;
3387
3388    /* BRW_NEW_VERTEX_PROGRAM */
3389    const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3390    /* BRW_NEW_VS_PROG_DATA */
3391    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3392
3393    gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3394 }
3395
/* Tracked-state entry for vertex shader push constants: emit callback plus
 * the Mesa and driver dirty bits associated with it.
 */
static const struct brw_tracked_state genX(vs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_VS_PROG_DATA,
   },
   .emit = genX(upload_vs_push_constants),
};
3407
3408 static void
3409 genX(upload_gs_push_constants)(struct brw_context *brw)
3410 {
3411    struct brw_stage_state *stage_state = &brw->gs.base;
3412
3413    /* BRW_NEW_GEOMETRY_PROGRAM */
3414    const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3415
3416    /* BRW_NEW_GS_PROG_DATA */
3417    struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3418
3419    gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3420 }
3421
/* Tracked-state entry for geometry shader push constants: emit callback
 * plus the Mesa and driver dirty bits associated with it.
 */
static const struct brw_tracked_state genX(gs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_GEOMETRY_PROGRAM |
               BRW_NEW_GS_PROG_DATA,
   },
   .emit = genX(upload_gs_push_constants),
};
3433
3434 static void
3435 genX(upload_wm_push_constants)(struct brw_context *brw)
3436 {
3437    struct brw_stage_state *stage_state = &brw->wm.base;
3438    /* BRW_NEW_FRAGMENT_PROGRAM */
3439    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3440    /* BRW_NEW_FS_PROG_DATA */
3441    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3442
3443    gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3444 }
3445
/* Tracked-state entry for fragment shader push constants: emit callback
 * plus the Mesa and driver dirty bits associated with it.  Unlike the VS
 * and GS entries, _NEW_TRANSFORM is not listed here.
 */
static const struct brw_tracked_state genX(wm_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_FS_PROG_DATA,
   },
   .emit = genX(upload_wm_push_constants),
};
3456 #endif
3457
3458 /* ---------------------------------------------------------------------- */
3459
3460 #if GEN_GEN >= 6
3461 static unsigned
3462 genX(determine_sample_mask)(struct brw_context *brw)
3463 {
3464    struct gl_context *ctx = &brw->ctx;
3465    float coverage = 1.0f;
3466    float coverage_invert = false;
3467    unsigned sample_mask = ~0u;
3468
3469    /* BRW_NEW_NUM_SAMPLES */
3470    unsigned num_samples = brw->num_samples;
3471
3472    if (_mesa_is_multisample_enabled(ctx)) {
3473       if (ctx->Multisample.SampleCoverage) {
3474          coverage = ctx->Multisample.SampleCoverageValue;
3475          coverage_invert = ctx->Multisample.SampleCoverageInvert;
3476       }
3477       if (ctx->Multisample.SampleMask) {
3478          sample_mask = ctx->Multisample.SampleMaskValue;
3479       }
3480    }
3481
3482    if (num_samples > 1) {
3483       int coverage_int = (int) (num_samples * coverage + 0.5f);
3484       uint32_t coverage_bits = (1 << coverage_int) - 1;
3485       if (coverage_invert)
3486          coverage_bits ^= (1 << num_samples) - 1;
3487       return coverage_bits & sample_mask;
3488    } else {
3489       return 1;
3490    }
3491 }
3492
3493 static void
3494 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3495                                 unsigned num_samples)
3496 {
3497    unsigned log2_samples = ffs(num_samples) - 1;
3498
3499    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3500       multi.PixelLocation = CENTER;
3501       multi.NumberofMultisamples = log2_samples;
3502 #if GEN_GEN == 6
3503       GEN_SAMPLE_POS_4X(multi.Sample);
3504 #elif GEN_GEN == 7
3505       switch (num_samples) {
3506       case 1:
3507          GEN_SAMPLE_POS_1X(multi.Sample);
3508          break;
3509       case 2:
3510          GEN_SAMPLE_POS_2X(multi.Sample);
3511          break;
3512       case 4:
3513          GEN_SAMPLE_POS_4X(multi.Sample);
3514          break;
3515       case 8:
3516          GEN_SAMPLE_POS_8X(multi.Sample);
3517          break;
3518       default:
3519          break;
3520       }
3521 #endif
3522    }
3523 }
3524
3525 static void
3526 genX(upload_multisample_state)(struct brw_context *brw)
3527 {
3528    assert(brw->num_samples > 0 && brw->num_samples <= 16);
3529
3530    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3531
3532    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3533       sm.SampleMask = genX(determine_sample_mask)(brw);
3534    }
3535 }
3536
/* Tracked-state entry for the multisample packets: emit callback plus the
 * dirty bits associated with it.  _NEW_BUFFERS is added to the Mesa mask
 * only when building for GEN_GEN == 10.
 */
static const struct brw_tracked_state genX(multisample_state) = {
   .dirty = {
      .mesa = _NEW_MULTISAMPLE |
              (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT |
             BRW_NEW_NUM_SAMPLES,
   },
   .emit = genX(upload_multisample_state)
};
3547 #endif
3548
3549 /* ---------------------------------------------------------------------- */
3550
3551 static void
3552 genX(upload_color_calc_state)(struct brw_context *brw)
3553 {
3554    struct gl_context *ctx = &brw->ctx;
3555
3556    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3557 #if GEN_GEN <= 5
3558       cc.IndependentAlphaBlendEnable =
3559          set_blend_entry_bits(brw, &cc, 0, false);
3560       set_depth_stencil_bits(brw, &cc);
3561
3562       if (ctx->Color.AlphaEnabled &&
3563           ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3564          cc.AlphaTestEnable = true;
3565          cc.AlphaTestFunction =
3566             intel_translate_compare_func(ctx->Color.AlphaFunc);
3567       }
3568
3569       cc.ColorDitherEnable = ctx->Color.DitherFlag;
3570
3571       cc.StatisticsEnable = brw->stats_wm;
3572
3573       cc.CCViewportStatePointer =
3574          ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3575 #else
3576       /* _NEW_COLOR */
3577       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3578       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3579       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3580       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3581
3582 #if GEN_GEN < 9
3583       /* _NEW_STENCIL */
3584       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3585       cc.BackfaceStencilReferenceValue =
3586          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3587 #endif
3588
3589 #endif
3590
3591       /* _NEW_COLOR */
3592       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3593                                ctx->Color.AlphaRef);
3594    }
3595
3596 #if GEN_GEN >= 6
3597    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3598       ptr.ColorCalcStatePointer = brw->cc.state_offset;
3599 #if GEN_GEN != 7
3600       ptr.ColorCalcStatePointerValid = true;
3601 #endif
3602    }
3603 #else
3604    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3605 #endif
3606 }
3607
/* Tracked-state entry for COLOR_CALC_STATE.  The dirty masks depend on the
 * generation: Gen4-5 additionally list _NEW_BUFFERS, _NEW_DEPTH,
 * BRW_NEW_CC_VP and BRW_NEW_STATS_WM (that path of the emit function also
 * programs blend, depth/stencil, statistics and the CC viewport pointer),
 * while Gen6+ list BRW_NEW_CC_STATE and BRW_NEW_STATE_BASE_ADDRESS instead.
 */
static const struct brw_tracked_state genX(color_calc_state) = {
   .dirty = {
      .mesa = _NEW_COLOR |
              _NEW_STENCIL |
              (GEN_GEN <= 5 ? _NEW_BUFFERS |
                              _NEW_DEPTH
                            : 0),
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
                             BRW_NEW_STATS_WM
                           : BRW_NEW_CC_STATE |
                             BRW_NEW_STATE_BASE_ADDRESS),
   },
   .emit = genX(upload_color_calc_state),
};
3624
3625
3626 /* ---------------------------------------------------------------------- */
3627
3628 #if GEN_GEN >= 7
3629 static void
3630 genX(upload_sbe)(struct brw_context *brw)
3631 {
3632    struct gl_context *ctx = &brw->ctx;
3633    /* BRW_NEW_FRAGMENT_PROGRAM */
3634    UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3635    /* BRW_NEW_FS_PROG_DATA */
3636    const struct brw_wm_prog_data *wm_prog_data =
3637       brw_wm_prog_data(brw->wm.base.prog_data);
3638 #if GEN_GEN >= 8
3639    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3640 #else
3641 #define attr_overrides sbe.Attribute
3642 #endif
3643    uint32_t urb_entry_read_length;
3644    uint32_t urb_entry_read_offset;
3645    uint32_t point_sprite_enables;
3646
3647    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3648       sbe.AttributeSwizzleEnable = true;
3649       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3650
3651       /* _NEW_BUFFERS */
3652       bool flip_y = ctx->DrawBuffer->FlipY;
3653
3654       /* _NEW_POINT
3655        *
3656        * Window coordinates in an FBO are inverted, which means point
3657        * sprite origin must be inverted.
3658        */
3659       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
3660          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3661       else
3662          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3663
3664       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3665        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3666        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3667        * BRW_NEW_VUE_MAP_GEOM_OUT
3668        */
3669       genX(calculate_attr_overrides)(brw,
3670                                      attr_overrides,
3671                                      &point_sprite_enables,
3672                                      &urb_entry_read_length,
3673                                      &urb_entry_read_offset);
3674
3675       /* Typically, the URB entry read length and offset should be programmed
3676        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3677        * stage which produces geometry.  However, we don't know the proper
3678        * value until we call calculate_attr_overrides().
3679        *
3680        * To fit with our existing code, we override the inherited values and
3681        * specify it here directly, as we did on previous generations.
3682        */
3683       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3684       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3685       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3686       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3687
3688 #if GEN_GEN >= 8
3689       sbe.ForceVertexURBEntryReadLength = true;
3690       sbe.ForceVertexURBEntryReadOffset = true;
3691 #endif
3692
3693 #if GEN_GEN >= 9
3694       /* prepare the active component dwords */
3695       for (int i = 0; i < 32; i++)
3696          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
3697 #endif
3698    }
3699
3700 #if GEN_GEN >= 8
3701    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3702       for (int i = 0; i < 16; i++)
3703          sbes.Attribute[i] = attr_overrides[i];
3704    }
3705 #endif
3706
3707 #undef attr_overrides
3708 }
3709
/* Tracked-state entry for 3DSTATE_SBE (and 3DSTATE_SBE_SWIZ on Gen8+):
 * emit callback plus the dirty bits associated with it.
 * BRW_NEW_PRIMITIVE is part of the driver mask only when building for
 * GEN_GEN == 7.
 */
static const struct brw_tracked_state genX(sbe_state) = {
   .dirty = {
      .mesa  = _NEW_BUFFERS |
               _NEW_LIGHT |
               _NEW_POINT |
               _NEW_POLYGON |
               _NEW_PROGRAM,
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_FS_PROG_DATA |
               BRW_NEW_GS_PROG_DATA |
               BRW_NEW_TES_PROG_DATA |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
                             : 0),
   },
   .emit = genX(upload_sbe),
};
3729 #endif
3730
3731 /* ---------------------------------------------------------------------- */
3732
3733 #if GEN_GEN >= 7
3734 /**
3735  * Outputs the 3DSTATE_SO_DECL_LIST command.
3736  *
3737  * The data output is a series of 64-bit entries containing a SO_DECL per
3738  * stream.  We only have one stream of rendering coming out of the GS unit, so
3739  * we only emit stream 0 (low 16 bits) SO_DECLs.
3740  */
3741 static void
3742 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3743                                   const struct brw_vue_map *vue_map)
3744 {
3745    struct gl_context *ctx = &brw->ctx;
3746    /* BRW_NEW_TRANSFORM_FEEDBACK */
3747    struct gl_transform_feedback_object *xfb_obj =
3748       ctx->TransformFeedback.CurrentObject;
3749    const struct gl_transform_feedback_info *linked_xfb_info =
3750       xfb_obj->program->sh.LinkedTransformFeedback;
3751    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3752    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3753    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3754    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3755    int max_decls = 0;
3756    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3757
3758    memset(so_decl, 0, sizeof(so_decl));
3759
3760    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3761     * command feels strange -- each dword pair contains a SO_DECL per stream.
3762     */
3763    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3764       const struct gl_transform_feedback_output *output =
3765          &linked_xfb_info->Outputs[i];
3766       const int buffer = output->OutputBuffer;
3767       const int varying = output->OutputRegister;
3768       const unsigned stream_id = output->StreamId;
3769       assert(stream_id < MAX_VERTEX_STREAMS);
3770
3771       buffer_mask[stream_id] |= 1 << buffer;
3772
3773       assert(vue_map->varying_to_slot[varying] >= 0);
3774
3775       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3776        * array.  Instead, it simply increments DstOffset for the following
3777        * input by the number of components that should be skipped.
3778        *
3779        * Our hardware is unusual in that it requires us to program SO_DECLs
3780        * for fake "hole" components, rather than simply taking the offset
3781        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3782        * program as many size = 4 holes as we can, then a final hole to
3783        * accommodate the final 1, 2, or 3 remaining.
3784        */
3785       int skip_components = output->DstOffset - next_offset[buffer];
3786
3787       while (skip_components > 0) {
3788          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3789             .HoleFlag = 1,
3790             .OutputBufferSlot = output->OutputBuffer,
3791             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3792          };
3793          skip_components -= 4;
3794       }
3795
3796       next_offset[buffer] = output->DstOffset + output->NumComponents;
3797
3798       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3799          .OutputBufferSlot = output->OutputBuffer,
3800          .RegisterIndex = vue_map->varying_to_slot[varying],
3801          .ComponentMask =
3802             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3803       };
3804
3805       if (decls[stream_id] > max_decls)
3806          max_decls = decls[stream_id];
3807    }
3808
3809    uint32_t *dw;
3810    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3811                         .StreamtoBufferSelects0 = buffer_mask[0],
3812                         .StreamtoBufferSelects1 = buffer_mask[1],
3813                         .StreamtoBufferSelects2 = buffer_mask[2],
3814                         .StreamtoBufferSelects3 = buffer_mask[3],
3815                         .NumEntries0 = decls[0],
3816                         .NumEntries1 = decls[1],
3817                         .NumEntries2 = decls[2],
3818                         .NumEntries3 = decls[3]);
3819
3820    for (int i = 0; i < max_decls; i++) {
3821       GENX(SO_DECL_ENTRY_pack)(
3822          brw, dw + 2 + i * 2,
3823          &(struct GENX(SO_DECL_ENTRY)) {
3824             .Stream0Decl = so_decl[0][i],
3825             .Stream1Decl = so_decl[1][i],
3826             .Stream2Decl = so_decl[2][i],
3827             .Stream3Decl = so_decl[3][i],
3828          });
3829    }
3830 }
3831
3832 static void
3833 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3834 {
3835    struct gl_context *ctx = &brw->ctx;
3836    /* BRW_NEW_TRANSFORM_FEEDBACK */
3837    struct gl_transform_feedback_object *xfb_obj =
3838       ctx->TransformFeedback.CurrentObject;
3839 #if GEN_GEN < 8
3840    const struct gl_transform_feedback_info *linked_xfb_info =
3841       xfb_obj->program->sh.LinkedTransformFeedback;
3842 #else
3843    struct brw_transform_feedback_object *brw_obj =
3844       (struct brw_transform_feedback_object *) xfb_obj;
3845    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3846 #endif
3847
3848    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3849     * gl_transform_feedback_object.
3850     */
3851    for (int i = 0; i < 4; i++) {
3852       struct intel_buffer_object *bufferobj =
3853          intel_buffer_object(xfb_obj->Buffers[i]);
3854       uint32_t start = xfb_obj->Offset[i];
3855       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3856       uint32_t const size = end - start;
3857
3858       if (!bufferobj || !size) {
3859          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3860             sob.SOBufferIndex = i;
3861          }
3862          continue;
3863       }
3864
3865       assert(start % 4 == 0);
3866       struct brw_bo *bo =
3867          intel_bufferobj_buffer(brw, bufferobj, start, size, true);
3868       assert(end <= bo->size);
3869
3870       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3871          sob.SOBufferIndex = i;
3872
3873          sob.SurfaceBaseAddress = rw_bo(bo, start);
3874 #if GEN_GEN < 8
3875          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3876          sob.SurfaceEndAddress = rw_bo(bo, end);
3877 #else
3878          sob.SOBufferEnable = true;
3879          sob.StreamOffsetWriteEnable = true;
3880          sob.StreamOutputBufferOffsetAddressEnable = true;
3881          sob.SOBufferMOCS = mocs_wb;
3882
3883          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3884          sob.StreamOutputBufferOffsetAddress =
3885             rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3886
3887          if (brw_obj->zero_offsets) {
3888             /* Zero out the offset and write that to offset_bo */
3889             sob.StreamOffset = 0;
3890          } else {
3891             /* Use offset_bo as the "Stream Offset." */
3892             sob.StreamOffset = 0xFFFFFFFF;
3893          }
3894 #endif
3895       }
3896    }
3897
3898 #if GEN_GEN >= 8
3899    brw_obj->zero_offsets = false;
3900 #endif
3901 }
3902
3903 static bool
3904 query_active(struct gl_query_object *q)
3905 {
3906    return q && q->Active;
3907 }
3908
3909 static void
3910 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3911                                const struct brw_vue_map *vue_map)
3912 {
3913    struct gl_context *ctx = &brw->ctx;
3914    /* BRW_NEW_TRANSFORM_FEEDBACK */
3915    struct gl_transform_feedback_object *xfb_obj =
3916       ctx->TransformFeedback.CurrentObject;
3917
3918    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3919       if (active) {
3920          int urb_entry_read_offset = 0;
3921          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3922             urb_entry_read_offset;
3923
3924          sos.SOFunctionEnable = true;
3925          sos.SOStatisticsEnable = true;
3926
3927          /* BRW_NEW_RASTERIZER_DISCARD */
3928          if (ctx->RasterDiscard) {
3929             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3930                sos.RenderingDisable = true;
3931             } else {
3932                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3933                           "query active relies on the clipper.\n");
3934             }
3935          }
3936
3937          /* _NEW_LIGHT */
3938          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3939             sos.ReorderMode = TRAILING;
3940
3941 #if GEN_GEN < 8
3942          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3943          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3944          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3945          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3946 #else
3947          const struct gl_transform_feedback_info *linked_xfb_info =
3948             xfb_obj->program->sh.LinkedTransformFeedback;
3949          /* Set buffer pitches; 0 means unbound. */
3950          if (xfb_obj->Buffers[0])
3951             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3952          if (xfb_obj->Buffers[1])
3953             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3954          if (xfb_obj->Buffers[2])
3955             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3956          if (xfb_obj->Buffers[3])
3957             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3958 #endif
3959
3960          /* We always read the whole vertex.  This could be reduced at some
3961           * point by reading less and offsetting the register index in the
3962           * SO_DECLs.
3963           */
3964          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3965          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3966          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3967          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3968          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3969          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3970          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3971          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3972       }
3973    }
3974 }
3975
3976 static void
3977 genX(upload_sol)(struct brw_context *brw)
3978 {
3979    struct gl_context *ctx = &brw->ctx;
3980    /* BRW_NEW_TRANSFORM_FEEDBACK */
3981    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3982
3983    if (active) {
3984       genX(upload_3dstate_so_buffers)(brw);
3985
3986       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3987       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3988    }
3989
3990    /* Finally, set up the SOL stage.  This command must always follow updates to
3991     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3992     * MMIO register updates (current performed by the kernel at each batch
3993     * emit).
3994     */
3995    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3996 }
3997
/* Tracked-state entry for the stream output (SOL) state: emit callback plus
 * the dirty bits associated with it.  _NEW_LIGHT is listed because the emit
 * path reads ctx->Light.ProvokingVertex.
 */
static const struct brw_tracked_state genX(sol_state) = {
   .dirty = {
      .mesa  = _NEW_LIGHT,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_RASTERIZER_DISCARD |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               BRW_NEW_TRANSFORM_FEEDBACK,
   },
   .emit = genX(upload_sol),
};
4009 #endif
4010
4011 /* ---------------------------------------------------------------------- */
4012
4013 #if GEN_GEN >= 7
4014 static void
4015 genX(upload_ps)(struct brw_context *brw)
4016 {
4017    UNUSED const struct gl_context *ctx = &brw->ctx;
4018    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
4019
4020    /* BRW_NEW_FS_PROG_DATA */
4021    const struct brw_wm_prog_data *prog_data =
4022       brw_wm_prog_data(brw->wm.base.prog_data);
4023    const struct brw_stage_state *stage_state = &brw->wm.base;
4024
4025 #if GEN_GEN < 8
4026 #endif
4027
4028    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
4029       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
4030        * incorrect for subspans where some of the pixels are unlit.  We believe
4031        * the bit just didn't take effect in previous generations.
4032        */
4033       ps.VectorMaskEnable = GEN_GEN >= 8;
4034
4035       /* WA_1606682166:
4036        * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
4037        * Disable the Sampler state prefetch functionality in the SARB by
4038        * programming 0xB000[30] to '1'."
4039        */
4040       ps.SamplerCount = GEN_GEN == 11 ?
4041          0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
4042
4043       /* BRW_NEW_FS_PROG_DATA */
4044       /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
4045        * prefetching of binding tables in A0 and B0 steppings.
4046        * TODO: Revisit this workaround on C0 stepping.
4047        */
4048       ps.BindingTableEntryCount = GEN_GEN == 11 ?
4049                                   0 :
4050                                   prog_data->base.binding_table.size_bytes / 4;
4051
4052       if (prog_data->base.use_alt_mode)
4053          ps.FloatingPointMode = Alternate;
4054
4055       /* Haswell requires the sample mask to be set in this packet as well as
4056        * in 3DSTATE_SAMPLE_MASK; the values should match.
4057        */
4058
4059       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
4060 #if GEN_IS_HASWELL
4061       ps.SampleMask = genX(determine_sample_mask(brw));
4062 #endif
4063
4064       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
4065        * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
4066        * k, it implies 2(k+1) threads. It implicitly scales for different GT
4067        * levels (which have some # of PSDs).
4068        *
4069        * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
4070        */
4071 #if GEN_GEN >= 9
4072       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
4073 #elif GEN_GEN >= 8
4074       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
4075 #else
4076       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
4077 #endif
4078
4079       if (prog_data->base.nr_params > 0 ||
4080           prog_data->base.ubo_ranges[0].length > 0)
4081          ps.PushConstantEnable = true;
4082
4083 #if GEN_GEN < 8
4084       /* From the IVB PRM, volume 2 part 1, page 287:
4085        * "This bit is inserted in the PS payload header and made available to
4086        * the DataPort (either via the message header or via header bypass) to
4087        * indicate that oMask data (one or two phases) is included in Render
4088        * Target Write messages. If present, the oMask data is used to mask off
4089        * samples."
4090        */
4091       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4092
4093       /* The hardware wedges if you have this bit set but don't turn on any
4094        * dual source blend factors.
4095        *
4096        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
4097        */
4098       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
4099                                  (ctx->Color.BlendEnabled & 1) &&
4100                                  ctx->Color.Blend[0]._UsesDualSrc;
4101
4102       /* BRW_NEW_FS_PROG_DATA */
4103       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
4104 #endif
4105
4106       /* From the documentation for this packet:
4107        * "If the PS kernel does not need the Position XY Offsets to
4108        *  compute a Position Value, then this field should be programmed
4109        *  to POSOFFSET_NONE."
4110        *
4111        * "SW Recommendation: If the PS kernel needs the Position Offsets
4112        *  to compute a Position XY value, this field should match Position
4113        *  ZW Interpolation Mode to ensure a consistent position.xyzw
4114        *  computation."
4115        *
4116        * We only require XY sample offsets. So, this recommendation doesn't
4117        * look useful at the moment. We might need this in future.
4118        */
4119       if (prog_data->uses_pos_offset)
4120          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
4121       else
4122          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
4123
4124       ps._8PixelDispatchEnable = prog_data->dispatch_8;
4125       ps._16PixelDispatchEnable = prog_data->dispatch_16;
4126       ps._32PixelDispatchEnable = prog_data->dispatch_32;
4127
4128       /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
4129        *
4130        *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
4131        *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
4132        *
4133        * Since 16x MSAA is first introduced on SKL, we don't need to apply
4134        * the workaround on any older hardware.
4135        *
4136        * BRW_NEW_NUM_SAMPLES
4137        */
4138       if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
4139           brw->num_samples == 16) {
4140          assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
4141          ps._32PixelDispatchEnable = false;
4142       }
4143
4144       ps.DispatchGRFStartRegisterForConstantSetupData0 =
4145          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
4146       ps.DispatchGRFStartRegisterForConstantSetupData1 =
4147          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
4148       ps.DispatchGRFStartRegisterForConstantSetupData2 =
4149          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
4150
4151       ps.KernelStartPointer0 = stage_state->prog_offset +
4152                                brw_wm_prog_data_prog_offset(prog_data, ps, 0);
4153       ps.KernelStartPointer1 = stage_state->prog_offset +
4154                                brw_wm_prog_data_prog_offset(prog_data, ps, 1);
4155       ps.KernelStartPointer2 = stage_state->prog_offset +
4156                                brw_wm_prog_data_prog_offset(prog_data, ps, 2);
4157
4158       if (prog_data->base.total_scratch) {
4159          ps.ScratchSpaceBasePointer =
4160             rw_32_bo(stage_state->scratch_bo,
4161                      ffs(stage_state->per_thread_scratch) - 11);
4162       }
4163    }
4164 }
4165
4166 static const struct brw_tracked_state genX(ps_state) = {
4167    .dirty = {
4168       .mesa  = _NEW_MULTISAMPLE |
4169                (GEN_GEN < 8 ? _NEW_BUFFERS |
4170                               _NEW_COLOR
4171                             : 0),
4172       .brw   = BRW_NEW_BATCH |
4173                BRW_NEW_BLORP |
4174                BRW_NEW_FS_PROG_DATA |
4175                (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
4176    },
4177    .emit = genX(upload_ps),
4178 };
4179 #endif
4180
4181 /* ---------------------------------------------------------------------- */
4182
4183 #if GEN_GEN >= 7
4184 static void
4185 genX(upload_hs_state)(struct brw_context *brw)
4186 {
4187    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4188    struct brw_stage_state *stage_state = &brw->tcs.base;
4189    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4190    const struct brw_vue_prog_data *vue_prog_data =
4191       brw_vue_prog_data(stage_prog_data);
4192
4193    /* BRW_NEW_TES_PROG_DATA */
4194    struct brw_tcs_prog_data *tcs_prog_data =
4195       brw_tcs_prog_data(stage_prog_data);
4196
4197    if (!tcs_prog_data) {
4198       brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
4199    } else {
4200       brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
4201          INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
4202
4203          hs.InstanceCount = tcs_prog_data->instances - 1;
4204          hs.IncludeVertexHandles = true;
4205
4206          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
4207       }
4208    }
4209 }
4210
4211 static const struct brw_tracked_state genX(hs_state) = {
4212    .dirty = {
4213       .mesa  = 0,
4214       .brw   = BRW_NEW_BATCH |
4215                BRW_NEW_BLORP |
4216                BRW_NEW_TCS_PROG_DATA |
4217                BRW_NEW_TESS_PROGRAMS,
4218    },
4219    .emit = genX(upload_hs_state),
4220 };
4221
4222 static void
4223 genX(upload_ds_state)(struct brw_context *brw)
4224 {
4225    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4226    const struct brw_stage_state *stage_state = &brw->tes.base;
4227    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4228
4229    /* BRW_NEW_TES_PROG_DATA */
4230    const struct brw_tes_prog_data *tes_prog_data =
4231       brw_tes_prog_data(stage_prog_data);
4232    const struct brw_vue_prog_data *vue_prog_data =
4233       brw_vue_prog_data(stage_prog_data);
4234
4235    if (!tes_prog_data) {
4236       brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
4237    } else {
4238       assert(GEN_GEN < 11 ||
4239              vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
4240
4241       brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
4242          INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
4243
4244         ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4245         ds.ComputeWCoordinateEnable =
4246            tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4247
4248 #if GEN_GEN >= 8
4249         if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
4250            ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4251         ds.UserClipDistanceCullTestEnableBitmask =
4252             vue_prog_data->cull_distance_mask;
4253 #endif
4254       }
4255    }
4256 }
4257
4258 static const struct brw_tracked_state genX(ds_state) = {
4259    .dirty = {
4260       .mesa  = 0,
4261       .brw   = BRW_NEW_BATCH |
4262                BRW_NEW_BLORP |
4263                BRW_NEW_TESS_PROGRAMS |
4264                BRW_NEW_TES_PROG_DATA,
4265    },
4266    .emit = genX(upload_ds_state),
4267 };
4268
4269 /* ---------------------------------------------------------------------- */
4270
4271 static void
4272 upload_te_state(struct brw_context *brw)
4273 {
4274    /* BRW_NEW_TESS_PROGRAMS */
4275    bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4276
4277    /* BRW_NEW_TES_PROG_DATA */
4278    const struct brw_tes_prog_data *tes_prog_data =
4279       brw_tes_prog_data(brw->tes.base.prog_data);
4280
4281    if (active) {
4282       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4283          te.Partitioning = tes_prog_data->partitioning;
4284          te.OutputTopology = tes_prog_data->output_topology;
4285          te.TEDomain = tes_prog_data->domain;
4286          te.TEEnable = true;
4287          te.MaximumTessellationFactorOdd = 63.0;
4288          te.MaximumTessellationFactorNotOdd = 64.0;
4289       }
4290    } else {
4291       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4292    }
4293 }
4294
4295 static const struct brw_tracked_state genX(te_state) = {
4296    .dirty = {
4297       .mesa  = 0,
4298       .brw   = BRW_NEW_BLORP |
4299                BRW_NEW_CONTEXT |
4300                BRW_NEW_TES_PROG_DATA |
4301                BRW_NEW_TESS_PROGRAMS,
4302    },
4303    .emit = upload_te_state,
4304 };
4305
4306 /* ---------------------------------------------------------------------- */
4307
4308 static void
4309 genX(upload_tes_push_constants)(struct brw_context *brw)
4310 {
4311    struct brw_stage_state *stage_state = &brw->tes.base;
4312    /* BRW_NEW_TESS_PROGRAMS */
4313    const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4314
4315    /* BRW_NEW_TES_PROG_DATA */
4316    const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4317    gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4318 }
4319
4320 static const struct brw_tracked_state genX(tes_push_constants) = {
4321    .dirty = {
4322       .mesa  = _NEW_PROGRAM_CONSTANTS,
4323       .brw   = BRW_NEW_BATCH |
4324                BRW_NEW_BLORP |
4325                BRW_NEW_TESS_PROGRAMS |
4326                BRW_NEW_TES_PROG_DATA,
4327    },
4328    .emit = genX(upload_tes_push_constants),
4329 };
4330
4331 static void
4332 genX(upload_tcs_push_constants)(struct brw_context *brw)
4333 {
4334    struct brw_stage_state *stage_state = &brw->tcs.base;
4335    /* BRW_NEW_TESS_PROGRAMS */
4336    const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4337
4338    /* BRW_NEW_TCS_PROG_DATA */
4339    const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4340
4341    gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4342 }
4343
4344 static const struct brw_tracked_state genX(tcs_push_constants) = {
4345    .dirty = {
4346       .mesa  = _NEW_PROGRAM_CONSTANTS,
4347       .brw   = BRW_NEW_BATCH |
4348                BRW_NEW_BLORP |
4349                BRW_NEW_DEFAULT_TESS_LEVELS |
4350                BRW_NEW_TESS_PROGRAMS |
4351                BRW_NEW_TCS_PROG_DATA,
4352    },
4353    .emit = genX(upload_tcs_push_constants),
4354 };
4355
4356 #endif
4357
4358 /* ---------------------------------------------------------------------- */
4359
4360 #if GEN_GEN >= 7
4361 static void
4362 genX(upload_cs_push_constants)(struct brw_context *brw)
4363 {
4364    struct brw_stage_state *stage_state = &brw->cs.base;
4365
4366    /* BRW_NEW_COMPUTE_PROGRAM */
4367    const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4368
4369    if (cp) {
4370       /* BRW_NEW_CS_PROG_DATA */
4371       struct brw_cs_prog_data *cs_prog_data =
4372          brw_cs_prog_data(brw->cs.base.prog_data);
4373
4374       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4375       brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4376    }
4377 }
4378
4379 const struct brw_tracked_state genX(cs_push_constants) = {
4380    .dirty = {
4381       .mesa = _NEW_PROGRAM_CONSTANTS,
4382       .brw = BRW_NEW_BATCH |
4383              BRW_NEW_BLORP |
4384              BRW_NEW_COMPUTE_PROGRAM |
4385              BRW_NEW_CS_PROG_DATA,
4386    },
4387    .emit = genX(upload_cs_push_constants),
4388 };
4389
4390 /**
4391  * Creates a new CS constant buffer reflecting the current CS program's
4392  * constants, if needed by the CS program.
4393  */
4394 static void
4395 genX(upload_cs_pull_constants)(struct brw_context *brw)
4396 {
4397    struct brw_stage_state *stage_state = &brw->cs.base;
4398
4399    /* BRW_NEW_COMPUTE_PROGRAM */
4400    struct brw_program *cp =
4401       (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4402
4403    /* BRW_NEW_CS_PROG_DATA */
4404    const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4405
4406    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4407    /* _NEW_PROGRAM_CONSTANTS */
4408    brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4409                              stage_state, prog_data);
4410 }
4411
4412 const struct brw_tracked_state genX(cs_pull_constants) = {
4413    .dirty = {
4414       .mesa = _NEW_PROGRAM_CONSTANTS,
4415       .brw = BRW_NEW_BATCH |
4416              BRW_NEW_BLORP |
4417              BRW_NEW_COMPUTE_PROGRAM |
4418              BRW_NEW_CS_PROG_DATA,
4419    },
4420    .emit = genX(upload_cs_pull_constants),
4421 };
4422
4423 static void
4424 genX(upload_cs_state)(struct brw_context *brw)
4425 {
4426    if (!brw->cs.base.prog_data)
4427       return;
4428
4429    uint32_t offset;
4430    uint32_t *desc = (uint32_t*) brw_state_batch(
4431       brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4432       &offset);
4433
4434    struct brw_stage_state *stage_state = &brw->cs.base;
4435    struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4436    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4437    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4438
4439    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4440       brw_emit_buffer_surface_state(
4441          brw, &stage_state->surf_offset[
4442                  prog_data->binding_table.shader_time_start],
4443          brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4444          brw->shader_time.bo->size, 1,
4445          RELOC_WRITE);
4446    }
4447
4448    uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4449                                     32, &stage_state->bind_bo_offset);
4450
4451    /* The MEDIA_VFE_STATE documentation for Gen8+ says:
4452     *
4453     * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4454     *  the only bits that are changed are scoreboard related: Scoreboard
4455     *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4456     *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
4457     *
4458     * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
4459     * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
4460     */
4461    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
4462
4463    brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4464       if (prog_data->total_scratch) {
4465          uint32_t per_thread_scratch_value;
4466
4467          if (GEN_GEN >= 8) {
4468             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4469              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4470              */
4471             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
4472          } else if (GEN_IS_HASWELL) {
4473             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4474              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4475              */
4476             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
4477          } else {
4478             /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4479              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4480              */
4481             per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
4482          }
4483          vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
4484          vfe.PerThreadScratchSpace = per_thread_scratch_value;
4485       }
4486
4487       /* If brw->screen->subslice_total is greater than one, then
4488        * devinfo->max_cs_threads stores number of threads per sub-slice;
4489        * thus we need to multiply by that number by subslices to get
4490        * the actual maximum number of threads; the -1 is because the HW
4491        * has a bias of 1 (would not make sense to say the maximum number
4492        * of threads is 0).
4493        */
4494       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
4495       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
4496       vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
4497 #if GEN_GEN < 11
4498       vfe.ResetGatewayTimer =
4499          Resettingrelativetimerandlatchingtheglobaltimestamp;
4500 #endif
4501 #if GEN_GEN < 9
4502       vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4503 #endif
4504 #if GEN_GEN == 7
4505       vfe.GPGPUMode = 1;
4506 #endif
4507
4508       /* We are uploading duplicated copies of push constant uniforms for each
4509        * thread. Although the local id data needs to vary per thread, it won't
4510        * change for other uniform data. Unfortunately this duplication is
4511        * required for gen7. As of Haswell, this duplication can be avoided,
4512        * but this older mechanism with duplicated data continues to work.
4513        *
4514        * FINISHME: As of Haswell, we could make use of the
4515        * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4516        * field to only store one copy of uniform data.
4517        *
4518        * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4519        * which is described in the GPGPU_WALKER command and in the Broadwell
4520        * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4521        * Operations => GPGPU Mode => Indirect Payload Storage.
4522        *
4523        * Note: The constant data is built in brw_upload_cs_push_constants
4524        * below.
4525        */
4526       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
4527
4528       const uint32_t vfe_curbe_allocation =
4529          ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
4530                cs_prog_data->push.cross_thread.regs, 2);
4531       vfe.CURBEAllocationSize = vfe_curbe_allocation;
4532    }
4533
4534    if (cs_prog_data->push.total.size > 0) {
4535       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4536          curbe.CURBETotalDataLength =
4537             ALIGN(cs_prog_data->push.total.size, 64);
4538          curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4539       }
4540    }
4541
4542    /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4543    memcpy(bind, stage_state->surf_offset,
4544           prog_data->binding_table.size_bytes);
4545    const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4546       .KernelStartPointer = brw->cs.base.prog_offset,
4547       .SamplerStatePointer = stage_state->sampler_offset,
4548       .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4549       .BindingTablePointer = stage_state->bind_bo_offset,
4550       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4551       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
4552       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
4553                                                prog_data->total_shared),
4554       .BarrierEnable = cs_prog_data->uses_barrier,
4555 #if GEN_GEN >= 8 || GEN_IS_HASWELL
4556       .CrossThreadConstantDataReadLength =
4557          cs_prog_data->push.cross_thread.regs,
4558 #endif
4559    };
4560
4561    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4562
4563    brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4564       load.InterfaceDescriptorTotalLength =
4565          GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4566       load.InterfaceDescriptorDataStartAddress = offset;
4567    }
4568 }
4569
4570 static const struct brw_tracked_state genX(cs_state) = {
4571    .dirty = {
4572       .mesa = _NEW_PROGRAM_CONSTANTS,
4573       .brw = BRW_NEW_BATCH |
4574              BRW_NEW_BLORP |
4575              BRW_NEW_CS_PROG_DATA |
4576              BRW_NEW_SAMPLER_STATE_TABLE |
4577              BRW_NEW_SURFACES,
4578    },
4579    .emit = genX(upload_cs_state)
4580 };
4581
4582 #endif
4583
4584 /* ---------------------------------------------------------------------- */
4585
4586 #if GEN_GEN >= 8
4587 static void
4588 genX(upload_raster)(struct brw_context *brw)
4589 {
4590    const struct gl_context *ctx = &brw->ctx;
4591
4592    /* _NEW_BUFFERS */
4593    const bool flip_y = ctx->DrawBuffer->FlipY;
4594
4595    /* _NEW_POLYGON */
4596    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4597
4598    /* _NEW_POINT */
4599    const struct gl_point_attrib *point = &ctx->Point;
4600
4601    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4602       if (brw->polygon_front_bit != flip_y)
4603          raster.FrontWinding = CounterClockwise;
4604
4605       if (polygon->CullFlag) {
4606          switch (polygon->CullFaceMode) {
4607          case GL_FRONT:
4608             raster.CullMode = CULLMODE_FRONT;
4609             break;
4610          case GL_BACK:
4611             raster.CullMode = CULLMODE_BACK;
4612             break;
4613          case GL_FRONT_AND_BACK:
4614             raster.CullMode = CULLMODE_BOTH;
4615             break;
4616          default:
4617             unreachable("not reached");
4618          }
4619       } else {
4620          raster.CullMode = CULLMODE_NONE;
4621       }
4622
4623       raster.SmoothPointEnable = point->SmoothFlag;
4624
4625       raster.DXMultisampleRasterizationEnable =
4626          _mesa_is_multisample_enabled(ctx);
4627
4628       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4629       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4630       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4631
4632       switch (polygon->FrontMode) {
4633       case GL_FILL:
4634          raster.FrontFaceFillMode = FILL_MODE_SOLID;
4635          break;
4636       case GL_LINE:
4637          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4638          break;
4639       case GL_POINT:
4640          raster.FrontFaceFillMode = FILL_MODE_POINT;
4641          break;
4642       default:
4643          unreachable("not reached");
4644       }
4645
4646       switch (polygon->BackMode) {
4647       case GL_FILL:
4648          raster.BackFaceFillMode = FILL_MODE_SOLID;
4649          break;
4650       case GL_LINE:
4651          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4652          break;
4653       case GL_POINT:
4654          raster.BackFaceFillMode = FILL_MODE_POINT;
4655          break;
4656       default:
4657          unreachable("not reached");
4658       }
4659
4660       /* _NEW_LINE */
4661       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4662
4663 #if GEN_GEN == 10
4664       /* _NEW_BUFFERS
4665        * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4666        */
4667       const bool multisampled_fbo =
4668          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4669       if (multisampled_fbo)
4670          raster.AntialiasingEnable = false;
4671 #endif
4672
4673       /* _NEW_SCISSOR */
4674       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4675
4676       /* _NEW_TRANSFORM */
4677 #if GEN_GEN < 9
4678       if (!(ctx->Transform.DepthClampNear &&
4679             ctx->Transform.DepthClampFar))
4680          raster.ViewportZClipTestEnable = true;
4681 #endif
4682
4683 #if GEN_GEN >= 9
4684       if (!ctx->Transform.DepthClampNear)
4685          raster.ViewportZNearClipTestEnable = true;
4686
4687       if (!ctx->Transform.DepthClampFar)
4688          raster.ViewportZFarClipTestEnable = true;
4689 #endif
4690
4691       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4692 #if GEN_GEN >= 9
4693       raster.ConservativeRasterizationEnable =
4694          ctx->IntelConservativeRasterization;
4695 #endif
4696
4697       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4698       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4699
4700       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4701    }
4702 }
4703
4704 static const struct brw_tracked_state genX(raster_state) = {
4705    .dirty = {
4706       .mesa  = _NEW_BUFFERS |
4707                _NEW_LINE |
4708                _NEW_MULTISAMPLE |
4709                _NEW_POINT |
4710                _NEW_POLYGON |
4711                _NEW_SCISSOR |
4712                _NEW_TRANSFORM,
4713       .brw   = BRW_NEW_BLORP |
4714                BRW_NEW_CONTEXT |
4715                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4716    },
4717    .emit = genX(upload_raster),
4718 };
4719 #endif
4720
4721 /* ---------------------------------------------------------------------- */
4722
4723 #if GEN_GEN >= 8
4724 static void
4725 genX(upload_ps_extra)(struct brw_context *brw)
4726 {
4727    UNUSED struct gl_context *ctx = &brw->ctx;
4728
4729    const struct brw_wm_prog_data *prog_data =
4730       brw_wm_prog_data(brw->wm.base.prog_data);
4731
4732    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4733       psx.PixelShaderValid = true;
4734       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4735       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4736       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4737       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4738       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4739       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4740
4741       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4742       if (prog_data->uses_sample_mask) {
4743 #if GEN_GEN >= 9
4744          if (prog_data->post_depth_coverage)
4745             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4746          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4747             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4748          else
4749             psx.InputCoverageMaskState = ICMS_NORMAL;
4750 #else
4751          psx.PixelShaderUsesInputCoverageMask = true;
4752 #endif
4753       }
4754
4755       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4756 #if GEN_GEN >= 9
4757       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4758       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4759 #endif
4760
4761       /* The stricter cross-primitive coherency guarantees that the hardware
4762        * gives us with the "Accesses UAV" bit set for at least one shader stage
4763        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4764        * are redundant within the current image, atomic counter and SSBO GL
4765        * APIs, which all have very loose ordering and coherency requirements
4766        * and generally rely on the application to insert explicit barriers when
4767        * a shader invocation is expected to see the memory writes performed by
4768        * the invocations of some previous primitive.  Regardless of the value
4769        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4770        * cause an in most cases useless DC flush when the lowermost stage with
4771        * the bit set finishes execution.
4772        *
4773        * It would be nice to disable it, but in some cases we can't because on
4774        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4775        * signal (which could be set independently from the coherency mechanism
4776        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4777        * determine whether the hardware skips execution of the fragment shader
4778        * or not via the ThreadDispatchEnable signal.  However if we know that
4779        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4780        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4781        * difference so we may just disable it here.
4782        *
4783        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4784        * take into account KillPixels when no depth or stencil writes are
4785        * enabled.  In order for occlusion queries to work correctly with no
4786        * attachments, we need to force-enable here.
4787        *
4788        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4789        * _NEW_COLOR
4790        */
4791       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4792           !brw_color_buffer_write_enabled(brw))
4793          psx.PixelShaderHasUAV = true;
4794    }
4795 }
4796
4797 const struct brw_tracked_state genX(ps_extra) = {
4798    .dirty = {
4799       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4800       .brw   = BRW_NEW_BLORP |
4801                BRW_NEW_CONTEXT |
4802                BRW_NEW_FRAGMENT_PROGRAM |
4803                BRW_NEW_FS_PROG_DATA |
4804                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4805    },
4806    .emit = genX(upload_ps_extra),
4807 };
4808 #endif
4809
4810 /* ---------------------------------------------------------------------- */
4811
4812 #if GEN_GEN >= 8
4813 static void
4814 genX(upload_ps_blend)(struct brw_context *brw)
4815 {
4816    struct gl_context *ctx = &brw->ctx;
4817
4818    /* _NEW_BUFFERS */
4819    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4820    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4821
4822    /* _NEW_COLOR */
4823    struct gl_colorbuffer_attrib *color = &ctx->Color;
4824
4825    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4826       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4827       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4828
4829       bool alpha_to_one = false;
4830
4831       if (!buffer0_is_integer) {
4832          /* _NEW_MULTISAMPLE */
4833
4834          if (_mesa_is_multisample_enabled(ctx)) {
4835             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4836             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4837          }
4838
4839          pb.AlphaTestEnable = color->AlphaEnabled;
4840       }
4841
4842       /* Used for implementing the following bit of GL_EXT_texture_integer:
4843        * "Per-fragment operations that require floating-point color
4844        *  components, including multisample alpha operations, alpha test,
4845        *  blending, and dithering, have no effect when the corresponding
4846        *  colors are written to an integer color buffer."
4847        *
4848        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4849        * "If drawbuffer zero is not NONE and the buffer it references has an
4850        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4851        *  operations are skipped."
4852        */
4853       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4854          GLenum eqRGB = color->Blend[0].EquationRGB;
4855          GLenum eqA = color->Blend[0].EquationA;
4856          GLenum srcRGB = color->Blend[0].SrcRGB;
4857          GLenum dstRGB = color->Blend[0].DstRGB;
4858          GLenum srcA = color->Blend[0].SrcA;
4859          GLenum dstA = color->Blend[0].DstA;
4860
4861          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4862             srcRGB = dstRGB = GL_ONE;
4863
4864          if (eqA == GL_MIN || eqA == GL_MAX)
4865             srcA = dstA = GL_ONE;
4866
4867          /* Due to hardware limitations, the destination may have information
4868           * in an alpha channel even when the format specifies no alpha
4869           * channel. In order to avoid getting any incorrect blending due to
4870           * that alpha channel, coerce the blend factors to values that will
4871           * not read the alpha channel, but will instead use the correct
4872           * implicit value for alpha.
4873           */
4874          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4875                                             GL_TEXTURE_ALPHA_TYPE)) {
4876             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4877             srcA = brw_fix_xRGB_alpha(srcA);
4878             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4879             dstA = brw_fix_xRGB_alpha(dstA);
4880          }
4881
4882          /* Alpha to One doesn't work with Dual Color Blending.  Override
4883           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4884           */
4885          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4886             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4887             srcA = fix_dual_blend_alpha_to_one(srcA);
4888             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4889             dstA = fix_dual_blend_alpha_to_one(dstA);
4890          }
4891
4892          /* BRW_NEW_FS_PROG_DATA */
4893          const struct brw_wm_prog_data *wm_prog_data =
4894             brw_wm_prog_data(brw->wm.base.prog_data);
4895
4896          /* The Dual Source Blending documentation says:
4897           *
4898           * "If SRC1 is included in a src/dst blend factor and
4899           * a DualSource RT Write message is not used, results
4900           * are UNDEFINED. (This reflects the same restriction in DX APIs,
4901           * where undefined results are produced if “o1” is not written
4902           * by a PS – there are no default values defined).
4903           * If SRC1 is not included in a src/dst blend factor,
4904           * dual source blending must be disabled."
4905           *
4906           * There is no way to gracefully fix this undefined situation
4907           * so we just disable the blending to prevent possible issues.
4908           */
4909          pb.ColorBufferBlendEnable =
4910             !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
4911          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4912          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4913          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4914          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4915
4916          pb.IndependentAlphaBlendEnable =
4917             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4918       }
4919    }
4920 }
4921
4922 static const struct brw_tracked_state genX(ps_blend) = {
4923    .dirty = {
4924       .mesa = _NEW_BUFFERS |
4925               _NEW_COLOR |
4926               _NEW_MULTISAMPLE,
4927       .brw = BRW_NEW_BLORP |
4928              BRW_NEW_CONTEXT |
4929              BRW_NEW_FRAGMENT_PROGRAM |
4930              BRW_NEW_FS_PROG_DATA,
4931    },
4932    .emit = genX(upload_ps_blend)
4933 };
4934 #endif
4935
4936 /* ---------------------------------------------------------------------- */
4937
4938 #if GEN_GEN >= 8
4939 static void
4940 genX(emit_vf_topology)(struct brw_context *brw)
4941 {
4942    brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4943       vftopo.PrimitiveTopologyType = brw->primitive;
4944    }
4945 }
4946
4947 static const struct brw_tracked_state genX(vf_topology) = {
4948    .dirty = {
4949       .mesa = 0,
4950       .brw = BRW_NEW_BLORP |
4951              BRW_NEW_PRIMITIVE,
4952    },
4953    .emit = genX(emit_vf_topology),
4954 };
4955 #endif
4956
4957 /* ---------------------------------------------------------------------- */
4958
4959 #if GEN_GEN >= 7
4960 static void
4961 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4962                                 struct brw_bo *bo,
4963                                 uint32_t offset_in_bytes,
4964                                 uint32_t report_id)
4965 {
4966    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4967       mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4968       mi_rpc.ReportID = report_id;
4969    }
4970 }
4971 #endif
4972
4973 /* ---------------------------------------------------------------------- */
4974
4975 /**
4976  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4977  */
4978 static void
4979 genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
4980                                      MAYBE_UNUSED struct brw_stage_state *stage_state)
4981 {
4982 #if GEN_GEN >= 7
4983    static const uint16_t packet_headers[] = {
4984       [MESA_SHADER_VERTEX] = 43,
4985       [MESA_SHADER_TESS_CTRL] = 44,
4986       [MESA_SHADER_TESS_EVAL] = 45,
4987       [MESA_SHADER_GEOMETRY] = 46,
4988       [MESA_SHADER_FRAGMENT] = 47,
4989    };
4990
4991    /* Ivybridge requires a workaround flush before VS packets. */
4992    if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
4993        stage_state->stage == MESA_SHADER_VERTEX) {
4994       gen7_emit_vs_workaround_flush(brw);
4995    }
4996
4997    brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4998       ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4999       ptr.PointertoVSSamplerState = stage_state->sampler_offset;
5000    }
5001 #endif
5002 }
5003
5004 UNUSED static bool
5005 has_component(mesa_format format, int i)
5006 {
5007    if (_mesa_is_format_color_format(format))
5008       return _mesa_format_has_color_component(format, i);
5009
5010    /* depth and stencil have only one component */
5011    return i == 0;
5012 }
5013
5014 /**
5015  * Upload SAMPLER_BORDER_COLOR_STATE.
5016  */
5017 static void
5018 genX(upload_default_color)(struct brw_context *brw,
5019                            const struct gl_sampler_object *sampler,
5020                            MAYBE_UNUSED mesa_format format, GLenum base_format,
5021                            bool is_integer_format, bool is_stencil_sampling,
5022                            uint32_t *sdc_offset)
5023 {
5024    union gl_color_union color;
5025
5026    switch (base_format) {
5027    case GL_DEPTH_COMPONENT:
5028       /* GL specs that border color for depth textures is taken from the
5029        * R channel, while the hardware uses A.  Spam R into all the
5030        * channels for safety.
5031        */
5032       color.ui[0] = sampler->BorderColor.ui[0];
5033       color.ui[1] = sampler->BorderColor.ui[0];
5034       color.ui[2] = sampler->BorderColor.ui[0];
5035       color.ui[3] = sampler->BorderColor.ui[0];
5036       break;
5037    case GL_ALPHA:
5038       color.ui[0] = 0u;
5039       color.ui[1] = 0u;
5040       color.ui[2] = 0u;
5041       color.ui[3] = sampler->BorderColor.ui[3];
5042       break;
5043    case GL_INTENSITY:
5044       color.ui[0] = sampler->BorderColor.ui[0];
5045       color.ui[1] = sampler->BorderColor.ui[0];
5046       color.ui[2] = sampler->BorderColor.ui[0];
5047       color.ui[3] = sampler->BorderColor.ui[0];
5048       break;
5049    case GL_LUMINANCE:
5050       color.ui[0] = sampler->BorderColor.ui[0];
5051       color.ui[1] = sampler->BorderColor.ui[0];
5052       color.ui[2] = sampler->BorderColor.ui[0];
5053       color.ui[3] = float_as_int(1.0);
5054       break;
5055    case GL_LUMINANCE_ALPHA:
5056       color.ui[0] = sampler->BorderColor.ui[0];
5057       color.ui[1] = sampler->BorderColor.ui[0];
5058       color.ui[2] = sampler->BorderColor.ui[0];
5059       color.ui[3] = sampler->BorderColor.ui[3];
5060       break;
5061    default:
5062       color.ui[0] = sampler->BorderColor.ui[0];
5063       color.ui[1] = sampler->BorderColor.ui[1];
5064       color.ui[2] = sampler->BorderColor.ui[2];
5065       color.ui[3] = sampler->BorderColor.ui[3];
5066       break;
5067    }
5068
5069    /* In some cases we use an RGBA surface format for GL RGB textures,
5070     * where we've initialized the A channel to 1.0.  We also have to set
5071     * the border color alpha to 1.0 in that case.
5072     */
5073    if (base_format == GL_RGB)
5074       color.ui[3] = float_as_int(1.0);
5075
5076    int alignment = 32;
5077    if (GEN_GEN >= 8) {
5078       alignment = 64;
5079    } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
5080       alignment = 512;
5081    }
5082
5083    uint32_t *sdc = brw_state_batch(
5084       brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
5085       alignment, sdc_offset);
5086
5087    struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
5088
5089 #define ASSIGN(dst, src) \
5090    do {                  \
5091       dst = src;         \
5092    } while (0)
5093
5094 #define ASSIGNu16(dst, src) \
5095    do {                     \
5096       dst = (uint16_t)src;  \
5097    } while (0)
5098
5099 #define ASSIGNu8(dst, src) \
5100    do {                    \
5101       dst = (uint8_t)src;  \
5102    } while (0)
5103
5104 #define BORDER_COLOR_ATTR(macro, _color_type, src)              \
5105    macro(state.BorderColor ## _color_type ## Red, src[0]);   \
5106    macro(state.BorderColor ## _color_type ## Green, src[1]);   \
5107    macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
5108    macro(state.BorderColor ## _color_type ## Alpha, src[3]);
5109
5110 #if GEN_GEN >= 8
5111    /* On Broadwell, the border color is represented as four 32-bit floats,
5112     * integers, or unsigned values, interpreted according to the surface
5113     * format.  This matches the sampler->BorderColor union exactly; just
5114     * memcpy the values.
5115     */
5116    BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
5117 #elif GEN_IS_HASWELL
5118    if (is_integer_format || is_stencil_sampling) {
5119       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
5120       const int bits_per_channel =
5121          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
5122
5123       /* From the Haswell PRM, "Command Reference: Structures", Page 36:
5124        * "If any color channel is missing from the surface format,
5125        *  corresponding border color should be programmed as zero and if
5126        *  alpha channel is missing, corresponding Alpha border color should
5127        *  be programmed as 1."
5128        */
5129       unsigned c[4] = { 0, 0, 0, 1 };
5130       for (int i = 0; i < 4; i++) {
5131          if (has_component(format, i))
5132             c[i] = color.ui[i];
5133       }
5134
5135       switch (bits_per_channel) {
5136       case 8:
5137          /* Copy RGBA in order. */
5138          BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
5139          break;
5140       case 10:
5141          /* R10G10B10A2_UINT is treated like a 16-bit format. */
5142       case 16:
5143          BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
5144          break;
5145       case 32:
5146          if (base_format == GL_RG) {
5147             /* Careful inspection of the tables reveals that for RG32 formats,
5148              * the green channel needs to go where blue normally belongs.
5149              */
5150             state.BorderColor32bitRed = c[0];
5151             state.BorderColor32bitBlue = c[1];
5152             state.BorderColor32bitAlpha = 1;
5153          } else {
5154             /* Copy RGBA in order. */
5155             BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
5156          }
5157          break;
5158       default:
5159          assert(!"Invalid number of bits per channel in integer format.");
5160          break;
5161       }
5162    } else {
5163       BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5164    }
5165 #elif GEN_GEN == 5 || GEN_GEN == 6
5166    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
5167    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
5168    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
5169
5170 #define MESA_FLOAT_TO_HALF(dst, src) \
5171    dst = _mesa_float_to_half(src);
5172
5173    BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
5174
5175 #undef MESA_FLOAT_TO_HALF
5176
5177    state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
5178    state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
5179    state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
5180    state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
5181
5182    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5183 #elif GEN_GEN == 4
5184    BORDER_COLOR_ATTR(ASSIGN, , color.f);
5185 #else
5186    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5187 #endif
5188
5189 #undef ASSIGN
5190 #undef BORDER_COLOR_ATTR
5191
5192    GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
5193 }
5194
5195 static uint32_t
5196 translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
5197 {
5198    switch (wrap) {
5199    case GL_REPEAT:
5200       return TCM_WRAP;
5201    case GL_CLAMP:
5202 #if GEN_GEN >= 8
5203       /* GL_CLAMP is the weird mode where coordinates are clamped to
5204        * [0.0, 1.0], so linear filtering of coordinates outside of
5205        * [0.0, 1.0] give you half edge texel value and half border
5206        * color.
5207        *
5208        * Gen8+ supports this natively.
5209        */
5210       return TCM_HALF_BORDER;
5211 #else
5212       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
5213        * and set clamp_border here, which gets the result desired.
5214        * We just use clamp(_to_edge) for nearest, because for nearest
5215        * clamping to 1.0 gives border color instead of the desired
5216        * edge texels.
5217        */
5218       if (using_nearest)
5219          return TCM_CLAMP;
5220       else
5221          return TCM_CLAMP_BORDER;
5222 #endif
5223    case GL_CLAMP_TO_EDGE:
5224       return TCM_CLAMP;
5225    case GL_CLAMP_TO_BORDER:
5226       return TCM_CLAMP_BORDER;
5227    case GL_MIRRORED_REPEAT:
5228       return TCM_MIRROR;
5229    case GL_MIRROR_CLAMP_TO_EDGE:
5230       return TCM_MIRROR_ONCE;
5231    default:
5232       return TCM_WRAP;
5233    }
5234 }
5235
5236 /**
5237  * Return true if the given wrap mode requires the border color to exist.
5238  */
5239 static bool
5240 wrap_mode_needs_border_color(unsigned wrap_mode)
5241 {
5242 #if GEN_GEN >= 8
5243    return wrap_mode == TCM_CLAMP_BORDER ||
5244           wrap_mode == TCM_HALF_BORDER;
5245 #else
5246    return wrap_mode == TCM_CLAMP_BORDER;
5247 #endif
5248 }
5249
5250 /**
5251  * Sets the sampler state for a single unit based off of the sampler key
5252  * entry.
5253  */
5254 static void
5255 genX(update_sampler_state)(struct brw_context *brw,
5256                            GLenum target, bool tex_cube_map_seamless,
5257                            GLfloat tex_unit_lod_bias,
5258                            mesa_format format, GLenum base_format,
5259                            const struct gl_texture_object *texObj,
5260                            const struct gl_sampler_object *sampler,
5261                            uint32_t *sampler_state)
5262 {
5263    struct GENX(SAMPLER_STATE) samp_st = { 0 };
5264
5265    /* Select min and mip filters. */
5266    switch (sampler->MinFilter) {
5267    case GL_NEAREST:
5268       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5269       samp_st.MipModeFilter = MIPFILTER_NONE;
5270       break;
5271    case GL_LINEAR:
5272       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5273       samp_st.MipModeFilter = MIPFILTER_NONE;
5274       break;
5275    case GL_NEAREST_MIPMAP_NEAREST:
5276       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5277       samp_st.MipModeFilter = MIPFILTER_NEAREST;
5278       break;
5279    case GL_LINEAR_MIPMAP_NEAREST:
5280       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5281       samp_st.MipModeFilter = MIPFILTER_NEAREST;
5282       break;
5283    case GL_NEAREST_MIPMAP_LINEAR:
5284       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5285       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5286       break;
5287    case GL_LINEAR_MIPMAP_LINEAR:
5288       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5289       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5290       break;
5291    default:
5292       unreachable("not reached");
5293    }
5294
5295    /* Select mag filter. */
5296    samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5297       MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5298
5299    /* Enable anisotropic filtering if desired. */
5300    samp_st.MaximumAnisotropy = RATIO21;
5301
5302    if (sampler->MaxAnisotropy > 1.0f) {
5303       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5304          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5305       if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5306          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5307
5308       if (sampler->MaxAnisotropy > 2.0f) {
5309          samp_st.MaximumAnisotropy =
5310             MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5311       }
5312    }
5313
5314    /* Set address rounding bits if not using nearest filtering. */
5315    if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5316       samp_st.UAddressMinFilterRoundingEnable = true;
5317       samp_st.VAddressMinFilterRoundingEnable = true;
5318       samp_st.RAddressMinFilterRoundingEnable = true;
5319    }
5320
5321    if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5322       samp_st.UAddressMagFilterRoundingEnable = true;
5323       samp_st.VAddressMagFilterRoundingEnable = true;
5324       samp_st.RAddressMagFilterRoundingEnable = true;
5325    }
5326
5327    bool either_nearest =
5328       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5329    unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
5330    unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
5331    unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
5332
5333    if (target == GL_TEXTURE_CUBE_MAP ||
5334        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5335       /* Cube maps must use the same wrap mode for all three coordinate
5336        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5337        *
5338        * Ivybridge and Baytrail seem to have problems with CUBE mode and
5339        * integer formats.  Fall back to CLAMP for now.
5340        */
5341       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5342           !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5343          wrap_s = TCM_CUBE;
5344          wrap_t = TCM_CUBE;
5345          wrap_r = TCM_CUBE;
5346       } else {
5347          wrap_s = TCM_CLAMP;
5348          wrap_t = TCM_CLAMP;
5349          wrap_r = TCM_CLAMP;
5350       }
5351    } else if (target == GL_TEXTURE_1D) {
5352       /* There's a bug in 1D texture sampling - it actually pays
5353        * attention to the wrap_t value, though it should not.
5354        * Override the wrap_t value here to GL_REPEAT to keep
5355        * any nonexistent border pixels from floating in.
5356        */
5357       wrap_t = TCM_WRAP;
5358    }
5359
5360    samp_st.TCXAddressControlMode = wrap_s;
5361    samp_st.TCYAddressControlMode = wrap_t;
5362    samp_st.TCZAddressControlMode = wrap_r;
5363
5364    samp_st.ShadowFunction =
5365       sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5366       intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5367
5368 #if GEN_GEN >= 7
5369    /* Set shadow function. */
5370    samp_st.AnisotropicAlgorithm =
5371       samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5372       EWAApproximation : LEGACY;
5373 #endif
5374
5375 #if GEN_GEN >= 6
5376    samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5377 #endif
5378
5379    const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5380    samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5381    samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5382    samp_st.TextureLODBias =
5383       CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5384
5385 #if GEN_GEN == 6
5386    samp_st.BaseMipLevel =
5387       CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5388    samp_st.MinandMagStateNotEqual =
5389       samp_st.MinModeFilter != samp_st.MagModeFilter;
5390 #endif
5391
5392    /* Upload the border color if necessary.  If not, just point it at
5393     * offset 0 (the start of the batch) - the color should be ignored,
5394     * but that address won't fault in case something reads it anyway.
5395     */
5396    uint32_t border_color_offset = 0;
5397    if (wrap_mode_needs_border_color(wrap_s) ||
5398        wrap_mode_needs_border_color(wrap_t) ||
5399        wrap_mode_needs_border_color(wrap_r)) {
5400       genX(upload_default_color)(brw, sampler, format, base_format,
5401                                  texObj->_IsIntegerFormat,
5402                                  texObj->StencilSampling,
5403                                  &border_color_offset);
5404    }
5405 #if GEN_GEN < 6
5406       samp_st.BorderColorPointer =
5407          ro_bo(brw->batch.state.bo, border_color_offset);
5408 #else
5409       samp_st.BorderColorPointer = border_color_offset;
5410 #endif
5411
5412 #if GEN_GEN >= 8
5413    samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5414 #else
5415    samp_st.LODPreClampEnable = true;
5416 #endif
5417
5418    GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5419 }
5420
5421 static void
5422 update_sampler_state(struct brw_context *brw,
5423                      int unit,
5424                      uint32_t *sampler_state)
5425 {
5426    struct gl_context *ctx = &brw->ctx;
5427    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5428    const struct gl_texture_object *texObj = texUnit->_Current;
5429    const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5430
5431    /* These don't use samplers at all. */
5432    if (texObj->Target == GL_TEXTURE_BUFFER)
5433       return;
5434
5435    struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5436    genX(update_sampler_state)(brw, texObj->Target,
5437                               ctx->Texture.CubeMapSeamless,
5438                               texUnit->LodBias,
5439                               firstImage->TexFormat, firstImage->_BaseFormat,
5440                               texObj, sampler,
5441                               sampler_state);
5442 }
5443
5444 static void
5445 genX(upload_sampler_state_table)(struct brw_context *brw,
5446                                  struct gl_program *prog,
5447                                  struct brw_stage_state *stage_state)
5448 {
5449    struct gl_context *ctx = &brw->ctx;
5450    uint32_t sampler_count = stage_state->sampler_count;
5451
5452    GLbitfield SamplersUsed = prog->SamplersUsed;
5453
5454    if (sampler_count == 0)
5455       return;
5456
5457    /* SAMPLER_STATE is 4 DWords on all platforms. */
5458    const int dwords = GENX(SAMPLER_STATE_length);
5459    const int size_in_bytes = dwords * sizeof(uint32_t);
5460
5461    uint32_t *sampler_state = brw_state_batch(brw,
5462                                              sampler_count * size_in_bytes,
5463                                              32, &stage_state->sampler_offset);
5464    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5465
5466    for (unsigned s = 0; s < sampler_count; s++) {
5467       if (SamplersUsed & (1 << s)) {
5468          const unsigned unit = prog->SamplerUnits[s];
5469          if (ctx->Texture.Unit[unit]._Current) {
5470             update_sampler_state(brw, unit, sampler_state);
5471          }
5472       }
5473
5474       sampler_state += dwords;
5475    }
5476
5477    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5478       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5479       genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5480    } else {
5481       /* Flag that the sampler state table pointer has changed; later atoms
5482        * will handle it.
5483        */
5484       brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5485    }
5486 }
5487
5488 static void
5489 genX(upload_fs_samplers)(struct brw_context *brw)
5490 {
5491    /* BRW_NEW_FRAGMENT_PROGRAM */
5492    struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5493    genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5494 }
5495
5496 static const struct brw_tracked_state genX(fs_samplers) = {
5497    .dirty = {
5498       .mesa = _NEW_TEXTURE,
5499       .brw = BRW_NEW_BATCH |
5500              BRW_NEW_BLORP |
5501              BRW_NEW_FRAGMENT_PROGRAM,
5502    },
5503    .emit = genX(upload_fs_samplers),
5504 };
5505
5506 static void
5507 genX(upload_vs_samplers)(struct brw_context *brw)
5508 {
5509    /* BRW_NEW_VERTEX_PROGRAM */
5510    struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5511    genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5512 }
5513
5514 static const struct brw_tracked_state genX(vs_samplers) = {
5515    .dirty = {
5516       .mesa = _NEW_TEXTURE,
5517       .brw = BRW_NEW_BATCH |
5518              BRW_NEW_BLORP |
5519              BRW_NEW_VERTEX_PROGRAM,
5520    },
5521    .emit = genX(upload_vs_samplers),
5522 };
5523
5524 #if GEN_GEN >= 6
5525 static void
5526 genX(upload_gs_samplers)(struct brw_context *brw)
5527 {
5528    /* BRW_NEW_GEOMETRY_PROGRAM */
5529    struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
5530    if (!gs)
5531       return;
5532
5533    genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
5534 }
5535
5536
5537 static const struct brw_tracked_state genX(gs_samplers) = {
5538    .dirty = {
5539       .mesa = _NEW_TEXTURE,
5540       .brw = BRW_NEW_BATCH |
5541              BRW_NEW_BLORP |
5542              BRW_NEW_GEOMETRY_PROGRAM,
5543    },
5544    .emit = genX(upload_gs_samplers),
5545 };
5546 #endif
5547
5548 #if GEN_GEN >= 7
5549 static void
5550 genX(upload_tcs_samplers)(struct brw_context *brw)
5551 {
5552    /* BRW_NEW_TESS_PROGRAMS */
5553    struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
5554    if (!tcs)
5555       return;
5556
5557    genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
5558 }
5559
5560 static const struct brw_tracked_state genX(tcs_samplers) = {
5561    .dirty = {
5562       .mesa = _NEW_TEXTURE,
5563       .brw = BRW_NEW_BATCH |
5564              BRW_NEW_BLORP |
5565              BRW_NEW_TESS_PROGRAMS,
5566    },
5567    .emit = genX(upload_tcs_samplers),
5568 };
5569 #endif
5570
5571 #if GEN_GEN >= 7
5572 static void
5573 genX(upload_tes_samplers)(struct brw_context *brw)
5574 {
5575    /* BRW_NEW_TESS_PROGRAMS */
5576    struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
5577    if (!tes)
5578       return;
5579
5580    genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
5581 }
5582
5583 static const struct brw_tracked_state genX(tes_samplers) = {
5584    .dirty = {
5585       .mesa = _NEW_TEXTURE,
5586       .brw = BRW_NEW_BATCH |
5587              BRW_NEW_BLORP |
5588              BRW_NEW_TESS_PROGRAMS,
5589    },
5590    .emit = genX(upload_tes_samplers),
5591 };
5592 #endif
5593
5594 #if GEN_GEN >= 7
5595 static void
5596 genX(upload_cs_samplers)(struct brw_context *brw)
5597 {
5598    /* BRW_NEW_COMPUTE_PROGRAM */
5599    struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
5600    if (!cs)
5601       return;
5602
5603    genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
5604 }
5605
5606 const struct brw_tracked_state genX(cs_samplers) = {
5607    .dirty = {
5608       .mesa = _NEW_TEXTURE,
5609       .brw = BRW_NEW_BATCH |
5610              BRW_NEW_BLORP |
5611              BRW_NEW_COMPUTE_PROGRAM,
5612    },
5613    .emit = genX(upload_cs_samplers),
5614 };
5615 #endif
5616
5617 /* ---------------------------------------------------------------------- */
5618
5619 #if GEN_GEN <= 5
5620
5621 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5622 {
5623    struct gl_context *ctx = &brw->ctx;
5624
5625    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5626       blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5627       blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5628       blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5629       blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5630    }
5631 }
5632
5633 static const struct brw_tracked_state genX(blend_constant_color) = {
5634    .dirty = {
5635       .mesa = _NEW_COLOR,
5636       .brw = BRW_NEW_CONTEXT |
5637              BRW_NEW_BLORP,
5638    },
5639    .emit = genX(upload_blend_constant_color)
5640 };
5641 #endif
5642
5643 /* ---------------------------------------------------------------------- */
5644
/**
 * Register this hardware generation's tracked-state atom tables with the
 * context.
 *
 * Exactly one of the GEN_GEN-conditional branches below defines the
 * render_atoms[] table (GEN_GEN < 6, == 6, == 7 or >= 8).  The table is
 * checked at compile time against the size of brw->render_atoms and then
 * passed to brw_copy_pipeline_atoms() for BRW_RENDER_PIPELINE.
 *
 * When GEN_GEN >= 7 a second table, compute_atoms[], is handled the same
 * way for BRW_COMPUTE_PIPELINE, and brw->vtbl.emit_mi_report_perf_count is
 * pointed at this generation's genX(emit_mi_report_perf_count).
 *
 * NOTE(review): the ordering remarks inside the tables ("must come before",
 * "Before vs_state", ...) imply the atoms are processed in array order;
 * confirm against brw_copy_pipeline_atoms() and its consumers before
 * reordering any entry.
 *
 * \param brw  context that receives the atom tables
 */
void
genX(init_atoms)(struct brw_context *brw)
{
#if GEN_GEN < 6
   /* Render table for generations before Gen6. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Once all the programs are done, we know how large urb entry
       * sizes need to be and can decide if we need to change the urb
       * layout.
       */
      &brw_curbe_offsets,
      &brw_recalculate_urb_fence,

      &genX(cc_vp),
      &genX(color_calc_state),

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_wm_pull_constants,
      &brw_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &brw_vs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),

      /* These set up state for brw_psp_urb_cbs */
      &genX(wm_state),
      &genX(sf_clip_viewport),
      &genX(sf_state),
      &genX(vs_state), /* always required, enabled or not */
      &genX(clip_state),
      &genX(gs_state),

      /* Command packets:
       */
      &brw_binding_table_pointers,
      &genX(blend_constant_color),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &brw_psp_urb_cbs,

      &genX(drawing_rect),
      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

      &brw_constant_buffer
   };
#elif GEN_GEN == 6
   /* Render table for Gen6. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(sf_clip_viewport),

      /* Command packets: */

      &genX(cc_vp),

      &gen6_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_state */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &gen6_sol_surface,
      &brw_vs_binding_table,
      &gen6_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(gs_samplers),
      &gen6_sampler_state,
      &genX(multisample_state),

      &genX(vs_state),
      &genX(gs_state),
      &genX(clip_state),
      &genX(sf_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gen6_binding_table_pointers,

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),
   };
#elif GEN_GEN == 7
   /* Render table for Gen7; the cut_index atom is appended only when
    * GEN_IS_HASWELL is set.
    */
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Command packets: */

      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(wm_state),
      &genX(ps_state),

      &genX(scissor_state),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

#if GEN_IS_HASWELL
      &genX(cut_index),
#endif
   };
#elif GEN_GEN >= 8
   /* Render table for Gen8 and later. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),
      &genX(color_calc_state),

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(raster_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(ps_blend),
      &genX(ps_extra),
      &genX(ps_state),
      &genX(depth_stencil_state),
      &genX(wm_state),

      &genX(scissor_state),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &genX(vf_topology),

      &brw_indices,
      &genX(index_buffer),
      &genX(vertices),

      &genX(cut_index),
      &gen8_pma_fix,
   };
#endif

   /* Reject at compile time any table larger than the context's
    * render_atoms array, then hand the table over for the render pipeline.
    */
   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
                           render_atoms, ARRAY_SIZE(render_atoms));

#if GEN_GEN >= 7
   /* Compute table; only built for Gen7 and later. */
   static const struct brw_tracked_state *compute_atoms[] =
   {
      &gen7_l3_state,
      &brw_cs_image_surfaces,
      &genX(cs_push_constants),
      &genX(cs_pull_constants),
      &brw_cs_ubo_surfaces,
      &brw_cs_texture_surfaces,
      &brw_cs_work_groups_surface,
      &genX(cs_samplers),
      &genX(cs_state),
   };

   /* Same size check and hand-over as above, for the compute pipeline. */
   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
                           compute_atoms, ARRAY_SIZE(compute_atoms));

   /* Point the context's vtable entry at this generation's implementation. */
   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
#endif
}