src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * This code is based on original work by Ilia Mirkin.
  24  */
  25
  26 /**
  27  * \file gen6_gs_visitor.cpp
  28  *
  29  * Gen6 geometry shader implementation
  30  */
  31
  32 #include "gen6_gs_visitor.h"
  33
  34 const unsigned MAX_GS_INPUT_VERTICES = 6;
  35
  36 namespace brw {
  37
  38 void
  39 gen6_gs_visitor::assign_binding_table_offsets()
  40 {
  41    /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
  42     * feedback surfaces.
  43     */
  44    assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
  45 }
  46
  47 void
  48 gen6_gs_visitor::emit_prolog()
  49 {
  50    vec4_gs_visitor::emit_prolog();
  51
  52    /* Gen6 geometry shaders require to allocate an initial VUE handle via
  53     * FF_SYNC message, however the documentation remarks that only one thread
  54     * can write to the URB simultaneously and the FF_SYNC message provides the
  55     * synchronization mechanism for this, so using this message effectively
  56     * stalls the thread until it is its turn to write to the URB. Because of
  57     * this, the best way to implement geometry shader algorithms in gen6 is to
  58     * execute the algorithm before the FF_SYNC message to maximize parallelism.
  59     *
  60     * To achieve this we buffer the geometry shader outputs for each emitted
  61     * vertex in vertex_output during operation. Then, when we have processed
  62     * the last vertex (that is, at thread end time), we send the FF_SYNC
  63     * message to allocate the initial VUE handle and write all buffered vertex
  64     * data to the URB in one go.
  65     *
  66     * For each emitted vertex, vertex_output will hold vue_map.num_slots
  67     * data items plus one additional item to hold required flags
  68     * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
  69     * which come right after the data items for that vertex. Vertex data and
  70     * flags for the next vertex come right after the data items and flags for
  71     * the previous vertex.
  72     */
  73    this->current_annotation = "gen6 prolog";
  74    this->vertex_output = src_reg(this,
  75                                  glsl_type::uint_type,
  76                                  (prog_data->vue_map.num_slots + 1) *
  77                                  c->gp->program.VerticesOut);
  78    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
  79    emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
  80
  81    /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
  82     * so initialize it once to R0.
  83     */
  84    vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
  85                                      retype(brw_vec8_grf(0, 0),
  86                                             BRW_REGISTER_TYPE_UD)));
  87    inst->force_writemask_all = true;
  88
  89    /* This will be used as a temporary to store writeback data of FF_SYNC
  90     * and URB_WRITE messages.
  91     */
  92    this->temp = src_reg(this, glsl_type::uint_type);
  93
  94    /* This will be used to know when we are processing the first vertex of
  95     * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
  96     * that we are processing the first vertex in the primitive and to zero
  97     * otherwise. This way we can use its value directly in the URB write
  98     * headers.
  99     */
 100    this->first_vertex = src_reg(this, glsl_type::uint_type);
 101    emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
 102
 103    /* The FF_SYNC message requires to know the number of primitives generated,
 104     * so keep a counter for this.
 105     */
 106    this->prim_count = src_reg(this, glsl_type::uint_type);
 107    emit(MOV(dst_reg(this->prim_count), 0u));
 108
 109    if (c->prog_data.gen6_xfb_enabled) {
 110       /* Create a virtual register to hold destination indices in SOL */
 111       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
 112       /* Create a virtual register to hold number of written primitives */
 113       this->sol_prim_written = src_reg(this, glsl_type::uint_type);
 114       /* Create a virtual register to hold Streamed Vertex Buffer Indices */
 115       this->svbi = src_reg(this, glsl_type::uvec4_type);
 116       /* Create a virtual register to hold max values of SVBI */
 117       this->max_svbi = src_reg(this, glsl_type::uvec4_type);
 118       emit(MOV(dst_reg(this->max_svbi),
 119                src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
 120
 121       xfb_setup();
 122    }
 123
 124    /* PrimitveID is delivered in r0.1 of the thread payload. If the program
 125     * needs it we have to move it to a separate register where we can map
 126     * the atttribute.
 127     *
 128     * Notice that we cannot use a virtual register for this, because we need to
 129     * map all input attributes to hardware registers in setup_payload(),
 130     * which happens before virtual registers are mapped to hardware registers.
 131     * We could work around that issue if we were able to compute the first
 132     * non-payload register here and move the PrimitiveID information to that
 133     * register, but we can't because at this point we don't know the final
 134     * number uniforms that will be included in the payload.
 135     *
 136     * So, what we do is to place PrimitiveID information in r1, which is always
 137     * delivered as part of the payload, but its only populated with data
 138     * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
 139     * in the 3DSTATE_GS state packet. That information can be obtained by other
 140     * means though, so we can safely use r1 for this purpose.
 141     */
 142    if (c->prog_data.include_primitive_id) {
 143       this->primitive_id =
 144          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 145       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
 146    }
 147 }
 148
 149 void
 150 gen6_gs_visitor::visit(ir_emit_vertex *ir)
 151 {
 152    gs_emit_vertex(ir->stream_id());
 153 }
 154 void
 155 gen6_gs_visitor::gs_emit_vertex(int stream_id)
 156 {
 157    this->current_annotation = "gen6 emit vertex";
 158    /* Honor max_vertex layout indication in geometry shader by ignoring any
 159     * vertices coming after c->gp->program.VerticesOut.
 160     */
 161    unsigned num_output_vertices = c->gp->program.VerticesOut;
 162    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
 163             BRW_CONDITIONAL_L));
 164    emit(IF(BRW_PREDICATE_NORMAL));
 165    {
 166       /* Buffer all output slots for this vertex in vertex_output */
 167       for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
 168          int varying = prog_data->vue_map.slot_to_varying[slot];
 169          if (varying != VARYING_SLOT_PSIZ) {
 170             dst_reg dst(this->vertex_output);
 171             dst.reladdr = ralloc(mem_ctx, src_reg);
 172             memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 173             emit_urb_slot(dst, varying);
 174          } else {
 175             /* The PSIZ slot can pack multiple varyings in different channels
 176              * and emit_urb_slot() will produce a MOV instruction for each of
 177              * them. Since we are writing to an array, that will translate to
 178              * possibly multiple MOV instructions with an array destination and
 179              * each will generate a scratch write with the same offset into
 180              * scratch space (thus, each one overwriting the previous). This is
 181              * not what we want. What we will do instead is emit PSIZ to a
 182              * a regular temporary register, then move that resgister into the
 183              * array. This way we only have one instruction with an array
 184              * destination and we only produce a single scratch write.
 185              */
 186             dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
 187             emit_urb_slot(tmp, varying);
 188             dst_reg dst(this->vertex_output);
 189             dst.reladdr = ralloc(mem_ctx, src_reg);
 190             memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 191             vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
 192             inst->force_writemask_all = true;
 193          }
 194
 195          emit(ADD(dst_reg(this->vertex_output_offset),
 196                   this->vertex_output_offset, 1u));
 197       }
 198
 199       /* Now buffer flags for this vertex */
 200       dst_reg dst(this->vertex_output);
 201       dst.reladdr = ralloc(mem_ctx, src_reg);
 202       memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 203       if (c->gp->program.OutputType == GL_POINTS) {
 204          /* If we are outputting points, then every vertex has PrimStart and
 205           * PrimEnd set.
 206           */
 207          emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
 208                   URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
 209          emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 210       } else {
 211          /* Otherwise, we can only set the PrimStart flag, which we have stored
 212           * in the first_vertex register. We will have to wait until we execute
 213           * EndPrimitive() or we end the thread to set the PrimEnd flag on a
 214           * vertex.
 215           */
 216          emit(OR(dst, this->first_vertex,
 217                  (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
 218          emit(MOV(dst_reg(this->first_vertex), 0u));
 219       }
 220       emit(ADD(dst_reg(this->vertex_output_offset),
 221                this->vertex_output_offset, 1u));
 222
 223       /* Update vertex count */
 224       emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
 225    }
 226    emit(BRW_OPCODE_ENDIF);
 227 }
 228
 229 void
 230 gen6_gs_visitor::visit(ir_end_primitive *)
 231 {
 232    gs_end_primitive();
 233 }
 234
 235 void
 236 gen6_gs_visitor::gs_end_primitive()
 237 {
 238    this->current_annotation = "gen6 end primitive";
 239    /* Calling EndPrimitive() is optional for point output. In this case we set
 240     * the PrimEnd flag when we process EmitVertex().
 241     */
 242    if (c->gp->program.OutputType == GL_POINTS)
 243       return;
 244
 245    /* Otherwise we know that the last vertex we have processed was the last
 246     * vertex in the primitive and we need to set its PrimEnd flag, so do this
 247     * unless we haven't emitted that vertex at all (vertex_count != 0).
 248     *
 249     * Notice that we have already incremented vertex_count when we processed
 250     * the last emit_vertex, so we need to take that into account in the
 251     * comparison below (hence the num_output_vertices + 1 in the comparison
 252     * below).
 253     */
 254    unsigned num_output_vertices = c->gp->program.VerticesOut;
 255    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
 256             BRW_CONDITIONAL_L));
 257    vec4_instruction *inst = emit(CMP(dst_null_d(),
 258                                      this->vertex_count, 0u,
 259                                      BRW_CONDITIONAL_NEQ));
 260    inst->predicate = BRW_PREDICATE_NORMAL;
 261    emit(IF(BRW_PREDICATE_NORMAL));
 262    {
 263       /* vertex_output_offset is already pointing at the first entry of the
 264        * next vertex. So subtract 1 to modify the flags for the previous
 265        * vertex.
 266        */
 267       src_reg offset(this, glsl_type::uint_type);
 268       emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
 269
 270       src_reg dst(this->vertex_output);
 271       dst.reladdr = ralloc(mem_ctx, src_reg);
 272       memcpy(dst.reladdr, &offset, sizeof(src_reg));
 273
 274       emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
 275       emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 276
 277       /* Set the first vertex flag to indicate that the next vertex will start
 278        * a primitive.
 279        */
 280       emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
 281    }
 282    emit(BRW_OPCODE_ENDIF);
 283 }
 284
 285 void
 286 gen6_gs_visitor::emit_urb_write_header(int mrf)
 287 {
 288    this->current_annotation = "gen6 urb header";
 289    /* Compute offset of the flags for the current vertex in vertex_output and
 290     * write them in dw2 of the message header.
 291     *
 292     * Notice that by the time that emit_thread_end() calls here
 293     * vertex_output_offset should point to the first data item of the current
 294     * vertex in vertex_output, thus we only need to add the number of output
 295     * slots per vertex to that offset to obtain the flags data offset.
 296     */
 297    src_reg flags_offset(this, glsl_type::uint_type);
 298    emit(ADD(dst_reg(flags_offset),
 299             this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
 300
 301    src_reg flags_data(this->vertex_output);
 302    flags_data.reladdr = ralloc(mem_ctx, src_reg);
 303    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
 304
 305    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
 306 }
 307
 308 void
 309 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
 310                                        int last_mrf, int urb_offset)
 311 {
 312    vec4_instruction *inst = NULL;
 313
 314    if (!complete) {
 315       /* If the vertex is not complete we don't have to do anything special */
 316       inst = emit(GS_OPCODE_URB_WRITE);
 317       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
 318    } else {
 319       /* Otherwise we always request to allocate a new VUE handle. If this is
 320        * the last write before the EOT message and the new handle never gets
 321        * used it will be dereferenced when we send the EOT message. This is
 322        * necessary to avoid different setups for the EOT message (one for the
 323        * case when there is no output and another for the case when there is)
 324        * which would require to end the program with an IF/ELSE/ENDIF block,
 325        * something we do not want.
 326        */
 327       inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
 328       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
 329       inst->dst = dst_reg(MRF, base_mrf);
 330       inst->src[0] = this->temp;
 331    }
 332
 333    inst->base_mrf = base_mrf;
 334    /* URB data written (does not include the message header reg) must
 335     * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
 336     * section 5.4.3.2.2: URB_INTERLEAVED.
 337     */
 338    int mlen = last_mrf - base_mrf;
 339    if ((mlen % 2) != 1)
 340       mlen++;
 341    inst->mlen = mlen;
 342    inst->offset = urb_offset;
 343 }
 344
 345 void
 346 gen6_gs_visitor::emit_thread_end()
 347 {
 348    /* Make sure the current primitive is ended: we know it is not ended when
 349     * first_vertex is not zero. This is only relevant for outputs other than
 350     * points because in the point case we set PrimEnd on all vertices.
 351     */
 352    if (c->gp->program.OutputType != GL_POINTS) {
 353       emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
 354       emit(IF(BRW_PREDICATE_NORMAL));
 355       {
 356          visit((ir_end_primitive *) NULL);
 357       }
 358       emit(BRW_OPCODE_ENDIF);
 359    }
 360
 361    /* Here we have to:
 362     * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
 363     * 2) Loop over all buffered vertex data and write it to corresponding
 364     *    URB entries.
 365     * 3) Allocate new VUE handles for all vertices other than the first.
 366     * 4) Send a final EOT message.
 367     */
 368
 369    /* MRF 0 is reserved for the debugger, so start with message header
 370     * in MRF 1.
 371     */
 372    int base_mrf = 1;
 373
 374    /* In the process of generating our URB write message contents, we
 375     * may need to unspill a register or load from an array.  Those
 376     * reads would use MRFs 14-15.
 377     */
 378    int max_usable_mrf = 13;
 379
 380    /* Issue the FF_SYNC message and obtain the initial VUE handle. */
 381    emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
 382    emit(IF(BRW_PREDICATE_NORMAL));
 383    {
 384       this->current_annotation = "gen6 thread end: ff_sync";
 385
 386       vec4_instruction *inst;
 387       if (c->prog_data.gen6_xfb_enabled) {
 388          src_reg sol_temp(this, glsl_type::uvec4_type);
 389          emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
 390               dst_reg(this->svbi),
 391               this->vertex_count,
 392               this->prim_count,
 393               sol_temp);
 394          inst = emit(GS_OPCODE_FF_SYNC,
 395                      dst_reg(this->temp), this->prim_count, this->svbi);
 396       } else {
 397          inst = emit(GS_OPCODE_FF_SYNC,
 398                      dst_reg(this->temp), this->prim_count, src_reg(0u));
 399       }
 400       inst->base_mrf = base_mrf;
 401
 402       /* Loop over all buffered vertices and emit URB write messages */
 403       this->current_annotation = "gen6 thread end: urb writes init";
 404       src_reg vertex(this, glsl_type::uint_type);
 405       emit(MOV(dst_reg(vertex), 0u));
 406       emit(MOV(dst_reg(this->vertex_output_offset), 0u));
 407
 408       this->current_annotation = "gen6 thread end: urb writes";
 409       emit(BRW_OPCODE_DO);
 410       {
 411          emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
 412          inst = emit(BRW_OPCODE_BREAK);
 413          inst->predicate = BRW_PREDICATE_NORMAL;
 414
 415          /* First we prepare the message header */
 416          emit_urb_write_header(base_mrf);
 417
 418          /* Then add vertex data to the message in interleaved fashion */
 419          int slot = 0;
 420          bool complete = false;
 421          do {
 422             int mrf = base_mrf + 1;
 423
 424             /* URB offset is in URB row increments, and each of our MRFs is half
 425              * of one of those, since we're doing interleaved writes.
 426              */
 427             int urb_offset = slot / 2;
 428
 429             for (; slot < prog_data->vue_map.num_slots; ++slot) {
 430                int varying = prog_data->vue_map.slot_to_varying[slot];
 431                current_annotation = output_reg_annotation[varying];
 432
 433                /* Compute offset of this slot for the current vertex
 434                 * in vertex_output
 435                 */
 436                src_reg data(this->vertex_output);
 437                data.reladdr = ralloc(mem_ctx, src_reg);
 438                memcpy(data.reladdr, &this->vertex_output_offset,
 439                       sizeof(src_reg));
 440
 441                /* Copy this slot to the appropriate message register */
 442                dst_reg reg = dst_reg(MRF, mrf);
 443                reg.type = output_reg[varying].type;
 444                data.type = reg.type;
 445                vec4_instruction *inst = emit(MOV(reg, data));
 446                inst->force_writemask_all = true;
 447
 448                mrf++;
 449                emit(ADD(dst_reg(this->vertex_output_offset),
 450                         this->vertex_output_offset, 1u));
 451
 452                /* If this was max_usable_mrf, we can't fit anything more into
 453                 * this URB WRITE.
 454                 */
 455                if (mrf > max_usable_mrf) {
 456                   slot++;
 457                   break;
 458                }
 459             }
 460
 461             complete = slot >= prog_data->vue_map.num_slots;
 462             emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
 463          } while (!complete);
 464
 465          /* Skip over the flags data item so that vertex_output_offset points
 466           * to the first data item of the next vertex, so that we can start
 467           * writing the next vertex.
 468           */
 469          emit(ADD(dst_reg(this->vertex_output_offset),
 470                   this->vertex_output_offset, 1u));
 471
 472          emit(ADD(dst_reg(vertex), vertex, 1u));
 473       }
 474       emit(BRW_OPCODE_WHILE);
 475
 476       if (c->prog_data.gen6_xfb_enabled)
 477          xfb_write();
 478    }
 479    emit(BRW_OPCODE_ENDIF);
 480
 481    /* Finally, emit EOT message.
 482     *
 483     * In gen6 we need to end the thread differently depending on whether we have
 484     * emitted at least one vertex or not. In case we did, the EOT message must
 485     * always include the COMPLETE flag or else the GPU hangs. If we have not
 486     * produced any output we can't use the COMPLETE flag.
 487     *
 488     * However, this would lead us to end the program with an ENDIF opcode,
 489     * which we want to avoid, so what we do is that we always request a new
 490     * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
 491     * With this we make sure that whether we have emitted at least one vertex
 492     * or none at all, we have to finish the thread without writing to the URB,
 493     * which works for both cases by setting the COMPLETE and UNUSED flags in
 494     * the EOT message.
 495     */
 496    this->current_annotation = "gen6 thread end: EOT";
 497
 498    if (c->prog_data.gen6_xfb_enabled) {
 499       /* When emitting EOT, set SONumPrimsWritten Increment Value. */
 500       src_reg data(this, glsl_type::uint_type);
 501       emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
 502       emit(SHL(dst_reg(data), data, src_reg(16u)));
 503       emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
 504    }
 505
 506    vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
 507    inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
 508    inst->base_mrf = base_mrf;
 509    inst->mlen = 1;
 510 }
 511
 512 void
 513 gen6_gs_visitor::setup_payload()
 514 {
 515    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
 516
 517    /* Attributes are going to be interleaved, so one register contains two
 518     * attribute slots.
 519     */
 520    int attributes_per_reg = 2;
 521
 522    /* If a geometry shader tries to read from an input that wasn't written by
 523     * the vertex shader, that produces undefined results, but it shouldn't
 524     * crash anything.  So initialize attribute_map to zeros--that ensures that
 525     * these undefined results are read from r0.
 526     */
 527    memset(attribute_map, 0, sizeof(attribute_map));
 528
 529    int reg = 0;
 530
 531    /* The payload always contains important data in r0. */
 532    reg++;
 533
 534    /* r1 is always part of the payload and it holds information relevant
 535     * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
 536     * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
 537     * information (and move the original value to a virtual register if
 538     * necessary).
 539     */
 540    if (c->prog_data.include_primitive_id)
 541       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
 542    reg++;
 543
 544    reg = setup_uniforms(reg);
 545
 546    reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
 547
 548    lower_attributes_to_hw_regs(attribute_map, true);
 549
 550    this->first_non_payload_grf = reg;
 551 }
 552
 553 void
 554 gen6_gs_visitor::xfb_setup()
 555 {
 556    static const unsigned swizzle_for_offset[4] = {
 557       BRW_SWIZZLE4(0, 1, 2, 3),
 558       BRW_SWIZZLE4(1, 2, 3, 3),
 559       BRW_SWIZZLE4(2, 3, 3, 3),
 560       BRW_SWIZZLE4(3, 3, 3, 3)
 561    };
 562
 563    struct brw_gs_prog_data *prog_data =
 564       (struct brw_gs_prog_data *) &c->prog_data;
 565
 566    const struct gl_transform_feedback_info *linked_xfb_info =
 567       &this->shader_prog->LinkedTransformFeedback;
 568    int i;
 569
 570    /* Make sure that the VUE slots won't overflow the unsigned chars in
 571     * prog_data->transform_feedback_bindings[].
 572     */
 573    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
 574
 575    /* Make sure that we don't need more binding table entries than we've
 576     * set aside for use in transform feedback.  (We shouldn't, since we
 577     * set aside enough binding table entries to have one per component).
 578     */
 579    assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
 580
 581    prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
 582    for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
 583       prog_data->transform_feedback_bindings[i] =
 584          linked_xfb_info->Outputs[i].OutputRegister;
 585       prog_data->transform_feedback_swizzles[i] =
 586          swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
 587    }
 588 }
 589
 590 void
 591 gen6_gs_visitor::xfb_write()
 592 {
 593    unsigned num_verts;
 594    struct brw_gs_prog_data *prog_data =
 595       (struct brw_gs_prog_data *) &c->prog_data;
 596
 597    if (!prog_data->num_transform_feedback_bindings)
 598       return;
 599
 600    switch (c->prog_data.output_topology) {
 601    case _3DPRIM_POINTLIST:
 602       num_verts = 1;
 603       break;
 604    case _3DPRIM_LINELIST:
 605    case _3DPRIM_LINESTRIP:
 606    case _3DPRIM_LINELOOP:
 607       num_verts = 2;
 608       break;
 609    case _3DPRIM_TRILIST:
 610    case _3DPRIM_TRIFAN:
 611    case _3DPRIM_TRISTRIP:
 612    case _3DPRIM_RECTLIST:
 613       num_verts = 3;
 614       break;
 615    case _3DPRIM_QUADLIST:
 616    case _3DPRIM_QUADSTRIP:
 617    case _3DPRIM_POLYGON:
 618       num_verts = 3;
 619       break;
 620    default:
 621       unreachable("Unexpected primitive type in Gen6 SOL program.");
 622    }
 623
 624    this->current_annotation = "gen6 thread end: svb writes init";
 625
 626    emit(MOV(dst_reg(this->vertex_output_offset), 0u));
 627    emit(MOV(dst_reg(this->sol_prim_written), 0u));
 628
 629    /* Check that at least one primitive can be written
 630     *
 631     * Note: since we use the binding table to keep track of buffer offsets
 632     * and stride, the GS doesn't need to keep track of a separate pointer
 633     * into each buffer; it uses a single pointer which increments by 1 for
 634     * each vertex.  So we use SVBI0 for this pointer, regardless of whether
 635     * transform feedback is in interleaved or separate attribs mode.
 636     */
 637    src_reg sol_temp(this, glsl_type::uvec4_type);
 638    emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
 639
 640    /* Compare SVBI calculated number with the maximum value, which is
 641     * in R1.4 (previously saved in this->max_svbi) for gen6.
 642     */
 643    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
 644    emit(IF(BRW_PREDICATE_NORMAL));
 645    {
 646       src_reg destination_indices_uw =
 647          retype(destination_indices, BRW_REGISTER_TYPE_UW);
 648
 649       vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
 650                                         brw_imm_v(0x00020100))); /* (0, 1, 2) */
 651       inst->force_writemask_all = true;
 652
 653       emit(ADD(dst_reg(this->destination_indices),
 654                this->destination_indices,
 655                this->svbi));
 656    }
 657    emit(BRW_OPCODE_ENDIF);
 658
 659    /* Write transform feedback data for all processed vertices. */
 660    for (int i = 0; i < c->gp->program.VerticesOut; i++) {
 661       emit(MOV(dst_reg(sol_temp), i));
 662       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
 663                BRW_CONDITIONAL_L));
 664       emit(IF(BRW_PREDICATE_NORMAL));
 665       {
 666          xfb_program(i, num_verts);
 667       }
 668       emit(BRW_OPCODE_ENDIF);
 669    }
 670 }
 671
 672 void
 673 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
 674 {
 675    struct brw_gs_prog_data *prog_data =
 676       (struct brw_gs_prog_data *) &c->prog_data;
 677    unsigned binding;
 678    unsigned num_bindings = prog_data->num_transform_feedback_bindings;
 679    src_reg sol_temp(this, glsl_type::uvec4_type);
 680
 681    /* Check for buffer overflow: we need room to write the complete primitive
 682     * (all vertices). Otherwise, avoid writing any vertices for it
 683     */
 684    emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
 685    emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
 686    emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
 687    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
 688    emit(IF(BRW_PREDICATE_NORMAL));
 689    {
 690       /* Avoid overwriting MRF 1 as it is used as URB write message header */
 691       dst_reg mrf_reg(MRF, 2);
 692
 693       this->current_annotation = "gen6: emit SOL vertex data";
 694       /* For each vertex, generate code to output each varying using the
 695        * appropriate binding table entry.
 696        */
 697       for (binding = 0; binding < num_bindings; ++binding) {
 698          unsigned char varying =
 699             prog_data->transform_feedback_bindings[binding];
 700
 701          /* Set up the correct destination index for this vertex */
 702          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
 703                                        mrf_reg,
 704                                        this->destination_indices);
 705          inst->sol_vertex = vertex % num_verts;
 706
 707          /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
 708           *
 709           *   "Prior to End of Thread with a URB_WRITE, the kernel must
 710           *   ensure that all writes are complete by sending the final
 711           *   write as a committed write."
 712           */
 713          bool final_write = binding == (unsigned) num_bindings - 1 &&
 714                             inst->sol_vertex == num_verts - 1;
 715
 716          /* Compute offset of this varying for the current vertex
 717           * in vertex_output
 718           */
 719          this->current_annotation = output_reg_annotation[varying];
 720          src_reg data(this->vertex_output);
 721          data.reladdr = ralloc(mem_ctx, src_reg);
 722          int offset = get_vertex_output_offset_for_varying(vertex, varying);
 723          emit(MOV(dst_reg(this->vertex_output_offset), offset));
 724          memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 725          data.type = output_reg[varying].type;
 726
 727          /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
 728           * same slot, so make sure we write the appropriate channel
 729           */
 730          if (varying == VARYING_SLOT_PSIZ)
 731             data.swizzle = BRW_SWIZZLE_WWWW;
 732          else if (varying == VARYING_SLOT_LAYER)
 733             data.swizzle = BRW_SWIZZLE_YYYY;
 734          else if (varying == VARYING_SLOT_VIEWPORT)
 735             data.swizzle = BRW_SWIZZLE_ZZZZ;
 736          else
 737             data.swizzle = prog_data->transform_feedback_swizzles[binding];
 738
 739          /* Write data */
 740          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
 741          inst->sol_binding = binding;
 742          inst->sol_final_write = final_write;
 743
 744          if (final_write) {
 745             /* This is the last vertex of the primitive, then increment
 746              * SO num primitive counter and destination indices.
 747              */
 748             emit(ADD(dst_reg(this->destination_indices),
 749                      this->destination_indices,
 750                      src_reg(num_verts)));
 751             emit(ADD(dst_reg(this->sol_prim_written),
 752                      this->sol_prim_written, 1u));
 753          }
 754
 755       }
 756       this->current_annotation = NULL;
 757    }
 758    emit(BRW_OPCODE_ENDIF);
 759 }
 760
 761 int
 762 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
 763 {
 764    /* Find the output slot assigned to this varying.
 765     *
 766     * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
 767     * as VARYING_SLOT_PSIZ.
 768     */
 769    if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
 770       varying = VARYING_SLOT_PSIZ;
 771    int slot = prog_data->vue_map.varying_to_slot[varying];
 772
 773    if (slot < 0) {
 774       /* This varying does not exist in the VUE so we are not writing to it
 775        * and its value is undefined. We still want to return a valid offset
 776        * into vertex_output though, to prevent any out-of-bound accesses into
 777        * the vertex_output array. Since the value for this varying is undefined
 778        * we don't really care for the value we assign to it, so any offset
 779        * within the limits of vertex_output will do.
 780        */
 781       slot = 0;
 782    }
 783
 784    return vertex * (prog_data->vue_map.num_slots + 1) + slot;
 785 }
 786
 787 } /* namespace brw */