src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * This code is based on original work by Ilia Mirkin.
  24  */
  25
  26 /**
  27  * \file gen6_gs_visitor.cpp
  28  *
  29  * Gen6 geometry shader implementation
  30  */
  31
  32 #include "gen6_gs_visitor.h"
  33
  34 const unsigned MAX_GS_INPUT_VERTICES = 6;
  35
  36 namespace brw {
  37
  38 void
  39 gen6_gs_visitor::emit_prolog()
  40 {
  41    vec4_gs_visitor::emit_prolog();
  42
  43    /* Gen6 geometry shaders require to allocate an initial VUE handle via
  44     * FF_SYNC message, however the documentation remarks that only one thread
  45     * can write to the URB simultaneously and the FF_SYNC message provides the
  46     * synchronization mechanism for this, so using this message effectively
  47     * stalls the thread until it is its turn to write to the URB. Because of
  48     * this, the best way to implement geometry shader algorithms in gen6 is to
  49     * execute the algorithm before the FF_SYNC message to maximize parallelism.
  50     *
  51     * To achieve this we buffer the geometry shader outputs for each emitted
  52     * vertex in vertex_output during operation. Then, when we have processed
  53     * the last vertex (that is, at thread end time), we send the FF_SYNC
  54     * message to allocate the initial VUE handle and write all buffered vertex
  55     * data to the URB in one go.
  56     *
  57     * For each emitted vertex, vertex_output will hold vue_map.num_slots
  58     * data items plus one additional item to hold required flags
  59     * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
  60     * which come right after the data items for that vertex. Vertex data and
  61     * flags for the next vertex come right after the data items and flags for
  62     * the previous vertex.
  63     */
  64    this->current_annotation = "gen6 prolog";
  65    this->vertex_output = src_reg(this,
  66                                  glsl_type::uint_type,
  67                                  (prog_data->vue_map.num_slots + 1) *
  68                                  c->gp->program.VerticesOut);
  69    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
  70    emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
  71
  72    /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
  73     * so initialize it once to R0.
  74     */
  75    vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
  76                                      retype(brw_vec8_grf(0, 0),
  77                                             BRW_REGISTER_TYPE_UD)));
  78    inst->force_writemask_all = true;
  79
  80    /* This will be used as a temporary to store writeback data of FF_SYNC
  81     * and URB_WRITE messages.
  82     */
  83    this->temp = src_reg(this, glsl_type::uint_type);
  84
  85    /* This will be used to know when we are processing the first vertex of
  86     * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
  87     * that we are processing the first vertex in the primitive and to zero
  88     * otherwise. This way we can use its value directly in the URB write
  89     * headers.
  90     */
  91    this->first_vertex = src_reg(this, glsl_type::uint_type);
  92    emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
  93
  94    /* The FF_SYNC message requires to know the number of primitives generated,
  95     * so keep a counter for this.
  96     */
  97    this->prim_count = src_reg(this, glsl_type::uint_type);
  98    emit(MOV(dst_reg(this->prim_count), 0u));
  99
 100    if (c->prog_data.gen6_xfb_enabled) {
 101       const struct gl_transform_feedback_info *linked_xfb_info =
 102          &this->shader_prog->LinkedTransformFeedback;
 103
 104       /* Gen6 geometry shaders are required to ask for Streamed Vertex Buffer
 105        * Indices values via FF_SYNC message, when Transform Feedback is
 106        * enabled.
 107        *
 108        * To achieve this we buffer the Transform feedback outputs for each
 109        * emitted vertex in xfb_output during operation. Then, when we have
 110        * processed the last vertex (that is, at thread end time), we know all
 111        * the required data for the FF_SYNC message header in order to receive
 112        * the SVBI in the writeback.
 113        *
 114        * For each emitted vertex, xfb_output will hold
 115        * num_transform_feedback_bindings data items plus one, which will
 116        * indicate the end of the primitive. Next vertex's data comes right
 117        * after.
 118        */
 119       this->xfb_output = src_reg(this,
 120                                  glsl_type::uint_type,
 121                                  linked_xfb_info->NumOutputs *
 122                                  c->gp->program.VerticesOut);
 123       this->xfb_output_offset = src_reg(this, glsl_type::uint_type);
 124       emit(MOV(dst_reg(this->xfb_output_offset), src_reg(0u)));
 125       /* Create a virtual register to hold destination indices in SOL */
 126       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
 127       /* Create a virtual register to hold number of written primitives */
 128       this->sol_prim_written = src_reg(this, glsl_type::uint_type);
 129       /* Create a virtual register to hold Streamed Vertex Buffer Indices */
 130       this->svbi = src_reg(this, glsl_type::uvec4_type);
 131       /* Create a virtual register to hold max values of SVBI */
 132       this->max_svbi = src_reg(this, glsl_type::uvec4_type);
 133       emit(MOV(dst_reg(this->max_svbi),
 134                src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
 135    }
 136
 137    /* PrimitveID is delivered in r0.1 of the thread payload. If the program
 138     * needs it we have to move it to a separate register where we can map
 139     * the atttribute.
 140     *
 141     * Notice that we cannot use a virtual register for this, because we need to
 142     * map all input attributes to hardware registers in setup_payload(),
 143     * which happens before virtual registers are mapped to hardware registers.
 144     * We could work around that issue if we were able to compute the first
 145     * non-payload register here and move the PrimitiveID information to that
 146     * register, but we can't because at this point we don't know the final
 147     * number uniforms that will be included in the payload.
 148     *
 149     * So, what we do is to place PrimitiveID information in r1, which is always
 150     * delivered as part of the payload, but its only populated with data
 151     * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
 152     * in the 3DSTATE_GS state packet. That information can be obtained by other
 153     * means though, so we can safely use r1 for this purpose.
 154     */
 155    if (c->prog_data.include_primitive_id) {
 156       this->primitive_id =
 157          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 158       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
 159    }
 160 }
 161
 162 void
 163 gen6_gs_visitor::visit(ir_emit_vertex *)
 164 {
 165    this->current_annotation = "gen6 emit vertex";
 166    /* Honor max_vertex layout indication in geometry shader by ignoring any
 167     * vertices coming after c->gp->program.VerticesOut.
 168     */
 169    unsigned num_output_vertices = c->gp->program.VerticesOut;
 170    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
 171             BRW_CONDITIONAL_L));
 172    emit(IF(BRW_PREDICATE_NORMAL));
 173    {
 174       if (c->prog_data.gen6_xfb_enabled)
 175          xfb_buffer_output();
 176
 177       /* Buffer all output slots for this vertex in vertex_output */
 178       for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
 179          int varying = prog_data->vue_map.slot_to_varying[slot];
 180          if (varying != VARYING_SLOT_PSIZ) {
 181             dst_reg dst(this->vertex_output);
 182             dst.reladdr = ralloc(mem_ctx, src_reg);
 183             memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 184             emit_urb_slot(dst, varying);
 185          } else {
 186             /* The PSIZ slot can pack multiple varyings in different channels
 187              * and emit_urb_slot() will produce a MOV instruction for each of
 188              * them. Since we are writing to an array, that will translate to
 189              * possibly multiple MOV instructions with an array destination and
 190              * each will generate a scratch write with the same offset into
 191              * scratch space (thus, each one overwriting the previous). This is
 192              * not what we want. What we will do instead is emit PSIZ to a
 193              * a regular temporary register, then move that resgister into the
 194              * array. This way we only have one instruction with an array
 195              * destination and we only produce a single scratch write.
 196              */
 197             dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
 198             emit_urb_slot(tmp, varying);
 199             dst_reg dst(this->vertex_output);
 200             dst.reladdr = ralloc(mem_ctx, src_reg);
 201             memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 202             vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
 203             inst->force_writemask_all = true;
 204          }
 205
 206          emit(ADD(dst_reg(this->vertex_output_offset),
 207                   this->vertex_output_offset, 1u));
 208       }
 209
 210       /* Now buffer flags for this vertex */
 211       dst_reg dst(this->vertex_output);
 212       dst.reladdr = ralloc(mem_ctx, src_reg);
 213       memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 214       if (c->gp->program.OutputType == GL_POINTS) {
 215          /* If we are outputting points, then every vertex has PrimStart and
 216           * PrimEnd set.
 217           */
 218          emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
 219                   URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
 220          emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 221       } else {
 222          /* Otherwise, we can only set the PrimStart flag, which we have stored
 223           * in the first_vertex register. We will have to wait until we execute
 224           * EndPrimitive() or we end the thread to set the PrimEnd flag on a
 225           * vertex.
 226           */
 227          emit(OR(dst, this->first_vertex,
 228                  (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
 229          emit(MOV(dst_reg(this->first_vertex), 0u));
 230       }
 231       emit(ADD(dst_reg(this->vertex_output_offset),
 232                this->vertex_output_offset, 1u));
 233
 234       /* Update vertex count */
 235       emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
 236    }
 237    emit(BRW_OPCODE_ENDIF);
 238 }
 239
 240 void
 241 gen6_gs_visitor::visit(ir_end_primitive *)
 242 {
 243    this->current_annotation = "gen6 end primitive";
 244    /* Calling EndPrimitive() is optional for point output. In this case we set
 245     * the PrimEnd flag when we process EmitVertex().
 246     */
 247    if (c->gp->program.OutputType == GL_POINTS)
 248       return;
 249
 250    /* Otherwise we know that the last vertex we have processed was the last
 251     * vertex in the primitive and we need to set its PrimEnd flag, so do this
 252     * unless we haven't emitted that vertex at all (vertex_count != 0).
 253     *
 254     * Notice that we have already incremented vertex_count when we processed
 255     * the last emit_vertex, so we need to take that into account in the
 256     * comparison below (hence the num_output_vertices + 1 in the comparison
 257     * below).
 258     */
 259    unsigned num_output_vertices = c->gp->program.VerticesOut;
 260    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
 261             BRW_CONDITIONAL_L));
 262    vec4_instruction *inst = emit(CMP(dst_null_d(),
 263                                      this->vertex_count, 0u,
 264                                      BRW_CONDITIONAL_NEQ));
 265    inst->predicate = BRW_PREDICATE_NORMAL;
 266    emit(IF(BRW_PREDICATE_NORMAL));
 267    {
 268       /* vertex_output_offset is already pointing at the first entry of the
 269        * next vertex. So subtract 1 to modify the flags for the previous
 270        * vertex.
 271        */
 272       src_reg offset(this, glsl_type::uint_type);
 273       emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
 274
 275       src_reg dst(this->vertex_output);
 276       dst.reladdr = ralloc(mem_ctx, src_reg);
 277       memcpy(dst.reladdr, &offset, sizeof(src_reg));
 278
 279       emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
 280       emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 281
 282       /* Set the first vertex flag to indicate that the next vertex will start
 283        * a primitive.
 284        */
 285       emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
 286    }
 287    emit(BRW_OPCODE_ENDIF);
 288 }
 289
 290 void
 291 gen6_gs_visitor::emit_urb_write_header(int mrf)
 292 {
 293    this->current_annotation = "gen6 urb header";
 294    /* Compute offset of the flags for the current vertex in vertex_output and
 295     * write them in dw2 of the message header.
 296     *
 297     * Notice that by the time that emit_thread_end() calls here
 298     * vertex_output_offset should point to the first data item of the current
 299     * vertex in vertex_output, thus we only need to add the number of output
 300     * slots per vertex to that offset to obtain the flags data offset.
 301     */
 302    src_reg flags_offset(this, glsl_type::uint_type);
 303    emit(ADD(dst_reg(flags_offset),
 304             this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
 305
 306    src_reg flags_data(this->vertex_output);
 307    flags_data.reladdr = ralloc(mem_ctx, src_reg);
 308    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
 309
 310    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
 311 }
 312
 313 void
 314 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
 315                                        int last_mrf, int urb_offset)
 316 {
 317    vec4_instruction *inst = NULL;
 318
 319    if (!complete) {
 320       /* If the vertex is not complete we don't have to do anything special */
 321       inst = emit(GS_OPCODE_URB_WRITE);
 322       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
 323    } else {
 324       /* Otherwise we always request to allocate a new VUE handle. If this is
 325        * the last write before the EOT message and the new handle never gets
 326        * used it will be dereferenced when we send the EOT message. This is
 327        * necessary to avoid different setups for the EOT message (one for the
 328        * case when there is no output and another for the case when there is)
 329        * which would require to end the program with an IF/ELSE/ENDIF block,
 330        * something we do not want.
 331        */
 332       inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
 333       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
 334       inst->dst = dst_reg(MRF, base_mrf);
 335       inst->src[0] = this->temp;
 336    }
 337
 338    inst->base_mrf = base_mrf;
 339    /* URB data written (does not include the message header reg) must
 340     * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
 341     * section 5.4.3.2.2: URB_INTERLEAVED.
 342     */
 343    int mlen = last_mrf - base_mrf;
 344    if ((mlen % 2) != 1)
 345       mlen++;
 346    inst->mlen = mlen;
 347    inst->offset = urb_offset;
 348 }
 349
 350 void
 351 gen6_gs_visitor::emit_thread_end()
 352 {
 353    /* Make sure the current primitive is ended: we know it is not ended when
 354     * first_vertex is not zero. This is only relevant for outputs other than
 355     * points because in the point case we set PrimEnd on all vertices.
 356     */
 357    if (c->gp->program.OutputType != GL_POINTS) {
 358       emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
 359       emit(IF(BRW_PREDICATE_NORMAL));
 360       {
 361          visit((ir_end_primitive *) NULL);
 362       }
 363       emit(BRW_OPCODE_ENDIF);
 364    }
 365
 366    /* Here we have to:
 367     * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
 368     * 2) Loop over all buffered vertex data and write it to corresponding
 369     *    URB entries.
 370     * 3) Allocate new VUE handles for all vertices other than the first.
 371     * 4) Send a final EOT message.
 372     */
 373
 374    /* MRF 0 is reserved for the debugger, so start with message header
 375     * in MRF 1.
 376     */
 377    int base_mrf = 1;
 378
 379    /* In the process of generating our URB write message contents, we
 380     * may need to unspill a register or load from an array.  Those
 381     * reads would use MRFs 14-15.
 382     */
 383    int max_usable_mrf = 13;
 384
 385    /* Issue the FF_SYNC message and obtain the initial VUE handle. */
 386    emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
 387    emit(IF(BRW_PREDICATE_NORMAL));
 388    {
 389       this->current_annotation = "gen6 thread end: ff_sync";
 390
 391       vec4_instruction *inst;
 392       if (c->prog_data.gen6_xfb_enabled) {
 393          src_reg sol_temp(this, glsl_type::uvec4_type);
 394          emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
 395               dst_reg(this->svbi),
 396               this->vertex_count,
 397               this->prim_count,
 398               sol_temp);
 399          inst = emit(GS_OPCODE_FF_SYNC,
 400                      dst_reg(this->temp), this->prim_count, this->svbi);
 401       } else {
 402          inst = emit(GS_OPCODE_FF_SYNC,
 403                      dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
 404       }
 405       inst->base_mrf = base_mrf;
 406
 407       /* Loop over all buffered vertices and emit URB write messages */
 408       this->current_annotation = "gen6 thread end: urb writes init";
 409       src_reg vertex(this, glsl_type::uint_type);
 410       emit(MOV(dst_reg(vertex), 0u));
 411       emit(MOV(dst_reg(this->vertex_output_offset), 0u));
 412
 413       this->current_annotation = "gen6 thread end: urb writes";
 414       emit(BRW_OPCODE_DO);
 415       {
 416          emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
 417          inst = emit(BRW_OPCODE_BREAK);
 418          inst->predicate = BRW_PREDICATE_NORMAL;
 419
 420          /* First we prepare the message header */
 421          emit_urb_write_header(base_mrf);
 422
 423          /* Then add vertex data to the message in interleaved fashion */
 424          int slot = 0;
 425          bool complete = false;
 426          do {
 427             int mrf = base_mrf + 1;
 428
 429             /* URB offset is in URB row increments, and each of our MRFs is half
 430              * of one of those, since we're doing interleaved writes.
 431              */
 432             int urb_offset = slot / 2;
 433
 434             for (; slot < prog_data->vue_map.num_slots; ++slot) {
 435                int varying = prog_data->vue_map.slot_to_varying[slot];
 436                current_annotation = output_reg_annotation[varying];
 437
 438                /* Compute offset of this slot for the current vertex
 439                 * in vertex_output
 440                 */
 441                src_reg data(this->vertex_output);
 442                data.reladdr = ralloc(mem_ctx, src_reg);
 443                memcpy(data.reladdr, &this->vertex_output_offset,
 444                       sizeof(src_reg));
 445
 446                /* Copy this slot to the appropriate message register */
 447                dst_reg reg = dst_reg(MRF, mrf);
 448                reg.type = output_reg[varying].type;
 449                data.type = reg.type;
 450                vec4_instruction *inst = emit(MOV(reg, data));
 451                inst->force_writemask_all = true;
 452
 453                mrf++;
 454                emit(ADD(dst_reg(this->vertex_output_offset),
 455                         this->vertex_output_offset, 1u));
 456
 457                /* If this was max_usable_mrf, we can't fit anything more into
 458                 * this URB WRITE.
 459                 */
 460                if (mrf > max_usable_mrf) {
 461                   slot++;
 462                   break;
 463                }
 464             }
 465
 466             complete = slot >= prog_data->vue_map.num_slots;
 467             emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
 468          } while (!complete);
 469
 470          /* Skip over the flags data item so that vertex_output_offset points
 471           * to the first data item of the next vertex, so that we can start
 472           * writing the next vertex.
 473           */
 474          emit(ADD(dst_reg(this->vertex_output_offset),
 475                   this->vertex_output_offset, 1u));
 476
 477          emit(ADD(dst_reg(vertex), vertex, 1u));
 478       }
 479       emit(BRW_OPCODE_WHILE);
 480
 481       if (c->prog_data.gen6_xfb_enabled)
 482          xfb_write();
 483    }
 484    emit(BRW_OPCODE_ENDIF);
 485
 486    /* Finally, emit EOT message.
 487     *
 488     * In gen6 we need to end the thread differently depending on whether we have
 489     * emitted at least one vertex or not. In case we did, the EOT message must
 490     * always include the COMPLETE flag or else the GPU hangs. If we have not
 491     * produced any output we can't use the COMPLETE flag.
 492     *
 493     * However, this would lead us to end the program with an ENDIF opcode,
 494     * which we want to avoid, so what we do is that we always request a new
 495     * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
 496     * With this we make sure that whether we have emitted at least one vertex
 497     * or none at all, we have to finish the thread without writing to the URB,
 498     * which works for both cases by setting the COMPLETE and UNUSED flags in
 499     * the EOT message.
 500     */
 501    this->current_annotation = "gen6 thread end: EOT";
 502
 503    if (c->prog_data.gen6_xfb_enabled) {
 504       /* When emitting EOT, set SONumPrimsWritten Increment Value. */
 505       src_reg data(this, glsl_type::uint_type);
 506       emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
 507       emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
 508       emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
 509    }
 510
 511    vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
 512    inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
 513    inst->base_mrf = base_mrf;
 514    inst->mlen = 1;
 515 }
 516
 517 void
 518 gen6_gs_visitor::setup_payload()
 519 {
 520    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
 521
 522    /* Attributes are going to be interleaved, so one register contains two
 523     * attribute slots.
 524     */
 525    int attributes_per_reg = 2;
 526
 527    /* If a geometry shader tries to read from an input that wasn't written by
 528     * the vertex shader, that produces undefined results, but it shouldn't
 529     * crash anything.  So initialize attribute_map to zeros--that ensures that
 530     * these undefined results are read from r0.
 531     */
 532    memset(attribute_map, 0, sizeof(attribute_map));
 533
 534    int reg = 0;
 535
 536    /* The payload always contains important data in r0. */
 537    reg++;
 538
 539    /* r1 is always part of the payload and it holds information relevant
 540     * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
 541     * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
 542     * information (and move the original value to a virtual register if
 543     * necessary).
 544     */
 545    if (c->prog_data.include_primitive_id)
 546       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
 547    reg++;
 548
 549    reg = setup_uniforms(reg);
 550
 551    reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
 552
 553    lower_attributes_to_hw_regs(attribute_map, true);
 554
 555    this->first_non_payload_grf = reg;
 556 }
 557
 558 void
 559 gen6_gs_visitor::xfb_buffer_output()
 560 {
 561    static const unsigned swizzle_for_offset[4] = {
 562       BRW_SWIZZLE4(0, 1, 2, 3),
 563       BRW_SWIZZLE4(1, 2, 3, 3),
 564       BRW_SWIZZLE4(2, 3, 3, 3),
 565       BRW_SWIZZLE4(3, 3, 3, 3)
 566    };
 567
 568    struct brw_gs_prog_data *prog_data =
 569       (struct brw_gs_prog_data *) &c->prog_data;
 570
 571    if (!prog_data->num_transform_feedback_bindings) {
 572       const struct gl_transform_feedback_info *linked_xfb_info =
 573          &this->shader_prog->LinkedTransformFeedback;
 574       int i;
 575
 576       /* Make sure that the VUE slots won't overflow the unsigned chars in
 577        * prog_data->transform_feedback_bindings[].
 578        */
 579       STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
 580
 581       /* Make sure that we don't need more binding table entries than we've
 582        * set aside for use in transform feedback.  (We shouldn't, since we
 583        * set aside enough binding table entries to have one per component).
 584        */
 585       assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
 586
 587       prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
 588       for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
 589          prog_data->transform_feedback_bindings[i] =
 590             linked_xfb_info->Outputs[i].OutputRegister;
 591          prog_data->transform_feedback_swizzles[i] =
 592             swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
 593       }
 594    }
 595
 596    /* Buffer all TF outputs for this vertex in xfb_output */
 597    for (int binding = 0; binding < prog_data->num_transform_feedback_bindings;
 598         binding++) {
 599       unsigned varying =
 600          prog_data->transform_feedback_bindings[binding];
 601       dst_reg dst(this->xfb_output);
 602       dst.reladdr = ralloc(mem_ctx, src_reg);
 603       memcpy(dst.reladdr, &this->xfb_output_offset, sizeof(src_reg));
 604       dst.type = output_reg[varying].type;
 605
 606       this->current_annotation = output_reg_annotation[varying];
 607       src_reg out_reg = src_reg(output_reg[varying]);
 608       out_reg.swizzle = varying == VARYING_SLOT_PSIZ
 609          ? BRW_SWIZZLE_WWWW : prog_data->transform_feedback_swizzles[binding];
 610       emit(MOV(dst, out_reg));
 611
 612       emit(ADD(dst_reg(this->xfb_output_offset), this->xfb_output_offset, 1u));
 613    }
 614 }
 615
 616 void
 617 gen6_gs_visitor::xfb_write()
 618 {
 619    unsigned num_verts;
 620    struct brw_gs_prog_data *prog_data =
 621       (struct brw_gs_prog_data *) &c->prog_data;
 622
 623    if (!prog_data->num_transform_feedback_bindings)
 624       return;
 625
 626    switch (c->prog_data.output_topology) {
 627    case _3DPRIM_POINTLIST:
 628       num_verts = 1;
 629       break;
 630    case _3DPRIM_LINELIST:
 631    case _3DPRIM_LINESTRIP:
 632    case _3DPRIM_LINELOOP:
 633       num_verts = 2;
 634       break;
 635    case _3DPRIM_TRILIST:
 636    case _3DPRIM_TRIFAN:
 637    case _3DPRIM_TRISTRIP:
 638    case _3DPRIM_RECTLIST:
 639       num_verts = 3;
 640       break;
 641    case _3DPRIM_QUADLIST:
 642    case _3DPRIM_QUADSTRIP:
 643    case _3DPRIM_POLYGON:
 644       num_verts = 3;
 645       break;
 646    default:
 647       unreachable("Unexpected primitive type in Gen6 SOL program.");
 648    }
 649
 650    this->current_annotation = "gen6 thread end: svb writes init";
 651
 652    emit(MOV(dst_reg(this->xfb_output_offset), 0u));
 653    emit(MOV(dst_reg(this->sol_prim_written), 0u));
 654
 655    /* Check that at least one primitive can be written
 656     *
 657     * Note: since we use the binding table to keep track of buffer offsets
 658     * and stride, the GS doesn't need to keep track of a separate pointer
 659     * into each buffer; it uses a single pointer which increments by 1 for
 660     * each vertex.  So we use SVBI0 for this pointer, regardless of whether
 661     * transform feedback is in interleaved or separate attribs mode.
 662     */
 663    src_reg sol_temp(this, glsl_type::uvec4_type);
 664    emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
 665
 666    /* Compare SVBI calculated number with the maximum value, which is
 667     * in R1.4 (previously saved in this->max_svbi) for gen6.
 668     */
 669    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
 670    emit(IF(BRW_PREDICATE_NORMAL));
 671    {
 672       struct src_reg destination_indices_uw =
 673          retype(destination_indices, BRW_REGISTER_TYPE_UW);
 674
 675       vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
 676                                         brw_imm_v(0x00020100))); /* (0, 1, 2) */
 677       inst->force_writemask_all = true;
 678
 679       emit(ADD(dst_reg(this->destination_indices),
 680                this->destination_indices,
 681                this->svbi));
 682    }
 683    emit(BRW_OPCODE_ENDIF);
 684
 685    this->current_vertex = 0;
 686    /* Make sure we do not emit more transform feedback data than the amount
 687     * we have buffered.
 688     */
 689    for (int i = 0; i < c->gp->program.VerticesOut; i++) {
 690       emit(MOV(dst_reg(sol_temp), i));
 691       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
 692                BRW_CONDITIONAL_L));
 693       emit(IF(BRW_PREDICATE_NORMAL));
 694       {
 695          xfb_program(num_verts);
 696       }
 697       emit(BRW_OPCODE_ENDIF);
 698    }
 699 }
 700
 701 void
 702 gen6_gs_visitor::xfb_program(unsigned num_verts)
 703 {
 704    struct brw_gs_prog_data *prog_data =
 705       (struct brw_gs_prog_data *) &c->prog_data;
 706    unsigned binding;
 707    unsigned num_bindings = prog_data->num_transform_feedback_bindings;
 708    src_reg sol_temp(this, glsl_type::uvec4_type);
 709
 710    /* Check if we can write one primitive more */
 711    emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
 712    emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
 713    emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
 714    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
 715    emit(IF(BRW_PREDICATE_NORMAL));
 716    {
 717       if (this->current_vertex >= num_verts)
 718          this->current_vertex = 0;
 719
 720       /* Avoid overwriting MRF 1 as it is used as URB write message header */
 721       dst_reg mrf_reg(MRF, 2);
 722
 723       this->current_annotation = "gen6: emit SOL vertex data";
 724       /* For each vertex, generate code to output each varying using the
 725        * appropriate binding table entry.
 726        */
 727       for (binding = 0; binding < num_bindings; ++binding) {
 728          /* Set up the correct destination index for this vertex */
 729          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
 730                                        mrf_reg,
 731                                        this->destination_indices);
 732          inst->sol_vertex = this->current_vertex;
 733
 734          unsigned char varying =
 735             prog_data->transform_feedback_bindings[binding];
 736
 737          /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
 738           *
 739           *   "Prior to End of Thread with a URB_WRITE, the kernel must
 740           *   ensure that all writes are complete by sending the final
 741           *   write as a committed write."
 742           */
 743          bool final_write = binding == (unsigned) num_bindings - 1 &&
 744                             this->current_vertex == num_verts - 1;
 745
 746          /* Compute offset of this varying for the current vertex
 747           * in xfb_output
 748           */
 749          src_reg data(this->xfb_output);
 750          data.reladdr = ralloc(mem_ctx, src_reg);
 751          memcpy(data.reladdr, &this->xfb_output_offset, sizeof(src_reg));
 752          src_reg out_reg;
 753          this->current_annotation = output_reg_annotation[varying];
 754
 755          /* Copy this varying to the appropriate message register */
 756          out_reg = src_reg(this, glsl_type::uvec4_type);
 757          out_reg.type = output_reg[varying].type;
 758
 759          data.type = output_reg[varying].type;
 760          emit(MOV(dst_reg(out_reg), data));
 761
 762          /* Write data and send SVB Write */
 763          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, out_reg, sol_temp);
 764          inst->sol_binding = binding;
 765          inst->sol_final_write = final_write;
 766
 767          emit(ADD(dst_reg(this->xfb_output_offset),
 768                   this->xfb_output_offset, 1u));
 769
 770          if (final_write) {
 771             /* This is the last vertex of the primitive, then increment
 772              * SO num primitive counter and destination indices.
 773              */
 774             emit(ADD(dst_reg(this->destination_indices),
 775                      this->destination_indices,
 776                      brw_imm_ud(num_verts)));
 777             emit(ADD(dst_reg(this->sol_prim_written),
 778                      this->sol_prim_written, 1u));
 779          }
 780
 781       }
 782       this->current_vertex++;
 783       this->current_annotation = NULL;
 784    }
 785    emit(BRW_OPCODE_ENDIF);
 786 }
 787
 788 } /* namespace brw */