/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * This code is based on original work by Ilia Mirkin.
 */

/**
 * \file gen6_gs_visitor.cpp
 *
 * Gen6 geometry shader implementation
 */

#include "gen6_gs_visitor.h"

namespace brw {

void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders must allocate an initial VUE handle via the
    * FF_SYNC message. However, the documentation remarks that only one thread
    * can write to the URB at a time, and the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold the required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message),
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
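   /* For illustration, with vue_map.num_slots == N the buffer layout is:
    *
    *    vertex 0: data item 0 .. data item N-1, flags   (offsets 0 .. N)
    *    vertex 1: data item 0 .. data item N-1, flags   (offsets N+1 .. 2N+1)
    *    ...
    *
    * so vertex i's slot s lives at offset i * (N + 1) + s, which is the
    * stride used by get_vertex_output_offset_for_varying() below.
    */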
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 nir->info.gs.vertices_out);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));

   /* The FF_SYNC message requires the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), 0u));

   if (gs_prog_data->gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold the number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold the maximum values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is place the PrimitiveID information in r1, which is
    * always delivered as part of the payload, but it is only populated with
    * data relevant for transform feedback when we set
    * GEN6_GS_SVBI_PAYLOAD_ENABLE in the 3DSTATE_GS state packet. That
    * information can be obtained by other means though, so we can safely use
    * r1 for this purpose.
    */
   if (gs_prog_data->include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}

void
gen6_gs_visitor::gs_emit_vertex(int stream_id)
{
   this->current_annotation = "gen6 emit vertex";

   /* Buffer all output slots for this vertex in vertex_output */
   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
      int varying = prog_data->vue_map.slot_to_varying[slot];
      if (varying != VARYING_SLOT_PSIZ) {
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         emit_urb_slot(dst, varying);
      } else {
         /* The PSIZ slot can pack multiple varyings in different channels
          * and emit_urb_slot() will produce a MOV instruction for each of
          * them. Since we are writing to an array, that will translate to
          * possibly multiple MOV instructions with an array destination and
          * each will generate a scratch write with the same offset into
          * scratch space (thus, each one overwriting the previous). This is
          * not what we want. What we will do instead is emit PSIZ to a
          * regular temporary register, then move that register into the
          * array. This way we only have one instruction with an array
          * destination and we only produce a single scratch write.
          */
         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
         emit_urb_slot(tmp, varying);
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
         inst->force_writemask_all = true;
      }

      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));
   }

   /* Now buffer flags for this vertex */
   dst_reg dst(this->vertex_output);
   dst.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
   if (nir->info.gs.output_primitive == GL_POINTS) {
      /* If we are outputting points, then every vertex has PrimStart and
       * PrimEnd set.
       */
      emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
   } else {
      /* Otherwise, we can only set the PrimStart flag, which we have stored
       * in the first_vertex register. We will have to wait until we execute
       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
       * vertex.
       */
      emit(OR(dst, this->first_vertex,
              (gs_prog_data->output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
      emit(MOV(dst_reg(this->first_vertex), 0u));
   }
   emit(ADD(dst_reg(this->vertex_output_offset),
            this->vertex_output_offset, 1u));
}

void
gen6_gs_visitor::gs_end_primitive()
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
   if (nir->info.gs.output_primitive == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive, and we need to set its PrimEnd flag, so do this
    * unless we haven't emitted any vertices at all (i.e. only when
    * vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1).
    */
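   /* Together, the two predicated CMPs below implement the combined condition
    * 0 < vertex_count <= num_output_vertices, so the IF block only runs when
    * at least one vertex has actually been emitted.
    */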
   unsigned num_output_vertices = nir->info.gs.vertices_out;
   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
            BRW_CONDITIONAL_L));
   vec4_instruction *inst = emit(CMP(dst_null_d(),
                                     this->vertex_count, 0u,
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));

      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));

      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));

      /* Set the first vertex flag to indicate that the next vertex will start
       * a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
   }
   emit(BRW_OPCODE_ENDIF);
}

void
gen6_gs_visitor::emit_urb_write_header(int mrf)
{
   this->current_annotation = "gen6 urb header";
   /* Compute the offset of the flags for the current vertex in vertex_output
    * and write them in dw2 of the message header.
    *
    * Notice that by the time emit_thread_end() calls this function,
    * vertex_output_offset should point to the first data item of the current
    * vertex in vertex_output, thus we only need to add the number of output
    * slots per vertex to that offset to obtain the flags data offset.
    */
   src_reg flags_offset(this, glsl_type::uint_type);
   emit(ADD(dst_reg(flags_offset),
            this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));

   src_reg flags_data(this->vertex_output);
   flags_data.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));

   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
}

static int
align_interleaved_urb_mlen(int mlen)
{
   /* URB data written (does not include the message header reg) must
    * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
    * section 5.4.3.2.2: URB_INTERLEAVED.
    */
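   /* mlen counts the message header register too, so the data payload is
    * mlen - 1 registers and must therefore be even, i.e. mlen must be odd.
    * For example, a header plus 3 data registers (mlen == 4) is padded to
    * mlen == 5.
    */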
   if ((mlen % 2) != 1)
      mlen++;
   return mlen;
}

void
gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
                                       int last_mrf, int urb_offset)
{
   vec4_instruction *inst = NULL;

   if (!complete) {
      /* If the vertex is not complete we don't have to do anything special */
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   } else {
      /* Otherwise we always request the allocation of a new VUE handle. If
       * this is the last write before the EOT message and the new handle
       * never gets used it will be dereferenced when we send the EOT message.
       * This is necessary to avoid different setups for the EOT message (one
       * for the case when there is no output and another for the case when
       * there is), which would require ending the program with an
       * IF/ELSE/ENDIF block, something we do not want.
       */
      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
      inst->dst = dst_reg(MRF, base_mrf);
      inst->src[0] = this->temp;
   }

   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
   inst->offset = urb_offset;
}

void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (nir->info.gs.output_primitive != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      gs_end_primitive();
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to the corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with the message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 21..23.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (gs_prog_data->gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is
             * half of one of those, since we're doing interleaved writes.
             */
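            /* For example, slots 0 and 1 go to URB row 0, slots 2 and 3 to
             * row 1, and so on.
             */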
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE. Same if we reached the max. message length.
                */
               if (mrf > max_usable_mrf ||
                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (gs_prog_data->gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit the EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not. In case we did, the EOT message
    * must always include the COMPLETE flag or else the GPU hangs. If we have
    * not produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid. So instead we always request a new VUE handle
    * every time we do a URB WRITE, even for the last vertex we emit. With
    * this we make sure that whether we have emitted at least one vertex or
    * none at all, we always finish the thread without writing to the URB,
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (gs_prog_data->gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}

void
gen6_gs_visitor::setup_payload()
{
   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];

   /* Attributes are going to be interleaved, so one register contains two
    * attribute slots.
    */
   int attributes_per_reg = 2;

   /* If a geometry shader tries to read from an input that wasn't written by
    * the vertex shader, that produces undefined results, but it shouldn't
    * crash anything. So initialize attribute_map to zeros--that ensures that
    * these undefined results are read from r0.
    */
   memset(attribute_map, 0, sizeof(attribute_map));

   int reg = 0;

   /* The payload always contains important data in r0. */
   reg++;

   /* r1 is always part of the payload and it holds information relevant
    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
    * information (and move the original value to a virtual register if
    * necessary).
    */
   if (gs_prog_data->include_primitive_id)
      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);

   lower_attributes_to_hw_regs(attribute_map, true);

   this->first_non_payload_grf = reg;
}

void
gen6_gs_visitor::xfb_setup()
{
   static const unsigned swizzle_for_offset[4] = {
      BRW_SWIZZLE4(0, 1, 2, 3),
      BRW_SWIZZLE4(1, 2, 3, 3),
      BRW_SWIZZLE4(2, 3, 3, 3),
      BRW_SWIZZLE4(3, 3, 3, 3)
   };
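   /* swizzle_for_offset[i] starts reading at component i and replicates the
    * last component to fill the remaining channels, e.g. a ComponentOffset of
    * 2 maps to the swizzle (z, w, w, w).
    */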

   const struct gl_transform_feedback_info *linked_xfb_info =
      &this->shader_prog->LinkedTransformFeedback;
   int i;

   /* Make sure that the VUE slots won't overflow the unsigned chars in
    * prog_data->transform_feedback_bindings[].
    */
   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);

   /* Make sure that we don't need more binding table entries than we've
    * set aside for use in transform feedback. (We shouldn't, since we
    * set aside enough binding table entries to have one per component).
    */
   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);

   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
      gs_prog_data->transform_feedback_bindings[i] =
         linked_xfb_info->Outputs[i].OutputRegister;
      gs_prog_data->transform_feedback_swizzles[i] =
         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
   }
}

void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;

   if (!gs_prog_data->num_transform_feedback_bindings)
      return;

   switch (gs_prog_data->output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex. So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare the calculated SVBI value with the maximum value, which is
    * stored in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}

void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   unsigned binding;
   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it.
    */
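   /* That is, only emit the writes when
    * svbi + (sol_prim_written + 1) * num_verts <= max_svbi.
    */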
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            gs_prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *    ensure that all writes are complete by sending the final
          *    write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute the offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* If this is the last vertex of the primitive, then increment
             * the SO primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }

      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}

int
gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
{
   /* Find the output slot assigned to this varying.
    *
    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
    * as VARYING_SLOT_PSIZ.
    */
   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
      varying = VARYING_SLOT_PSIZ;
   int slot = prog_data->vue_map.varying_to_slot[varying];

   if (slot < 0) {
      /* This varying does not exist in the VUE so we are not writing to it
       * and its value is undefined. We still want to return a valid offset
       * into vertex_output though, to prevent any out-of-bounds accesses into
       * the vertex_output array. Since the value for this varying is undefined
       * we don't really care which value we assign to it, so any offset
       * within the limits of vertex_output will do.
       */
      slot = 0;
   }

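   /* Each vertex occupies num_slots data items plus one flags dword (see
    * emit_prolog()), hence the stride of num_slots + 1 below.
    */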
   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
}

} /* namespace brw */