2 * Copyright © 2014 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * This code is based on original work by Ilia Mirkin.
27 * \file gen6_gs_visitor.cpp
29 * Gen6 geometry shader implementation
32 #include "gen6_gs_visitor.h"
34 const unsigned MAX_GS_INPUT_VERTICES
= 6;
39 gen6_gs_visitor::assign_binding_table_offsets()
41 /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
44 assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS
);
48 gen6_gs_visitor::emit_prolog()
50 vec4_gs_visitor::emit_prolog();
52 /* Gen6 geometry shaders require to allocate an initial VUE handle via
53 * FF_SYNC message, however the documentation remarks that only one thread
54 * can write to the URB simultaneously and the FF_SYNC message provides the
55 * synchronization mechanism for this, so using this message effectively
56 * stalls the thread until it is its turn to write to the URB. Because of
57 * this, the best way to implement geometry shader algorithms in gen6 is to
58 * execute the algorithm before the FF_SYNC message to maximize parallelism.
60 * To achieve this we buffer the geometry shader outputs for each emitted
61 * vertex in vertex_output during operation. Then, when we have processed
62 * the last vertex (that is, at thread end time), we send the FF_SYNC
63 * message to allocate the initial VUE handle and write all buffered vertex
64 * data to the URB in one go.
66 * For each emitted vertex, vertex_output will hold vue_map.num_slots
67 * data items plus one additional item to hold required flags
68 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
69 * which come right after the data items for that vertex. Vertex data and
70 * flags for the next vertex come right after the data items and flags for
71 * the previous vertex.
73 this->current_annotation
= "gen6 prolog";
74 this->vertex_output
= src_reg(this,
76 (prog_data
->vue_map
.num_slots
+ 1) *
77 c
->gp
->program
.VerticesOut
);
78 this->vertex_output_offset
= src_reg(this, glsl_type::uint_type
);
79 emit(MOV(dst_reg(this->vertex_output_offset
), src_reg(0u)));
81 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
82 * so initialize it once to R0.
84 vec4_instruction
*inst
= emit(MOV(dst_reg(MRF
, 1),
85 retype(brw_vec8_grf(0, 0),
86 BRW_REGISTER_TYPE_UD
)));
87 inst
->force_writemask_all
= true;
89 /* This will be used as a temporary to store writeback data of FF_SYNC
90 * and URB_WRITE messages.
92 this->temp
= src_reg(this, glsl_type::uint_type
);
94 /* This will be used to know when we are processing the first vertex of
95 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
96 * that we are processing the first vertex in the primitive and to zero
97 * otherwise. This way we can use its value directly in the URB write
100 this->first_vertex
= src_reg(this, glsl_type::uint_type
);
101 emit(MOV(dst_reg(this->first_vertex
), URB_WRITE_PRIM_START
));
103 /* The FF_SYNC message requires to know the number of primitives generated,
104 * so keep a counter for this.
106 this->prim_count
= src_reg(this, glsl_type::uint_type
);
107 emit(MOV(dst_reg(this->prim_count
), 0u));
109 if (c
->prog_data
.gen6_xfb_enabled
) {
110 /* Create a virtual register to hold destination indices in SOL */
111 this->destination_indices
= src_reg(this, glsl_type::uvec4_type
);
112 /* Create a virtual register to hold number of written primitives */
113 this->sol_prim_written
= src_reg(this, glsl_type::uint_type
);
114 /* Create a virtual register to hold Streamed Vertex Buffer Indices */
115 this->svbi
= src_reg(this, glsl_type::uvec4_type
);
116 /* Create a virtual register to hold max values of SVBI */
117 this->max_svbi
= src_reg(this, glsl_type::uvec4_type
);
118 emit(MOV(dst_reg(this->max_svbi
),
119 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD
))));
124 /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
125 * needs it we have to move it to a separate register where we can map
128 * Notice that we cannot use a virtual register for this, because we need to
129 * map all input attributes to hardware registers in setup_payload(),
130 * which happens before virtual registers are mapped to hardware registers.
131 * We could work around that issue if we were able to compute the first
132 * non-payload register here and move the PrimitiveID information to that
133 * register, but we can't because at this point we don't know the final
134 * number of uniforms that will be included in the payload.
136 * So, what we do is to place PrimitiveID information in r1, which is always
137 * delivered as part of the payload, but it's only populated with data
138 * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
139 * in the 3DSTATE_GS state packet. That information can be obtained by other
140 * means though, so we can safely use r1 for this purpose.
142 if (c
->prog_data
.include_primitive_id
) {
144 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD
));
145 emit(GS_OPCODE_SET_PRIMITIVE_ID
, dst_reg(this->primitive_id
));
150 gen6_gs_visitor::visit(ir_emit_vertex
*)
152 this->current_annotation
= "gen6 emit vertex";
153 /* Honor max_vertices layout indication in geometry shader by ignoring any
154 * vertices coming after c->gp->program.VerticesOut.
156 unsigned num_output_vertices
= c
->gp
->program
.VerticesOut
;
157 emit(CMP(dst_null_d(), this->vertex_count
, src_reg(num_output_vertices
),
159 emit(IF(BRW_PREDICATE_NORMAL
));
161 /* Buffer all output slots for this vertex in vertex_output */
162 for (int slot
= 0; slot
< prog_data
->vue_map
.num_slots
; ++slot
) {
163 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
164 if (varying
!= VARYING_SLOT_PSIZ
) {
165 dst_reg
dst(this->vertex_output
);
166 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
167 memcpy(dst
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
168 emit_urb_slot(dst
, varying
);
170 /* The PSIZ slot can pack multiple varyings in different channels
171 * and emit_urb_slot() will produce a MOV instruction for each of
172 * them. Since we are writing to an array, that will translate to
173 * possibly multiple MOV instructions with an array destination and
174 * each will generate a scratch write with the same offset into
175 * scratch space (thus, each one overwriting the previous). This is
176 * not what we want. What we will do instead is emit PSIZ to a
177 * regular temporary register, then move that register into the
178 * array. This way we only have one instruction with an array
179 * destination and we only produce a single scratch write.
181 dst_reg tmp
= dst_reg(src_reg(this, glsl_type::uvec4_type
));
182 emit_urb_slot(tmp
, varying
);
183 dst_reg
dst(this->vertex_output
);
184 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
185 memcpy(dst
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
186 vec4_instruction
*inst
= emit(MOV(dst
, src_reg(tmp
)));
187 inst
->force_writemask_all
= true;
190 emit(ADD(dst_reg(this->vertex_output_offset
),
191 this->vertex_output_offset
, 1u));
194 /* Now buffer flags for this vertex */
195 dst_reg
dst(this->vertex_output
);
196 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
197 memcpy(dst
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
198 if (c
->gp
->program
.OutputType
== GL_POINTS
) {
199 /* If we are outputting points, then every vertex has PrimStart and
202 emit(MOV(dst
, (_3DPRIM_POINTLIST
<< URB_WRITE_PRIM_TYPE_SHIFT
) |
203 URB_WRITE_PRIM_START
| URB_WRITE_PRIM_END
));
204 emit(ADD(dst_reg(this->prim_count
), this->prim_count
, 1u));
206 /* Otherwise, we can only set the PrimStart flag, which we have stored
207 * in the first_vertex register. We will have to wait until we execute
208 * EndPrimitive() or we end the thread to set the PrimEnd flag on a
211 emit(OR(dst
, this->first_vertex
,
212 (c
->prog_data
.output_topology
<< URB_WRITE_PRIM_TYPE_SHIFT
)));
213 emit(MOV(dst_reg(this->first_vertex
), 0u));
215 emit(ADD(dst_reg(this->vertex_output_offset
),
216 this->vertex_output_offset
, 1u));
218 /* Update vertex count */
219 emit(ADD(dst_reg(this->vertex_count
), this->vertex_count
, 1u));
221 emit(BRW_OPCODE_ENDIF
);
225 gen6_gs_visitor::visit(ir_end_primitive
*)
227 this->current_annotation
= "gen6 end primitive";
228 /* Calling EndPrimitive() is optional for point output. In this case we set
229 * the PrimEnd flag when we process EmitVertex().
231 if (c
->gp
->program
.OutputType
== GL_POINTS
)
234 /* Otherwise we know that the last vertex we have processed was the last
235 * vertex in the primitive and we need to set its PrimEnd flag, so do this
236 * unless we haven't emitted that vertex at all (vertex_count != 0).
238 * Notice that we have already incremented vertex_count when we processed
239 * the last emit_vertex, so we need to take that into account in the
240 * comparison below (hence the num_output_vertices + 1 in the comparison
243 unsigned num_output_vertices
= c
->gp
->program
.VerticesOut
;
244 emit(CMP(dst_null_d(), this->vertex_count
, src_reg(num_output_vertices
+ 1),
246 vec4_instruction
*inst
= emit(CMP(dst_null_d(),
247 this->vertex_count
, 0u,
248 BRW_CONDITIONAL_NEQ
));
249 inst
->predicate
= BRW_PREDICATE_NORMAL
;
250 emit(IF(BRW_PREDICATE_NORMAL
));
252 /* vertex_output_offset is already pointing at the first entry of the
253 * next vertex. So subtract 1 to modify the flags for the previous
256 src_reg
offset(this, glsl_type::uint_type
);
257 emit(ADD(dst_reg(offset
), this->vertex_output_offset
, src_reg(-1)));
259 src_reg
dst(this->vertex_output
);
260 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
261 memcpy(dst
.reladdr
, &offset
, sizeof(src_reg
));
263 emit(OR(dst_reg(dst
), dst
, URB_WRITE_PRIM_END
));
264 emit(ADD(dst_reg(this->prim_count
), this->prim_count
, 1u));
266 /* Set the first vertex flag to indicate that the next vertex will start
269 emit(MOV(dst_reg(this->first_vertex
), URB_WRITE_PRIM_START
));
271 emit(BRW_OPCODE_ENDIF
);
275 gen6_gs_visitor::emit_urb_write_header(int mrf
)
277 this->current_annotation
= "gen6 urb header";
278 /* Compute offset of the flags for the current vertex in vertex_output and
279 * write them in dw2 of the message header.
281 * Notice that by the time that emit_thread_end() calls here
282 * vertex_output_offset should point to the first data item of the current
283 * vertex in vertex_output, thus we only need to add the number of output
284 * slots per vertex to that offset to obtain the flags data offset.
286 src_reg
flags_offset(this, glsl_type::uint_type
);
287 emit(ADD(dst_reg(flags_offset
),
288 this->vertex_output_offset
, src_reg(prog_data
->vue_map
.num_slots
)));
290 src_reg
flags_data(this->vertex_output
);
291 flags_data
.reladdr
= ralloc(mem_ctx
, src_reg
);
292 memcpy(flags_data
.reladdr
, &flags_offset
, sizeof(src_reg
));
294 emit(GS_OPCODE_SET_DWORD_2
, dst_reg(MRF
, mrf
), flags_data
);
298 gen6_gs_visitor::emit_urb_write_opcode(bool complete
, int base_mrf
,
299 int last_mrf
, int urb_offset
)
301 vec4_instruction
*inst
= NULL
;
304 /* If the vertex is not complete we don't have to do anything special */
305 inst
= emit(GS_OPCODE_URB_WRITE
);
306 inst
->urb_write_flags
= BRW_URB_WRITE_NO_FLAGS
;
308 /* Otherwise we always request to allocate a new VUE handle. If this is
309 * the last write before the EOT message and the new handle never gets
310 * used it will be dereferenced when we send the EOT message. This is
311 * necessary to avoid different setups for the EOT message (one for the
312 * case when there is no output and another for the case when there is)
313 * which would require to end the program with an IF/ELSE/ENDIF block,
314 * something we do not want.
316 inst
= emit(GS_OPCODE_URB_WRITE_ALLOCATE
);
317 inst
->urb_write_flags
= BRW_URB_WRITE_COMPLETE
;
318 inst
->dst
= dst_reg(MRF
, base_mrf
);
319 inst
->src
[0] = this->temp
;
322 inst
->base_mrf
= base_mrf
;
323 /* URB data written (does not include the message header reg) must
324 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
325 * section 5.4.3.2.2: URB_INTERLEAVED.
327 int mlen
= last_mrf
- base_mrf
;
331 inst
->offset
= urb_offset
;
335 gen6_gs_visitor::emit_thread_end()
337 /* Make sure the current primitive is ended: we know it is not ended when
338 * first_vertex is not zero. This is only relevant for outputs other than
339 * points because in the point case we set PrimEnd on all vertices.
341 if (c
->gp
->program
.OutputType
!= GL_POINTS
) {
342 emit(CMP(dst_null_d(), this->first_vertex
, 0u, BRW_CONDITIONAL_Z
));
343 emit(IF(BRW_PREDICATE_NORMAL
));
345 visit((ir_end_primitive
*) NULL
);
347 emit(BRW_OPCODE_ENDIF
);
351 * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
352 * 2) Loop over all buffered vertex data and write it to corresponding
354 * 3) Allocate new VUE handles for all vertices other than the first.
355 * 4) Send a final EOT message.
358 /* MRF 0 is reserved for the debugger, so start with message header
363 /* In the process of generating our URB write message contents, we
364 * may need to unspill a register or load from an array. Those
365 * reads would use MRFs 14-15.
367 int max_usable_mrf
= 13;
369 /* Issue the FF_SYNC message and obtain the initial VUE handle. */
370 emit(CMP(dst_null_d(), this->vertex_count
, 0u, BRW_CONDITIONAL_G
));
371 emit(IF(BRW_PREDICATE_NORMAL
));
373 this->current_annotation
= "gen6 thread end: ff_sync";
375 vec4_instruction
*inst
;
376 if (c
->prog_data
.gen6_xfb_enabled
) {
377 src_reg
sol_temp(this, glsl_type::uvec4_type
);
378 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES
,
383 inst
= emit(GS_OPCODE_FF_SYNC
,
384 dst_reg(this->temp
), this->prim_count
, this->svbi
);
386 inst
= emit(GS_OPCODE_FF_SYNC
,
387 dst_reg(this->temp
), this->prim_count
, src_reg(0u));
389 inst
->base_mrf
= base_mrf
;
391 /* Loop over all buffered vertices and emit URB write messages */
392 this->current_annotation
= "gen6 thread end: urb writes init";
393 src_reg
vertex(this, glsl_type::uint_type
);
394 emit(MOV(dst_reg(vertex
), 0u));
395 emit(MOV(dst_reg(this->vertex_output_offset
), 0u));
397 this->current_annotation
= "gen6 thread end: urb writes";
400 emit(CMP(dst_null_d(), vertex
, this->vertex_count
, BRW_CONDITIONAL_GE
));
401 inst
= emit(BRW_OPCODE_BREAK
);
402 inst
->predicate
= BRW_PREDICATE_NORMAL
;
404 /* First we prepare the message header */
405 emit_urb_write_header(base_mrf
);
407 /* Then add vertex data to the message in interleaved fashion */
409 bool complete
= false;
411 int mrf
= base_mrf
+ 1;
413 /* URB offset is in URB row increments, and each of our MRFs is half
414 * of one of those, since we're doing interleaved writes.
416 int urb_offset
= slot
/ 2;
418 for (; slot
< prog_data
->vue_map
.num_slots
; ++slot
) {
419 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
420 current_annotation
= output_reg_annotation
[varying
];
422 /* Compute offset of this slot for the current vertex
425 src_reg
data(this->vertex_output
);
426 data
.reladdr
= ralloc(mem_ctx
, src_reg
);
427 memcpy(data
.reladdr
, &this->vertex_output_offset
,
430 /* Copy this slot to the appropriate message register */
431 dst_reg reg
= dst_reg(MRF
, mrf
);
432 reg
.type
= output_reg
[varying
].type
;
433 data
.type
= reg
.type
;
434 vec4_instruction
*inst
= emit(MOV(reg
, data
));
435 inst
->force_writemask_all
= true;
438 emit(ADD(dst_reg(this->vertex_output_offset
),
439 this->vertex_output_offset
, 1u));
441 /* If this was max_usable_mrf, we can't fit anything more into
444 if (mrf
> max_usable_mrf
) {
450 complete
= slot
>= prog_data
->vue_map
.num_slots
;
451 emit_urb_write_opcode(complete
, base_mrf
, mrf
, urb_offset
);
454 /* Skip over the flags data item so that vertex_output_offset points
455 * to the first data item of the next vertex, so that we can start
456 * writing the next vertex.
458 emit(ADD(dst_reg(this->vertex_output_offset
),
459 this->vertex_output_offset
, 1u));
461 emit(ADD(dst_reg(vertex
), vertex
, 1u));
463 emit(BRW_OPCODE_WHILE
);
465 if (c
->prog_data
.gen6_xfb_enabled
)
468 emit(BRW_OPCODE_ENDIF
);
470 /* Finally, emit EOT message.
472 * In gen6 we need to end the thread differently depending on whether we have
473 * emitted at least one vertex or not. In case we did, the EOT message must
474 * always include the COMPLETE flag or else the GPU hangs. If we have not
475 * produced any output we can't use the COMPLETE flag.
477 * However, this would lead us to end the program with an ENDIF opcode,
478 * which we want to avoid, so what we do is that we always request a new
479 * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
480 * With this we make sure that whether we have emitted at least one vertex
481 * or none at all, we have to finish the thread without writing to the URB,
482 * which works for both cases by setting the COMPLETE and UNUSED flags in
485 this->current_annotation
= "gen6 thread end: EOT";
487 if (c
->prog_data
.gen6_xfb_enabled
) {
488 /* When emitting EOT, set SONumPrimsWritten Increment Value. */
489 src_reg
data(this, glsl_type::uint_type
);
490 emit(AND(dst_reg(data
), this->sol_prim_written
, src_reg(0xffffu
)));
491 emit(SHL(dst_reg(data
), data
, src_reg(16u)));
492 emit(GS_OPCODE_SET_DWORD_2
, dst_reg(MRF
, base_mrf
), data
);
495 vec4_instruction
*inst
= emit(GS_OPCODE_THREAD_END
);
496 inst
->urb_write_flags
= BRW_URB_WRITE_COMPLETE
| BRW_URB_WRITE_UNUSED
;
497 inst
->base_mrf
= base_mrf
;
502 gen6_gs_visitor::setup_payload()
504 int attribute_map
[BRW_VARYING_SLOT_COUNT
* MAX_GS_INPUT_VERTICES
];
506 /* Attributes are going to be interleaved, so one register contains two
509 int attributes_per_reg
= 2;
511 /* If a geometry shader tries to read from an input that wasn't written by
512 * the vertex shader, that produces undefined results, but it shouldn't
513 * crash anything. So initialize attribute_map to zeros--that ensures that
514 * these undefined results are read from r0.
516 memset(attribute_map
, 0, sizeof(attribute_map
));
520 /* The payload always contains important data in r0. */
523 /* r1 is always part of the payload and it holds information relevant
524 * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
525 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
526 * information (and move the original value to a virtual register if
529 if (c
->prog_data
.include_primitive_id
)
530 attribute_map
[VARYING_SLOT_PRIMITIVE_ID
] = attributes_per_reg
* reg
;
533 reg
= setup_uniforms(reg
);
535 reg
= setup_varying_inputs(reg
, attribute_map
, attributes_per_reg
);
537 lower_attributes_to_hw_regs(attribute_map
, true);
539 this->first_non_payload_grf
= reg
;
543 gen6_gs_visitor::xfb_setup()
545 static const unsigned swizzle_for_offset
[4] = {
546 BRW_SWIZZLE4(0, 1, 2, 3),
547 BRW_SWIZZLE4(1, 2, 3, 3),
548 BRW_SWIZZLE4(2, 3, 3, 3),
549 BRW_SWIZZLE4(3, 3, 3, 3)
552 struct brw_gs_prog_data
*prog_data
=
553 (struct brw_gs_prog_data
*) &c
->prog_data
;
555 const struct gl_transform_feedback_info
*linked_xfb_info
=
556 &this->shader_prog
->LinkedTransformFeedback
;
559 /* Make sure that the VUE slots won't overflow the unsigned chars in
560 * prog_data->transform_feedback_bindings[].
562 STATIC_ASSERT(BRW_VARYING_SLOT_COUNT
<= 256);
564 /* Make sure that we don't need more binding table entries than we've
565 * set aside for use in transform feedback. (We shouldn't, since we
566 * set aside enough binding table entries to have one per component).
568 assert(linked_xfb_info
->NumOutputs
<= BRW_MAX_SOL_BINDINGS
);
570 prog_data
->num_transform_feedback_bindings
= linked_xfb_info
->NumOutputs
;
571 for (i
= 0; i
< prog_data
->num_transform_feedback_bindings
; i
++) {
572 prog_data
->transform_feedback_bindings
[i
] =
573 linked_xfb_info
->Outputs
[i
].OutputRegister
;
574 prog_data
->transform_feedback_swizzles
[i
] =
575 swizzle_for_offset
[linked_xfb_info
->Outputs
[i
].ComponentOffset
];
580 gen6_gs_visitor::xfb_write()
583 struct brw_gs_prog_data
*prog_data
=
584 (struct brw_gs_prog_data
*) &c
->prog_data
;
586 if (!prog_data
->num_transform_feedback_bindings
)
589 switch (c
->prog_data
.output_topology
) {
590 case _3DPRIM_POINTLIST
:
593 case _3DPRIM_LINELIST
:
594 case _3DPRIM_LINESTRIP
:
595 case _3DPRIM_LINELOOP
:
598 case _3DPRIM_TRILIST
:
600 case _3DPRIM_TRISTRIP
:
601 case _3DPRIM_RECTLIST
:
604 case _3DPRIM_QUADLIST
:
605 case _3DPRIM_QUADSTRIP
:
606 case _3DPRIM_POLYGON
:
610 unreachable("Unexpected primitive type in Gen6 SOL program.");
613 this->current_annotation
= "gen6 thread end: svb writes init";
615 emit(MOV(dst_reg(this->vertex_output_offset
), 0u));
616 emit(MOV(dst_reg(this->sol_prim_written
), 0u));
618 /* Check that at least one primitive can be written
620 * Note: since we use the binding table to keep track of buffer offsets
621 * and stride, the GS doesn't need to keep track of a separate pointer
622 * into each buffer; it uses a single pointer which increments by 1 for
623 * each vertex. So we use SVBI0 for this pointer, regardless of whether
624 * transform feedback is in interleaved or separate attribs mode.
626 src_reg
sol_temp(this, glsl_type::uvec4_type
);
627 emit(ADD(dst_reg(sol_temp
), this->svbi
, src_reg(num_verts
)));
629 /* Compare SVBI calculated number with the maximum value, which is
630 * in R1.4 (previously saved in this->max_svbi) for gen6.
632 emit(CMP(dst_null_d(), sol_temp
, this->max_svbi
, BRW_CONDITIONAL_LE
));
633 emit(IF(BRW_PREDICATE_NORMAL
));
635 src_reg destination_indices_uw
=
636 retype(destination_indices
, BRW_REGISTER_TYPE_UW
);
638 vec4_instruction
*inst
= emit(MOV(dst_reg(destination_indices_uw
),
639 brw_imm_v(0x00020100))); /* (0, 1, 2) */
640 inst
->force_writemask_all
= true;
642 emit(ADD(dst_reg(this->destination_indices
),
643 this->destination_indices
,
646 emit(BRW_OPCODE_ENDIF
);
648 /* Write transform feedback data for all processed vertices. */
649 for (int i
= 0; i
< c
->gp
->program
.VerticesOut
; i
++) {
650 emit(MOV(dst_reg(sol_temp
), i
));
651 emit(CMP(dst_null_d(), sol_temp
, this->vertex_count
,
653 emit(IF(BRW_PREDICATE_NORMAL
));
655 xfb_program(i
, num_verts
);
657 emit(BRW_OPCODE_ENDIF
);
662 gen6_gs_visitor::xfb_program(unsigned vertex
, unsigned num_verts
)
664 struct brw_gs_prog_data
*prog_data
=
665 (struct brw_gs_prog_data
*) &c
->prog_data
;
667 unsigned num_bindings
= prog_data
->num_transform_feedback_bindings
;
668 src_reg
sol_temp(this, glsl_type::uvec4_type
);
670 /* Check for buffer overflow: we need room to write the complete primitive
671 * (all vertices). Otherwise, avoid writing any vertices for it
673 emit(ADD(dst_reg(sol_temp
), this->sol_prim_written
, 1u));
674 emit(MUL(dst_reg(sol_temp
), sol_temp
, src_reg(num_verts
)));
675 emit(ADD(dst_reg(sol_temp
), sol_temp
, this->svbi
));
676 emit(CMP(dst_null_d(), sol_temp
, this->max_svbi
, BRW_CONDITIONAL_LE
));
677 emit(IF(BRW_PREDICATE_NORMAL
));
679 /* Avoid overwriting MRF 1 as it is used as URB write message header */
680 dst_reg
mrf_reg(MRF
, 2);
682 this->current_annotation
= "gen6: emit SOL vertex data";
683 /* For each vertex, generate code to output each varying using the
684 * appropriate binding table entry.
686 for (binding
= 0; binding
< num_bindings
; ++binding
) {
687 unsigned char varying
=
688 prog_data
->transform_feedback_bindings
[binding
];
690 /* Set up the correct destination index for this vertex */
691 vec4_instruction
*inst
= emit(GS_OPCODE_SVB_SET_DST_INDEX
,
693 this->destination_indices
);
694 inst
->sol_vertex
= vertex
% num_verts
;
696 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
698 * "Prior to End of Thread with a URB_WRITE, the kernel must
699 * ensure that all writes are complete by sending the final
700 * write as a committed write."
702 bool final_write
= binding
== (unsigned) num_bindings
- 1 &&
703 inst
->sol_vertex
== num_verts
- 1;
705 /* Compute offset of this varying for the current vertex
708 this->current_annotation
= output_reg_annotation
[varying
];
709 src_reg
data(this->vertex_output
);
710 data
.reladdr
= ralloc(mem_ctx
, src_reg
);
711 int offset
= get_vertex_output_offset_for_varying(vertex
, varying
);
712 emit(MOV(dst_reg(this->vertex_output_offset
), offset
));
713 memcpy(data
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
714 data
.type
= output_reg
[varying
].type
;
716 /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
717 * same slot, so make sure we write the appropriate channel
719 if (varying
== VARYING_SLOT_PSIZ
)
720 data
.swizzle
= BRW_SWIZZLE_WWWW
;
721 else if (varying
== VARYING_SLOT_LAYER
)
722 data
.swizzle
= BRW_SWIZZLE_YYYY
;
723 else if (varying
== VARYING_SLOT_VIEWPORT
)
724 data
.swizzle
= BRW_SWIZZLE_ZZZZ
;
726 data
.swizzle
= prog_data
->transform_feedback_swizzles
[binding
];
729 inst
= emit(GS_OPCODE_SVB_WRITE
, mrf_reg
, data
, sol_temp
);
730 inst
->sol_binding
= binding
;
731 inst
->sol_final_write
= final_write
;
734 /* This is the last vertex of the primitive, then increment
735 * SO num primitive counter and destination indices.
737 emit(ADD(dst_reg(this->destination_indices
),
738 this->destination_indices
,
739 src_reg(num_verts
)));
740 emit(ADD(dst_reg(this->sol_prim_written
),
741 this->sol_prim_written
, 1u));
745 this->current_annotation
= NULL
;
747 emit(BRW_OPCODE_ENDIF
);
751 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex
, int varying
)
753 /* Find the output slot assigned to this varying.
755 * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
756 * as VARYING_SLOT_PSIZ.
758 if (varying
== VARYING_SLOT_LAYER
|| varying
== VARYING_SLOT_VIEWPORT
)
759 varying
= VARYING_SLOT_PSIZ
;
760 int slot
= prog_data
->vue_map
.varying_to_slot
[varying
];
763 /* This varying does not exist in the VUE so we are not writing to it
764 * and its value is undefined. We still want to return a valid offset
765 * into vertex_output though, to prevent any out-of-bound accesses into
766 * the vertex_output array. Since the value for this varying is undefined
767 * we don't really care for the value we assign to it, so any offset
768 * within the limits of vertex_output will do.
773 return vertex
* (prog_data
->vue_map
.num_slots
+ 1) + slot
;
776 } /* namespace brw */