ccc197d02f7b453bd739acacb4a70efc33294304
[mesa.git] / src / mesa / drivers / dri / i965 / gen6_gs_visitor.cpp
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * This code is based on original work by Ilia Mirkin.
24 */
25
26 /**
27 * \file gen6_gs_visitor.cpp
28 *
29 * Gen6 geometry shader implementation
30 */
31
32 #include "gen6_gs_visitor.h"
33
/* Number of input vertices used to size the attribute map built in
 * setup_payload(): that map reserves BRW_VARYING_SLOT_COUNT entries for
 * each of these vertices.
 */
const unsigned MAX_GS_INPUT_VERTICES = 6u;
35
36 namespace brw {
37
38 void
39 gen6_gs_visitor::emit_prolog()
40 {
41 vec4_gs_visitor::emit_prolog();
42
43 /* Gen6 geometry shaders require to allocate an initial VUE handle via
44 * FF_SYNC message, however the documentation remarks that only one thread
45 * can write to the URB simultaneously and the FF_SYNC message provides the
46 * synchronization mechanism for this, so using this message effectively
47 * stalls the thread until it is its turn to write to the URB. Because of
48 * this, the best way to implement geometry shader algorithms in gen6 is to
49 * execute the algorithm before the FF_SYNC message to maximize parallelism.
50 *
51 * To achieve this we buffer the geometry shader outputs for each emitted
52 * vertex in vertex_output during operation. Then, when we have processed
53 * the last vertex (that is, at thread end time), we send the FF_SYNC
54 * message to allocate the initial VUE handle and write all buffered vertex
55 * data to the URB in one go.
56 *
57 * For each emitted vertex, vertex_output will hold vue_map.num_slots
58 * data items plus one additional item to hold required flags
59 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
60 * which come right after the data items for that vertex. Vertex data and
61 * flags for the next vertex come right after the data items and flags for
62 * the previous vertex.
63 */
64 this->current_annotation = "gen6 prolog";
65 this->vertex_output = src_reg(this,
66 glsl_type::uint_type,
67 (prog_data->vue_map.num_slots + 1) *
68 c->gp->program.VerticesOut);
69 this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
70 emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
71
72 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
73 * so initialize it once to R0.
74 */
75 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
76 retype(brw_vec8_grf(0, 0),
77 BRW_REGISTER_TYPE_UD)));
78 inst->force_writemask_all = true;
79
80 /* This will be used as a temporary to store writeback data of FF_SYNC
81 * and URB_WRITE messages.
82 */
83 this->temp = src_reg(this, glsl_type::uint_type);
84
85 /* This will be used to know when we are processing the first vertex of
86 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
87 * that we are processing the first vertex in the primitive and to zero
88 * otherwise. This way we can use its value directly in the URB write
89 * headers.
90 */
91 this->first_vertex = src_reg(this, glsl_type::uint_type);
92 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
93
94 /* The FF_SYNC message requires to know the number of primitives generated,
95 * so keep a counter for this.
96 */
97 this->prim_count = src_reg(this, glsl_type::uint_type);
98 emit(MOV(dst_reg(this->prim_count), 0u));
99
100 if (c->prog_data.gen6_xfb_enabled) {
101 const struct gl_transform_feedback_info *linked_xfb_info =
102 &this->shader_prog->LinkedTransformFeedback;
103
104 /* Gen6 geometry shaders are required to ask for Streamed Vertex Buffer
105 * Indices values via FF_SYNC message, when Transform Feedback is
106 * enabled.
107 *
108 * To achieve this we buffer the Transform feedback outputs for each
109 * emitted vertex in xfb_output during operation. Then, when we have
110 * processed the last vertex (that is, at thread end time), we know all
111 * the required data for the FF_SYNC message header in order to receive
112 * the SVBI in the writeback.
113 *
114 * For each emitted vertex, xfb_output will hold
115 * num_transform_feedback_bindings data items plus one, which will
116 * indicate the end of the primitive. Next vertex's data comes right
117 * after.
118 */
119 this->xfb_output = src_reg(this,
120 glsl_type::uint_type,
121 linked_xfb_info->NumOutputs *
122 c->gp->program.VerticesOut);
123 this->xfb_output_offset = src_reg(this, glsl_type::uint_type);
124 emit(MOV(dst_reg(this->xfb_output_offset), src_reg(0u)));
125 /* Create a virtual register to hold destination indices in SOL */
126 this->destination_indices = src_reg(this, glsl_type::uvec4_type);
127 /* Create a virtual register to hold number of written primitives */
128 this->sol_prim_written = src_reg(this, glsl_type::uint_type);
129 /* Create a virtual register to hold Streamed Vertex Buffer Indices */
130 this->svbi = src_reg(this, glsl_type::uvec4_type);
131 /* Create a virtual register to hold max values of SVBI */
132 this->max_svbi = src_reg(this, glsl_type::uvec4_type);
133 emit(MOV(dst_reg(this->max_svbi),
134 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
135 }
136
137 /* PrimitveID is delivered in r0.1 of the thread payload. If the program
138 * needs it we have to move it to a separate register where we can map
139 * the atttribute.
140 *
141 * Notice that we cannot use a virtual register for this, because we need to
142 * map all input attributes to hardware registers in setup_payload(),
143 * which happens before virtual registers are mapped to hardware registers.
144 * We could work around that issue if we were able to compute the first
145 * non-payload register here and move the PrimitiveID information to that
146 * register, but we can't because at this point we don't know the final
147 * number uniforms that will be included in the payload.
148 *
149 * So, what we do is to place PrimitiveID information in r1, which is always
150 * delivered as part of the payload, but its only populated with data
151 * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
152 * in the 3DSTATE_GS state packet. That information can be obtained by other
153 * means though, so we can safely use r1 for this purpose.
154 */
155 if (c->prog_data.include_primitive_id) {
156 this->primitive_id =
157 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
158 emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
159 }
160 }
161
162 void
163 gen6_gs_visitor::visit(ir_emit_vertex *)
164 {
165 this->current_annotation = "gen6 emit vertex";
166 /* Honor max_vertex layout indication in geometry shader by ignoring any
167 * vertices coming after c->gp->program.VerticesOut.
168 */
169 unsigned num_output_vertices = c->gp->program.VerticesOut;
170 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
171 BRW_CONDITIONAL_L));
172 emit(IF(BRW_PREDICATE_NORMAL));
173 {
174 if (c->prog_data.gen6_xfb_enabled)
175 xfb_buffer_output();
176
177 /* Buffer all output slots for this vertex in vertex_output */
178 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
179 int varying = prog_data->vue_map.slot_to_varying[slot];
180 if (varying != VARYING_SLOT_PSIZ) {
181 dst_reg dst(this->vertex_output);
182 dst.reladdr = ralloc(mem_ctx, src_reg);
183 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
184 emit_urb_slot(dst, varying);
185 } else {
186 /* The PSIZ slot can pack multiple varyings in different channels
187 * and emit_urb_slot() will produce a MOV instruction for each of
188 * them. Since we are writing to an array, that will translate to
189 * possibly multiple MOV instructions with an array destination and
190 * each will generate a scratch write with the same offset into
191 * scratch space (thus, each one overwriting the previous). This is
192 * not what we want. What we will do instead is emit PSIZ to a
193 * a regular temporary register, then move that resgister into the
194 * array. This way we only have one instruction with an array
195 * destination and we only produce a single scratch write.
196 */
197 dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
198 emit_urb_slot(tmp, varying);
199 dst_reg dst(this->vertex_output);
200 dst.reladdr = ralloc(mem_ctx, src_reg);
201 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
202 vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
203 inst->force_writemask_all = true;
204 }
205
206 emit(ADD(dst_reg(this->vertex_output_offset),
207 this->vertex_output_offset, 1u));
208 }
209
210 /* Now buffer flags for this vertex */
211 dst_reg dst(this->vertex_output);
212 dst.reladdr = ralloc(mem_ctx, src_reg);
213 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
214 if (c->gp->program.OutputType == GL_POINTS) {
215 /* If we are outputting points, then every vertex has PrimStart and
216 * PrimEnd set.
217 */
218 emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
219 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
220 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
221 } else {
222 /* Otherwise, we can only set the PrimStart flag, which we have stored
223 * in the first_vertex register. We will have to wait until we execute
224 * EndPrimitive() or we end the thread to set the PrimEnd flag on a
225 * vertex.
226 */
227 emit(OR(dst, this->first_vertex,
228 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
229 emit(MOV(dst_reg(this->first_vertex), 0u));
230 }
231 emit(ADD(dst_reg(this->vertex_output_offset),
232 this->vertex_output_offset, 1u));
233
234 /* Update vertex count */
235 emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
236 }
237 emit(BRW_OPCODE_ENDIF);
238 }
239
240 void
241 gen6_gs_visitor::visit(ir_end_primitive *)
242 {
243 this->current_annotation = "gen6 end primitive";
244 /* Calling EndPrimitive() is optional for point output. In this case we set
245 * the PrimEnd flag when we process EmitVertex().
246 */
247 if (c->gp->program.OutputType == GL_POINTS)
248 return;
249
250 /* Otherwise we know that the last vertex we have processed was the last
251 * vertex in the primitive and we need to set its PrimEnd flag, so do this
252 * unless we haven't emitted that vertex at all (vertex_count != 0).
253 *
254 * Notice that we have already incremented vertex_count when we processed
255 * the last emit_vertex, so we need to take that into account in the
256 * comparison below (hence the num_output_vertices + 1 in the comparison
257 * below).
258 */
259 unsigned num_output_vertices = c->gp->program.VerticesOut;
260 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
261 BRW_CONDITIONAL_L));
262 vec4_instruction *inst = emit(CMP(dst_null_d(),
263 this->vertex_count, 0u,
264 BRW_CONDITIONAL_NEQ));
265 inst->predicate = BRW_PREDICATE_NORMAL;
266 emit(IF(BRW_PREDICATE_NORMAL));
267 {
268 /* vertex_output_offset is already pointing at the first entry of the
269 * next vertex. So subtract 1 to modify the flags for the previous
270 * vertex.
271 */
272 src_reg offset(this, glsl_type::uint_type);
273 emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
274
275 src_reg dst(this->vertex_output);
276 dst.reladdr = ralloc(mem_ctx, src_reg);
277 memcpy(dst.reladdr, &offset, sizeof(src_reg));
278
279 emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
280 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
281
282 /* Set the first vertex flag to indicate that the next vertex will start
283 * a primitive.
284 */
285 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
286 }
287 emit(BRW_OPCODE_ENDIF);
288 }
289
290 void
291 gen6_gs_visitor::emit_urb_write_header(int mrf)
292 {
293 this->current_annotation = "gen6 urb header";
294 /* Compute offset of the flags for the current vertex in vertex_output and
295 * write them in dw2 of the message header.
296 *
297 * Notice that by the time that emit_thread_end() calls here
298 * vertex_output_offset should point to the first data item of the current
299 * vertex in vertex_output, thus we only need to add the number of output
300 * slots per vertex to that offset to obtain the flags data offset.
301 */
302 src_reg flags_offset(this, glsl_type::uint_type);
303 emit(ADD(dst_reg(flags_offset),
304 this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
305
306 src_reg flags_data(this->vertex_output);
307 flags_data.reladdr = ralloc(mem_ctx, src_reg);
308 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
309
310 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
311 }
312
313 void
314 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
315 int last_mrf, int urb_offset)
316 {
317 vec4_instruction *inst = NULL;
318
319 if (!complete) {
320 /* If the vertex is not complete we don't have to do anything special */
321 inst = emit(GS_OPCODE_URB_WRITE);
322 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
323 } else {
324 /* Otherwise we always request to allocate a new VUE handle. If this is
325 * the last write before the EOT message and the new handle never gets
326 * used it will be dereferenced when we send the EOT message. This is
327 * necessary to avoid different setups for the EOT message (one for the
328 * case when there is no output and another for the case when there is)
329 * which would require to end the program with an IF/ELSE/ENDIF block,
330 * something we do not want.
331 */
332 inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
333 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
334 inst->dst = dst_reg(MRF, base_mrf);
335 inst->src[0] = this->temp;
336 }
337
338 inst->base_mrf = base_mrf;
339 /* URB data written (does not include the message header reg) must
340 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
341 * section 5.4.3.2.2: URB_INTERLEAVED.
342 */
343 int mlen = last_mrf - base_mrf;
344 if ((mlen % 2) != 1)
345 mlen++;
346 inst->mlen = mlen;
347 inst->offset = urb_offset;
348 }
349
350 void
351 gen6_gs_visitor::emit_thread_end()
352 {
353 /* Make sure the current primitive is ended: we know it is not ended when
354 * first_vertex is not zero. This is only relevant for outputs other than
355 * points because in the point case we set PrimEnd on all vertices.
356 */
357 if (c->gp->program.OutputType != GL_POINTS) {
358 emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
359 emit(IF(BRW_PREDICATE_NORMAL));
360 {
361 visit((ir_end_primitive *) NULL);
362 }
363 emit(BRW_OPCODE_ENDIF);
364 }
365
366 /* Here we have to:
367 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
368 * 2) Loop over all buffered vertex data and write it to corresponding
369 * URB entries.
370 * 3) Allocate new VUE handles for all vertices other than the first.
371 * 4) Send a final EOT message.
372 */
373
374 /* MRF 0 is reserved for the debugger, so start with message header
375 * in MRF 1.
376 */
377 int base_mrf = 1;
378
379 /* In the process of generating our URB write message contents, we
380 * may need to unspill a register or load from an array. Those
381 * reads would use MRFs 14-15.
382 */
383 int max_usable_mrf = 13;
384
385 /* Issue the FF_SYNC message and obtain the initial VUE handle. */
386 emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
387 emit(IF(BRW_PREDICATE_NORMAL));
388 {
389 this->current_annotation = "gen6 thread end: ff_sync";
390
391 vec4_instruction *inst;
392 if (c->prog_data.gen6_xfb_enabled) {
393 src_reg sol_temp(this, glsl_type::uvec4_type);
394 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
395 dst_reg(this->svbi),
396 this->vertex_count,
397 this->prim_count,
398 sol_temp);
399 inst = emit(GS_OPCODE_FF_SYNC,
400 dst_reg(this->temp), this->prim_count, this->svbi);
401 } else {
402 inst = emit(GS_OPCODE_FF_SYNC,
403 dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
404 }
405 inst->base_mrf = base_mrf;
406
407 /* Loop over all buffered vertices and emit URB write messages */
408 this->current_annotation = "gen6 thread end: urb writes init";
409 src_reg vertex(this, glsl_type::uint_type);
410 emit(MOV(dst_reg(vertex), 0u));
411 emit(MOV(dst_reg(this->vertex_output_offset), 0u));
412
413 this->current_annotation = "gen6 thread end: urb writes";
414 emit(BRW_OPCODE_DO);
415 {
416 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
417 inst = emit(BRW_OPCODE_BREAK);
418 inst->predicate = BRW_PREDICATE_NORMAL;
419
420 /* First we prepare the message header */
421 emit_urb_write_header(base_mrf);
422
423 /* Then add vertex data to the message in interleaved fashion */
424 int slot = 0;
425 bool complete = false;
426 do {
427 int mrf = base_mrf + 1;
428
429 /* URB offset is in URB row increments, and each of our MRFs is half
430 * of one of those, since we're doing interleaved writes.
431 */
432 int urb_offset = slot / 2;
433
434 for (; slot < prog_data->vue_map.num_slots; ++slot) {
435 int varying = prog_data->vue_map.slot_to_varying[slot];
436 current_annotation = output_reg_annotation[varying];
437
438 /* Compute offset of this slot for the current vertex
439 * in vertex_output
440 */
441 src_reg data(this->vertex_output);
442 data.reladdr = ralloc(mem_ctx, src_reg);
443 memcpy(data.reladdr, &this->vertex_output_offset,
444 sizeof(src_reg));
445
446 /* Copy this slot to the appropriate message register */
447 dst_reg reg = dst_reg(MRF, mrf);
448 reg.type = output_reg[varying].type;
449 data.type = reg.type;
450 vec4_instruction *inst = emit(MOV(reg, data));
451 inst->force_writemask_all = true;
452
453 mrf++;
454 emit(ADD(dst_reg(this->vertex_output_offset),
455 this->vertex_output_offset, 1u));
456
457 /* If this was max_usable_mrf, we can't fit anything more into
458 * this URB WRITE.
459 */
460 if (mrf > max_usable_mrf) {
461 slot++;
462 break;
463 }
464 }
465
466 complete = slot >= prog_data->vue_map.num_slots;
467 emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
468 } while (!complete);
469
470 /* Skip over the flags data item so that vertex_output_offset points
471 * to the first data item of the next vertex, so that we can start
472 * writing the next vertex.
473 */
474 emit(ADD(dst_reg(this->vertex_output_offset),
475 this->vertex_output_offset, 1u));
476
477 emit(ADD(dst_reg(vertex), vertex, 1u));
478 }
479 emit(BRW_OPCODE_WHILE);
480
481 if (c->prog_data.gen6_xfb_enabled)
482 xfb_write();
483 }
484 emit(BRW_OPCODE_ENDIF);
485
486 /* Finally, emit EOT message.
487 *
488 * In gen6 we need to end the thread differently depending on whether we have
489 * emitted at least one vertex or not. In case we did, the EOT message must
490 * always include the COMPLETE flag or else the GPU hangs. If we have not
491 * produced any output we can't use the COMPLETE flag.
492 *
493 * However, this would lead us to end the program with an ENDIF opcode,
494 * which we want to avoid, so what we do is that we always request a new
495 * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
496 * With this we make sure that whether we have emitted at least one vertex
497 * or none at all, we have to finish the thread without writing to the URB,
498 * which works for both cases by setting the COMPLETE and UNUSED flags in
499 * the EOT message.
500 */
501 this->current_annotation = "gen6 thread end: EOT";
502
503 if (c->prog_data.gen6_xfb_enabled) {
504 /* When emitting EOT, set SONumPrimsWritten Increment Value. */
505 src_reg data(this, glsl_type::uint_type);
506 emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
507 emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
508 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
509 }
510
511 vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
512 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
513 inst->base_mrf = base_mrf;
514 inst->mlen = 1;
515 }
516
517 void
518 gen6_gs_visitor::setup_payload()
519 {
520 int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
521
522 /* Attributes are going to be interleaved, so one register contains two
523 * attribute slots.
524 */
525 int attributes_per_reg = 2;
526
527 /* If a geometry shader tries to read from an input that wasn't written by
528 * the vertex shader, that produces undefined results, but it shouldn't
529 * crash anything. So initialize attribute_map to zeros--that ensures that
530 * these undefined results are read from r0.
531 */
532 memset(attribute_map, 0, sizeof(attribute_map));
533
534 int reg = 0;
535
536 /* The payload always contains important data in r0. */
537 reg++;
538
539 /* r1 is always part of the payload and it holds information relevant
540 * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
541 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
542 * information (and move the original value to a virtual register if
543 * necessary).
544 */
545 if (c->prog_data.include_primitive_id)
546 attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
547 reg++;
548
549 reg = setup_uniforms(reg);
550
551 reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
552
553 lower_attributes_to_hw_regs(attribute_map, true);
554
555 this->first_non_payload_grf = reg;
556 }
557
558 void
559 gen6_gs_visitor::xfb_buffer_output()
560 {
561 static const unsigned swizzle_for_offset[4] = {
562 BRW_SWIZZLE4(0, 1, 2, 3),
563 BRW_SWIZZLE4(1, 2, 3, 3),
564 BRW_SWIZZLE4(2, 3, 3, 3),
565 BRW_SWIZZLE4(3, 3, 3, 3)
566 };
567
568 struct brw_gs_prog_data *prog_data =
569 (struct brw_gs_prog_data *) &c->prog_data;
570
571 if (!prog_data->num_transform_feedback_bindings) {
572 const struct gl_transform_feedback_info *linked_xfb_info =
573 &this->shader_prog->LinkedTransformFeedback;
574 int i;
575
576 /* Make sure that the VUE slots won't overflow the unsigned chars in
577 * prog_data->transform_feedback_bindings[].
578 */
579 STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
580
581 /* Make sure that we don't need more binding table entries than we've
582 * set aside for use in transform feedback. (We shouldn't, since we
583 * set aside enough binding table entries to have one per component).
584 */
585 assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
586
587 prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
588 for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
589 prog_data->transform_feedback_bindings[i] =
590 linked_xfb_info->Outputs[i].OutputRegister;
591 prog_data->transform_feedback_swizzles[i] =
592 swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
593 }
594 }
595
596 /* Buffer all TF outputs for this vertex in xfb_output */
597 for (int binding = 0; binding < prog_data->num_transform_feedback_bindings;
598 binding++) {
599 unsigned varying =
600 prog_data->transform_feedback_bindings[binding];
601 dst_reg dst(this->xfb_output);
602 dst.reladdr = ralloc(mem_ctx, src_reg);
603 memcpy(dst.reladdr, &this->xfb_output_offset, sizeof(src_reg));
604 dst.type = output_reg[varying].type;
605
606 this->current_annotation = output_reg_annotation[varying];
607 src_reg out_reg = src_reg(output_reg[varying]);
608 out_reg.swizzle = varying == VARYING_SLOT_PSIZ
609 ? BRW_SWIZZLE_WWWW : prog_data->transform_feedback_swizzles[binding];
610 emit(MOV(dst, out_reg));
611
612 emit(ADD(dst_reg(this->xfb_output_offset), this->xfb_output_offset, 1u));
613 }
614 }
615
616 void
617 gen6_gs_visitor::xfb_write()
618 {
619 unsigned num_verts;
620 struct brw_gs_prog_data *prog_data =
621 (struct brw_gs_prog_data *) &c->prog_data;
622
623 if (!prog_data->num_transform_feedback_bindings)
624 return;
625
626 switch (c->prog_data.output_topology) {
627 case _3DPRIM_POINTLIST:
628 num_verts = 1;
629 break;
630 case _3DPRIM_LINELIST:
631 case _3DPRIM_LINESTRIP:
632 case _3DPRIM_LINELOOP:
633 num_verts = 2;
634 break;
635 case _3DPRIM_TRILIST:
636 case _3DPRIM_TRIFAN:
637 case _3DPRIM_TRISTRIP:
638 case _3DPRIM_RECTLIST:
639 num_verts = 3;
640 break;
641 case _3DPRIM_QUADLIST:
642 case _3DPRIM_QUADSTRIP:
643 case _3DPRIM_POLYGON:
644 num_verts = 3;
645 break;
646 default:
647 unreachable("Unexpected primitive type in Gen6 SOL program.");
648 }
649
650 this->current_annotation = "gen6 thread end: svb writes init";
651
652 emit(MOV(dst_reg(this->xfb_output_offset), 0u));
653 emit(MOV(dst_reg(this->sol_prim_written), 0u));
654
655 /* Check that at least one primitive can be written
656 *
657 * Note: since we use the binding table to keep track of buffer offsets
658 * and stride, the GS doesn't need to keep track of a separate pointer
659 * into each buffer; it uses a single pointer which increments by 1 for
660 * each vertex. So we use SVBI0 for this pointer, regardless of whether
661 * transform feedback is in interleaved or separate attribs mode.
662 */
663 src_reg sol_temp(this, glsl_type::uvec4_type);
664 emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
665
666 /* Compare SVBI calculated number with the maximum value, which is
667 * in R1.4 (previously saved in this->max_svbi) for gen6.
668 */
669 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
670 emit(IF(BRW_PREDICATE_NORMAL));
671 {
672 struct src_reg destination_indices_uw =
673 retype(destination_indices, BRW_REGISTER_TYPE_UW);
674
675 vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
676 brw_imm_v(0x00020100))); /* (0, 1, 2) */
677 inst->force_writemask_all = true;
678
679 emit(ADD(dst_reg(this->destination_indices),
680 this->destination_indices,
681 this->svbi));
682 }
683 emit(BRW_OPCODE_ENDIF);
684
685 this->current_vertex = 0;
686 /* Make sure we do not emit more transform feedback data than the amount
687 * we have buffered.
688 */
689 for (int i = 0; i < c->gp->program.VerticesOut; i++) {
690 emit(MOV(dst_reg(sol_temp), i));
691 emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
692 BRW_CONDITIONAL_L));
693 emit(IF(BRW_PREDICATE_NORMAL));
694 {
695 xfb_program(num_verts);
696 }
697 emit(BRW_OPCODE_ENDIF);
698 }
699 }
700
701 void
702 gen6_gs_visitor::xfb_program(unsigned num_verts)
703 {
704 struct brw_gs_prog_data *prog_data =
705 (struct brw_gs_prog_data *) &c->prog_data;
706 unsigned binding;
707 unsigned num_bindings = prog_data->num_transform_feedback_bindings;
708 src_reg sol_temp(this, glsl_type::uvec4_type);
709
710 /* Check if we can write one primitive more */
711 emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
712 emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
713 emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
714 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
715 emit(IF(BRW_PREDICATE_NORMAL));
716 {
717 if (this->current_vertex >= num_verts)
718 this->current_vertex = 0;
719
720 /* Avoid overwriting MRF 1 as it is used as URB write message header */
721 dst_reg mrf_reg(MRF, 2);
722
723 this->current_annotation = "gen6: emit SOL vertex data";
724 /* For each vertex, generate code to output each varying using the
725 * appropriate binding table entry.
726 */
727 for (binding = 0; binding < num_bindings; ++binding) {
728 /* Set up the correct destination index for this vertex */
729 vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
730 mrf_reg,
731 this->destination_indices);
732 inst->sol_vertex = this->current_vertex;
733
734 unsigned char varying =
735 prog_data->transform_feedback_bindings[binding];
736
737 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
738 *
739 * "Prior to End of Thread with a URB_WRITE, the kernel must
740 * ensure that all writes are complete by sending the final
741 * write as a committed write."
742 */
743 bool final_write = binding == (unsigned) num_bindings - 1 &&
744 this->current_vertex == num_verts - 1;
745
746 /* Compute offset of this varying for the current vertex
747 * in xfb_output
748 */
749 src_reg data(this->xfb_output);
750 data.reladdr = ralloc(mem_ctx, src_reg);
751 memcpy(data.reladdr, &this->xfb_output_offset, sizeof(src_reg));
752 src_reg out_reg;
753 this->current_annotation = output_reg_annotation[varying];
754
755 /* Copy this varying to the appropriate message register */
756 out_reg = src_reg(this, glsl_type::uvec4_type);
757 out_reg.type = output_reg[varying].type;
758
759 data.type = output_reg[varying].type;
760 emit(MOV(dst_reg(out_reg), data));
761
762 /* Write data and send SVB Write */
763 inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, out_reg, sol_temp);
764 inst->sol_binding = binding;
765 inst->sol_final_write = final_write;
766
767 emit(ADD(dst_reg(this->xfb_output_offset),
768 this->xfb_output_offset, 1u));
769
770 if (final_write) {
771 /* This is the last vertex of the primitive, then increment
772 * SO num primitive counter and destination indices.
773 */
774 emit(ADD(dst_reg(this->destination_indices),
775 this->destination_indices,
776 brw_imm_ud(num_verts)));
777 emit(ADD(dst_reg(this->sol_prim_written),
778 this->sol_prim_written, 1u));
779 }
780
781 }
782 this->current_vertex++;
783 this->current_annotation = NULL;
784 }
785 emit(BRW_OPCODE_ENDIF);
786 }
787
788 } /* namespace brw */