i965/nir: Do not scalarize phis in non-scalar setups
[mesa.git] / src / mesa / drivers / dri / i965 / gen6_gs_visitor.cpp
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * This code is based on original work by Ilia Mirkin.
24 */
25
26 /**
27 * \file gen6_gs_visitor.cpp
28 *
29 * Gen6 geometry shader implementation
30 */
31
32 #include "gen6_gs_visitor.h"
33
/* Per-vertex multiplier used to size the attribute map that
 * gen6_gs_visitor::setup_payload() builds.
 */
const unsigned MAX_GS_INPUT_VERTICES = 6u;
35
36 namespace brw {
37
38 void
39 gen6_gs_visitor::assign_binding_table_offsets()
40 {
41 /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
42 * feedback surfaces.
43 */
44 assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
45 }
46
47 void
48 gen6_gs_visitor::emit_prolog()
49 {
50 vec4_gs_visitor::emit_prolog();
51
52 /* Gen6 geometry shaders require to allocate an initial VUE handle via
53 * FF_SYNC message, however the documentation remarks that only one thread
54 * can write to the URB simultaneously and the FF_SYNC message provides the
55 * synchronization mechanism for this, so using this message effectively
56 * stalls the thread until it is its turn to write to the URB. Because of
57 * this, the best way to implement geometry shader algorithms in gen6 is to
58 * execute the algorithm before the FF_SYNC message to maximize parallelism.
59 *
60 * To achieve this we buffer the geometry shader outputs for each emitted
61 * vertex in vertex_output during operation. Then, when we have processed
62 * the last vertex (that is, at thread end time), we send the FF_SYNC
63 * message to allocate the initial VUE handle and write all buffered vertex
64 * data to the URB in one go.
65 *
66 * For each emitted vertex, vertex_output will hold vue_map.num_slots
67 * data items plus one additional item to hold required flags
68 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
69 * which come right after the data items for that vertex. Vertex data and
70 * flags for the next vertex come right after the data items and flags for
71 * the previous vertex.
72 */
73 this->current_annotation = "gen6 prolog";
74 this->vertex_output = src_reg(this,
75 glsl_type::uint_type,
76 (prog_data->vue_map.num_slots + 1) *
77 c->gp->program.VerticesOut);
78 this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
79 emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
80
81 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
82 * so initialize it once to R0.
83 */
84 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
85 retype(brw_vec8_grf(0, 0),
86 BRW_REGISTER_TYPE_UD)));
87 inst->force_writemask_all = true;
88
89 /* This will be used as a temporary to store writeback data of FF_SYNC
90 * and URB_WRITE messages.
91 */
92 this->temp = src_reg(this, glsl_type::uint_type);
93
94 /* This will be used to know when we are processing the first vertex of
95 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
96 * that we are processing the first vertex in the primitive and to zero
97 * otherwise. This way we can use its value directly in the URB write
98 * headers.
99 */
100 this->first_vertex = src_reg(this, glsl_type::uint_type);
101 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
102
103 /* The FF_SYNC message requires to know the number of primitives generated,
104 * so keep a counter for this.
105 */
106 this->prim_count = src_reg(this, glsl_type::uint_type);
107 emit(MOV(dst_reg(this->prim_count), 0u));
108
109 if (c->prog_data.gen6_xfb_enabled) {
110 /* Create a virtual register to hold destination indices in SOL */
111 this->destination_indices = src_reg(this, glsl_type::uvec4_type);
112 /* Create a virtual register to hold number of written primitives */
113 this->sol_prim_written = src_reg(this, glsl_type::uint_type);
114 /* Create a virtual register to hold Streamed Vertex Buffer Indices */
115 this->svbi = src_reg(this, glsl_type::uvec4_type);
116 /* Create a virtual register to hold max values of SVBI */
117 this->max_svbi = src_reg(this, glsl_type::uvec4_type);
118 emit(MOV(dst_reg(this->max_svbi),
119 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
120
121 xfb_setup();
122 }
123
124 /* PrimitveID is delivered in r0.1 of the thread payload. If the program
125 * needs it we have to move it to a separate register where we can map
126 * the atttribute.
127 *
128 * Notice that we cannot use a virtual register for this, because we need to
129 * map all input attributes to hardware registers in setup_payload(),
130 * which happens before virtual registers are mapped to hardware registers.
131 * We could work around that issue if we were able to compute the first
132 * non-payload register here and move the PrimitiveID information to that
133 * register, but we can't because at this point we don't know the final
134 * number uniforms that will be included in the payload.
135 *
136 * So, what we do is to place PrimitiveID information in r1, which is always
137 * delivered as part of the payload, but its only populated with data
138 * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
139 * in the 3DSTATE_GS state packet. That information can be obtained by other
140 * means though, so we can safely use r1 for this purpose.
141 */
142 if (c->prog_data.include_primitive_id) {
143 this->primitive_id =
144 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
145 emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
146 }
147 }
148
149 void
150 gen6_gs_visitor::visit(ir_emit_vertex *ir)
151 {
152 gs_emit_vertex(ir->stream_id());
153 }
154 void
155 gen6_gs_visitor::gs_emit_vertex(int stream_id)
156 {
157 this->current_annotation = "gen6 emit vertex";
158 /* Honor max_vertex layout indication in geometry shader by ignoring any
159 * vertices coming after c->gp->program.VerticesOut.
160 */
161 unsigned num_output_vertices = c->gp->program.VerticesOut;
162 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
163 BRW_CONDITIONAL_L));
164 emit(IF(BRW_PREDICATE_NORMAL));
165 {
166 /* Buffer all output slots for this vertex in vertex_output */
167 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
168 int varying = prog_data->vue_map.slot_to_varying[slot];
169 if (varying != VARYING_SLOT_PSIZ) {
170 dst_reg dst(this->vertex_output);
171 dst.reladdr = ralloc(mem_ctx, src_reg);
172 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
173 emit_urb_slot(dst, varying);
174 } else {
175 /* The PSIZ slot can pack multiple varyings in different channels
176 * and emit_urb_slot() will produce a MOV instruction for each of
177 * them. Since we are writing to an array, that will translate to
178 * possibly multiple MOV instructions with an array destination and
179 * each will generate a scratch write with the same offset into
180 * scratch space (thus, each one overwriting the previous). This is
181 * not what we want. What we will do instead is emit PSIZ to a
182 * a regular temporary register, then move that resgister into the
183 * array. This way we only have one instruction with an array
184 * destination and we only produce a single scratch write.
185 */
186 dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
187 emit_urb_slot(tmp, varying);
188 dst_reg dst(this->vertex_output);
189 dst.reladdr = ralloc(mem_ctx, src_reg);
190 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
191 vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
192 inst->force_writemask_all = true;
193 }
194
195 emit(ADD(dst_reg(this->vertex_output_offset),
196 this->vertex_output_offset, 1u));
197 }
198
199 /* Now buffer flags for this vertex */
200 dst_reg dst(this->vertex_output);
201 dst.reladdr = ralloc(mem_ctx, src_reg);
202 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
203 if (c->gp->program.OutputType == GL_POINTS) {
204 /* If we are outputting points, then every vertex has PrimStart and
205 * PrimEnd set.
206 */
207 emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
208 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
209 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
210 } else {
211 /* Otherwise, we can only set the PrimStart flag, which we have stored
212 * in the first_vertex register. We will have to wait until we execute
213 * EndPrimitive() or we end the thread to set the PrimEnd flag on a
214 * vertex.
215 */
216 emit(OR(dst, this->first_vertex,
217 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
218 emit(MOV(dst_reg(this->first_vertex), 0u));
219 }
220 emit(ADD(dst_reg(this->vertex_output_offset),
221 this->vertex_output_offset, 1u));
222
223 /* Update vertex count */
224 emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
225 }
226 emit(BRW_OPCODE_ENDIF);
227 }
228
229 void
230 gen6_gs_visitor::visit(ir_end_primitive *)
231 {
232 gs_end_primitive();
233 }
234
235 void
236 gen6_gs_visitor::gs_end_primitive()
237 {
238 this->current_annotation = "gen6 end primitive";
239 /* Calling EndPrimitive() is optional for point output. In this case we set
240 * the PrimEnd flag when we process EmitVertex().
241 */
242 if (c->gp->program.OutputType == GL_POINTS)
243 return;
244
245 /* Otherwise we know that the last vertex we have processed was the last
246 * vertex in the primitive and we need to set its PrimEnd flag, so do this
247 * unless we haven't emitted that vertex at all (vertex_count != 0).
248 *
249 * Notice that we have already incremented vertex_count when we processed
250 * the last emit_vertex, so we need to take that into account in the
251 * comparison below (hence the num_output_vertices + 1 in the comparison
252 * below).
253 */
254 unsigned num_output_vertices = c->gp->program.VerticesOut;
255 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
256 BRW_CONDITIONAL_L));
257 vec4_instruction *inst = emit(CMP(dst_null_d(),
258 this->vertex_count, 0u,
259 BRW_CONDITIONAL_NEQ));
260 inst->predicate = BRW_PREDICATE_NORMAL;
261 emit(IF(BRW_PREDICATE_NORMAL));
262 {
263 /* vertex_output_offset is already pointing at the first entry of the
264 * next vertex. So subtract 1 to modify the flags for the previous
265 * vertex.
266 */
267 src_reg offset(this, glsl_type::uint_type);
268 emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
269
270 src_reg dst(this->vertex_output);
271 dst.reladdr = ralloc(mem_ctx, src_reg);
272 memcpy(dst.reladdr, &offset, sizeof(src_reg));
273
274 emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
275 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
276
277 /* Set the first vertex flag to indicate that the next vertex will start
278 * a primitive.
279 */
280 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
281 }
282 emit(BRW_OPCODE_ENDIF);
283 }
284
285 void
286 gen6_gs_visitor::emit_urb_write_header(int mrf)
287 {
288 this->current_annotation = "gen6 urb header";
289 /* Compute offset of the flags for the current vertex in vertex_output and
290 * write them in dw2 of the message header.
291 *
292 * Notice that by the time that emit_thread_end() calls here
293 * vertex_output_offset should point to the first data item of the current
294 * vertex in vertex_output, thus we only need to add the number of output
295 * slots per vertex to that offset to obtain the flags data offset.
296 */
297 src_reg flags_offset(this, glsl_type::uint_type);
298 emit(ADD(dst_reg(flags_offset),
299 this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
300
301 src_reg flags_data(this->vertex_output);
302 flags_data.reladdr = ralloc(mem_ctx, src_reg);
303 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
304
305 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
306 }
307
308 void
309 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
310 int last_mrf, int urb_offset)
311 {
312 vec4_instruction *inst = NULL;
313
314 if (!complete) {
315 /* If the vertex is not complete we don't have to do anything special */
316 inst = emit(GS_OPCODE_URB_WRITE);
317 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
318 } else {
319 /* Otherwise we always request to allocate a new VUE handle. If this is
320 * the last write before the EOT message and the new handle never gets
321 * used it will be dereferenced when we send the EOT message. This is
322 * necessary to avoid different setups for the EOT message (one for the
323 * case when there is no output and another for the case when there is)
324 * which would require to end the program with an IF/ELSE/ENDIF block,
325 * something we do not want.
326 */
327 inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
328 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
329 inst->dst = dst_reg(MRF, base_mrf);
330 inst->src[0] = this->temp;
331 }
332
333 inst->base_mrf = base_mrf;
334 /* URB data written (does not include the message header reg) must
335 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
336 * section 5.4.3.2.2: URB_INTERLEAVED.
337 */
338 int mlen = last_mrf - base_mrf;
339 if ((mlen % 2) != 1)
340 mlen++;
341 inst->mlen = mlen;
342 inst->offset = urb_offset;
343 }
344
345 void
346 gen6_gs_visitor::emit_thread_end()
347 {
348 /* Make sure the current primitive is ended: we know it is not ended when
349 * first_vertex is not zero. This is only relevant for outputs other than
350 * points because in the point case we set PrimEnd on all vertices.
351 */
352 if (c->gp->program.OutputType != GL_POINTS) {
353 emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
354 emit(IF(BRW_PREDICATE_NORMAL));
355 {
356 visit((ir_end_primitive *) NULL);
357 }
358 emit(BRW_OPCODE_ENDIF);
359 }
360
361 /* Here we have to:
362 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
363 * 2) Loop over all buffered vertex data and write it to corresponding
364 * URB entries.
365 * 3) Allocate new VUE handles for all vertices other than the first.
366 * 4) Send a final EOT message.
367 */
368
369 /* MRF 0 is reserved for the debugger, so start with message header
370 * in MRF 1.
371 */
372 int base_mrf = 1;
373
374 /* In the process of generating our URB write message contents, we
375 * may need to unspill a register or load from an array. Those
376 * reads would use MRFs 14-15.
377 */
378 int max_usable_mrf = 13;
379
380 /* Issue the FF_SYNC message and obtain the initial VUE handle. */
381 emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
382 emit(IF(BRW_PREDICATE_NORMAL));
383 {
384 this->current_annotation = "gen6 thread end: ff_sync";
385
386 vec4_instruction *inst;
387 if (c->prog_data.gen6_xfb_enabled) {
388 src_reg sol_temp(this, glsl_type::uvec4_type);
389 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
390 dst_reg(this->svbi),
391 this->vertex_count,
392 this->prim_count,
393 sol_temp);
394 inst = emit(GS_OPCODE_FF_SYNC,
395 dst_reg(this->temp), this->prim_count, this->svbi);
396 } else {
397 inst = emit(GS_OPCODE_FF_SYNC,
398 dst_reg(this->temp), this->prim_count, src_reg(0u));
399 }
400 inst->base_mrf = base_mrf;
401
402 /* Loop over all buffered vertices and emit URB write messages */
403 this->current_annotation = "gen6 thread end: urb writes init";
404 src_reg vertex(this, glsl_type::uint_type);
405 emit(MOV(dst_reg(vertex), 0u));
406 emit(MOV(dst_reg(this->vertex_output_offset), 0u));
407
408 this->current_annotation = "gen6 thread end: urb writes";
409 emit(BRW_OPCODE_DO);
410 {
411 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
412 inst = emit(BRW_OPCODE_BREAK);
413 inst->predicate = BRW_PREDICATE_NORMAL;
414
415 /* First we prepare the message header */
416 emit_urb_write_header(base_mrf);
417
418 /* Then add vertex data to the message in interleaved fashion */
419 int slot = 0;
420 bool complete = false;
421 do {
422 int mrf = base_mrf + 1;
423
424 /* URB offset is in URB row increments, and each of our MRFs is half
425 * of one of those, since we're doing interleaved writes.
426 */
427 int urb_offset = slot / 2;
428
429 for (; slot < prog_data->vue_map.num_slots; ++slot) {
430 int varying = prog_data->vue_map.slot_to_varying[slot];
431 current_annotation = output_reg_annotation[varying];
432
433 /* Compute offset of this slot for the current vertex
434 * in vertex_output
435 */
436 src_reg data(this->vertex_output);
437 data.reladdr = ralloc(mem_ctx, src_reg);
438 memcpy(data.reladdr, &this->vertex_output_offset,
439 sizeof(src_reg));
440
441 /* Copy this slot to the appropriate message register */
442 dst_reg reg = dst_reg(MRF, mrf);
443 reg.type = output_reg[varying].type;
444 data.type = reg.type;
445 vec4_instruction *inst = emit(MOV(reg, data));
446 inst->force_writemask_all = true;
447
448 mrf++;
449 emit(ADD(dst_reg(this->vertex_output_offset),
450 this->vertex_output_offset, 1u));
451
452 /* If this was max_usable_mrf, we can't fit anything more into
453 * this URB WRITE.
454 */
455 if (mrf > max_usable_mrf) {
456 slot++;
457 break;
458 }
459 }
460
461 complete = slot >= prog_data->vue_map.num_slots;
462 emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
463 } while (!complete);
464
465 /* Skip over the flags data item so that vertex_output_offset points
466 * to the first data item of the next vertex, so that we can start
467 * writing the next vertex.
468 */
469 emit(ADD(dst_reg(this->vertex_output_offset),
470 this->vertex_output_offset, 1u));
471
472 emit(ADD(dst_reg(vertex), vertex, 1u));
473 }
474 emit(BRW_OPCODE_WHILE);
475
476 if (c->prog_data.gen6_xfb_enabled)
477 xfb_write();
478 }
479 emit(BRW_OPCODE_ENDIF);
480
481 /* Finally, emit EOT message.
482 *
483 * In gen6 we need to end the thread differently depending on whether we have
484 * emitted at least one vertex or not. In case we did, the EOT message must
485 * always include the COMPLETE flag or else the GPU hangs. If we have not
486 * produced any output we can't use the COMPLETE flag.
487 *
488 * However, this would lead us to end the program with an ENDIF opcode,
489 * which we want to avoid, so what we do is that we always request a new
490 * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
491 * With this we make sure that whether we have emitted at least one vertex
492 * or none at all, we have to finish the thread without writing to the URB,
493 * which works for both cases by setting the COMPLETE and UNUSED flags in
494 * the EOT message.
495 */
496 this->current_annotation = "gen6 thread end: EOT";
497
498 if (c->prog_data.gen6_xfb_enabled) {
499 /* When emitting EOT, set SONumPrimsWritten Increment Value. */
500 src_reg data(this, glsl_type::uint_type);
501 emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
502 emit(SHL(dst_reg(data), data, src_reg(16u)));
503 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
504 }
505
506 vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
507 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
508 inst->base_mrf = base_mrf;
509 inst->mlen = 1;
510 }
511
512 void
513 gen6_gs_visitor::setup_payload()
514 {
515 int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
516
517 /* Attributes are going to be interleaved, so one register contains two
518 * attribute slots.
519 */
520 int attributes_per_reg = 2;
521
522 /* If a geometry shader tries to read from an input that wasn't written by
523 * the vertex shader, that produces undefined results, but it shouldn't
524 * crash anything. So initialize attribute_map to zeros--that ensures that
525 * these undefined results are read from r0.
526 */
527 memset(attribute_map, 0, sizeof(attribute_map));
528
529 int reg = 0;
530
531 /* The payload always contains important data in r0. */
532 reg++;
533
534 /* r1 is always part of the payload and it holds information relevant
535 * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
536 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
537 * information (and move the original value to a virtual register if
538 * necessary).
539 */
540 if (c->prog_data.include_primitive_id)
541 attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
542 reg++;
543
544 reg = setup_uniforms(reg);
545
546 reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
547
548 lower_attributes_to_hw_regs(attribute_map, true);
549
550 this->first_non_payload_grf = reg;
551 }
552
553 void
554 gen6_gs_visitor::xfb_setup()
555 {
556 static const unsigned swizzle_for_offset[4] = {
557 BRW_SWIZZLE4(0, 1, 2, 3),
558 BRW_SWIZZLE4(1, 2, 3, 3),
559 BRW_SWIZZLE4(2, 3, 3, 3),
560 BRW_SWIZZLE4(3, 3, 3, 3)
561 };
562
563 struct brw_gs_prog_data *prog_data =
564 (struct brw_gs_prog_data *) &c->prog_data;
565
566 const struct gl_transform_feedback_info *linked_xfb_info =
567 &this->shader_prog->LinkedTransformFeedback;
568 int i;
569
570 /* Make sure that the VUE slots won't overflow the unsigned chars in
571 * prog_data->transform_feedback_bindings[].
572 */
573 STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
574
575 /* Make sure that we don't need more binding table entries than we've
576 * set aside for use in transform feedback. (We shouldn't, since we
577 * set aside enough binding table entries to have one per component).
578 */
579 assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
580
581 prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
582 for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
583 prog_data->transform_feedback_bindings[i] =
584 linked_xfb_info->Outputs[i].OutputRegister;
585 prog_data->transform_feedback_swizzles[i] =
586 swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
587 }
588 }
589
590 void
591 gen6_gs_visitor::xfb_write()
592 {
593 unsigned num_verts;
594 struct brw_gs_prog_data *prog_data =
595 (struct brw_gs_prog_data *) &c->prog_data;
596
597 if (!prog_data->num_transform_feedback_bindings)
598 return;
599
600 switch (c->prog_data.output_topology) {
601 case _3DPRIM_POINTLIST:
602 num_verts = 1;
603 break;
604 case _3DPRIM_LINELIST:
605 case _3DPRIM_LINESTRIP:
606 case _3DPRIM_LINELOOP:
607 num_verts = 2;
608 break;
609 case _3DPRIM_TRILIST:
610 case _3DPRIM_TRIFAN:
611 case _3DPRIM_TRISTRIP:
612 case _3DPRIM_RECTLIST:
613 num_verts = 3;
614 break;
615 case _3DPRIM_QUADLIST:
616 case _3DPRIM_QUADSTRIP:
617 case _3DPRIM_POLYGON:
618 num_verts = 3;
619 break;
620 default:
621 unreachable("Unexpected primitive type in Gen6 SOL program.");
622 }
623
624 this->current_annotation = "gen6 thread end: svb writes init";
625
626 emit(MOV(dst_reg(this->vertex_output_offset), 0u));
627 emit(MOV(dst_reg(this->sol_prim_written), 0u));
628
629 /* Check that at least one primitive can be written
630 *
631 * Note: since we use the binding table to keep track of buffer offsets
632 * and stride, the GS doesn't need to keep track of a separate pointer
633 * into each buffer; it uses a single pointer which increments by 1 for
634 * each vertex. So we use SVBI0 for this pointer, regardless of whether
635 * transform feedback is in interleaved or separate attribs mode.
636 */
637 src_reg sol_temp(this, glsl_type::uvec4_type);
638 emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
639
640 /* Compare SVBI calculated number with the maximum value, which is
641 * in R1.4 (previously saved in this->max_svbi) for gen6.
642 */
643 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
644 emit(IF(BRW_PREDICATE_NORMAL));
645 {
646 src_reg destination_indices_uw =
647 retype(destination_indices, BRW_REGISTER_TYPE_UW);
648
649 vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
650 brw_imm_v(0x00020100))); /* (0, 1, 2) */
651 inst->force_writemask_all = true;
652
653 emit(ADD(dst_reg(this->destination_indices),
654 this->destination_indices,
655 this->svbi));
656 }
657 emit(BRW_OPCODE_ENDIF);
658
659 /* Write transform feedback data for all processed vertices. */
660 for (int i = 0; i < c->gp->program.VerticesOut; i++) {
661 emit(MOV(dst_reg(sol_temp), i));
662 emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
663 BRW_CONDITIONAL_L));
664 emit(IF(BRW_PREDICATE_NORMAL));
665 {
666 xfb_program(i, num_verts);
667 }
668 emit(BRW_OPCODE_ENDIF);
669 }
670 }
671
672 void
673 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
674 {
675 struct brw_gs_prog_data *prog_data =
676 (struct brw_gs_prog_data *) &c->prog_data;
677 unsigned binding;
678 unsigned num_bindings = prog_data->num_transform_feedback_bindings;
679 src_reg sol_temp(this, glsl_type::uvec4_type);
680
681 /* Check for buffer overflow: we need room to write the complete primitive
682 * (all vertices). Otherwise, avoid writing any vertices for it
683 */
684 emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
685 emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
686 emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
687 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
688 emit(IF(BRW_PREDICATE_NORMAL));
689 {
690 /* Avoid overwriting MRF 1 as it is used as URB write message header */
691 dst_reg mrf_reg(MRF, 2);
692
693 this->current_annotation = "gen6: emit SOL vertex data";
694 /* For each vertex, generate code to output each varying using the
695 * appropriate binding table entry.
696 */
697 for (binding = 0; binding < num_bindings; ++binding) {
698 unsigned char varying =
699 prog_data->transform_feedback_bindings[binding];
700
701 /* Set up the correct destination index for this vertex */
702 vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
703 mrf_reg,
704 this->destination_indices);
705 inst->sol_vertex = vertex % num_verts;
706
707 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
708 *
709 * "Prior to End of Thread with a URB_WRITE, the kernel must
710 * ensure that all writes are complete by sending the final
711 * write as a committed write."
712 */
713 bool final_write = binding == (unsigned) num_bindings - 1 &&
714 inst->sol_vertex == num_verts - 1;
715
716 /* Compute offset of this varying for the current vertex
717 * in vertex_output
718 */
719 this->current_annotation = output_reg_annotation[varying];
720 src_reg data(this->vertex_output);
721 data.reladdr = ralloc(mem_ctx, src_reg);
722 int offset = get_vertex_output_offset_for_varying(vertex, varying);
723 emit(MOV(dst_reg(this->vertex_output_offset), offset));
724 memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
725 data.type = output_reg[varying].type;
726
727 /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
728 * same slot, so make sure we write the appropriate channel
729 */
730 if (varying == VARYING_SLOT_PSIZ)
731 data.swizzle = BRW_SWIZZLE_WWWW;
732 else if (varying == VARYING_SLOT_LAYER)
733 data.swizzle = BRW_SWIZZLE_YYYY;
734 else if (varying == VARYING_SLOT_VIEWPORT)
735 data.swizzle = BRW_SWIZZLE_ZZZZ;
736 else
737 data.swizzle = prog_data->transform_feedback_swizzles[binding];
738
739 /* Write data */
740 inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
741 inst->sol_binding = binding;
742 inst->sol_final_write = final_write;
743
744 if (final_write) {
745 /* This is the last vertex of the primitive, then increment
746 * SO num primitive counter and destination indices.
747 */
748 emit(ADD(dst_reg(this->destination_indices),
749 this->destination_indices,
750 src_reg(num_verts)));
751 emit(ADD(dst_reg(this->sol_prim_written),
752 this->sol_prim_written, 1u));
753 }
754
755 }
756 this->current_annotation = NULL;
757 }
758 emit(BRW_OPCODE_ENDIF);
759 }
760
761 int
762 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
763 {
764 /* Find the output slot assigned to this varying.
765 *
766 * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
767 * as VARYING_SLOT_PSIZ.
768 */
769 if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
770 varying = VARYING_SLOT_PSIZ;
771 int slot = prog_data->vue_map.varying_to_slot[varying];
772
773 if (slot < 0) {
774 /* This varying does not exist in the VUE so we are not writing to it
775 * and its value is undefined. We still want to return a valid offset
776 * into vertex_output though, to prevent any out-of-bound accesses into
777 * the vertex_output array. Since the value for this varying is undefined
778 * we don't really care for the value we assign to it, so any offset
779 * within the limits of vertex_output will do.
780 */
781 slot = 0;
782 }
783
784 return vertex * (prog_data->vue_map.num_slots + 1) + slot;
785 }
786
787 } /* namespace brw */