i965 gen6: Implement rasterizer discard.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 GLuint msg_reg_nr)
65 {
66 struct intel_context *intel = &p->brw->intel;
67 if (intel->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 struct intel_context *intel = &p->brw->intel;
88 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
89 reg->file = BRW_GENERAL_REGISTER_FILE;
90 reg->nr += 111;
91 }
92 }
93
94
95 void
96 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
97 struct brw_reg dest)
98 {
99 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
100 dest.file != BRW_MESSAGE_REGISTER_FILE)
101 assert(dest.nr < 128);
102
103 gen7_convert_mrf_to_grf(p, &dest);
104
105 insn->bits1.da1.dest_reg_file = dest.file;
106 insn->bits1.da1.dest_reg_type = dest.type;
107 insn->bits1.da1.dest_address_mode = dest.address_mode;
108
109 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
110 insn->bits1.da1.dest_reg_nr = dest.nr;
111
112 if (insn->header.access_mode == BRW_ALIGN_1) {
113 insn->bits1.da1.dest_subreg_nr = dest.subnr;
114 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
115 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
116 insn->bits1.da1.dest_horiz_stride = dest.hstride;
117 }
118 else {
119 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
120 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
121 /* even ignored in da16, still need to set as '01' */
122 insn->bits1.da16.dest_horiz_stride = 1;
123 }
124 }
125 else {
126 insn->bits1.ia1.dest_subreg_nr = dest.subnr;
127
128 /* These are different sizes in align1 vs align16:
129 */
130 if (insn->header.access_mode == BRW_ALIGN_1) {
131 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
132 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
133 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
134 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
135 }
136 else {
137 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
138 /* even ignored in da16, still need to set as '01' */
139 insn->bits1.ia16.dest_horiz_stride = 1;
140 }
141 }
142
143 /* NEW: Set the execution size based on dest.width and
144 * insn->compression_control:
145 */
146 guess_execution_size(p, insn, dest);
147 }
148
149 extern int reg_type_size[];
150
151 static void
152 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
153 {
154 int hstride_for_reg[] = {0, 1, 2, 4};
155 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
156 int width_for_reg[] = {1, 2, 4, 8, 16};
157 int execsize_for_reg[] = {1, 2, 4, 8, 16};
158 int width, hstride, vstride, execsize;
159
160 if (reg.file == BRW_IMMEDIATE_VALUE) {
161 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
162 * mean the destination has to be 128-bit aligned and the
163 * destination horiz stride has to be a word.
164 */
165 if (reg.type == BRW_REGISTER_TYPE_V) {
166 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
167 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
168 }
169
170 return;
171 }
172
173 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
174 reg.file == BRW_ARF_NULL)
175 return;
176
177 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
178 hstride = hstride_for_reg[reg.hstride];
179
180 if (reg.vstride == 0xf) {
181 vstride = -1;
182 } else {
183 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
184 vstride = vstride_for_reg[reg.vstride];
185 }
186
187 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
188 width = width_for_reg[reg.width];
189
190 assert(insn->header.execution_size >= 0 &&
191 insn->header.execution_size < Elements(execsize_for_reg));
192 execsize = execsize_for_reg[insn->header.execution_size];
193
194 /* Restrictions from 3.3.10: Register Region Restrictions. */
195 /* 3. */
196 assert(execsize >= width);
197
198 /* 4. */
199 if (execsize == width && hstride != 0) {
200 assert(vstride == -1 || vstride == width * hstride);
201 }
202
203 /* 5. */
204 if (execsize == width && hstride == 0) {
205 /* no restriction on vstride. */
206 }
207
208 /* 6. */
209 if (width == 1) {
210 assert(hstride == 0);
211 }
212
213 /* 7. */
214 if (execsize == 1 && width == 1) {
215 assert(hstride == 0);
216 assert(vstride == 0);
217 }
218
219 /* 8. */
220 if (vstride == 0 && hstride == 0) {
221 assert(width == 1);
222 }
223
224 /* 10. Check destination issues. */
225 }
226
227 void
228 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
229 struct brw_reg reg)
230 {
231 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
232 assert(reg.nr < 128);
233
234 gen7_convert_mrf_to_grf(p, &reg);
235
236 validate_reg(insn, reg);
237
238 insn->bits1.da1.src0_reg_file = reg.file;
239 insn->bits1.da1.src0_reg_type = reg.type;
240 insn->bits2.da1.src0_abs = reg.abs;
241 insn->bits2.da1.src0_negate = reg.negate;
242 insn->bits2.da1.src0_address_mode = reg.address_mode;
243
244 if (reg.file == BRW_IMMEDIATE_VALUE) {
245 insn->bits3.ud = reg.dw1.ud;
246
247 /* Required to set some fields in src1 as well:
248 */
249 insn->bits1.da1.src1_reg_file = 0; /* arf */
250 insn->bits1.da1.src1_reg_type = reg.type;
251 }
252 else
253 {
254 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
255 if (insn->header.access_mode == BRW_ALIGN_1) {
256 insn->bits2.da1.src0_subreg_nr = reg.subnr;
257 insn->bits2.da1.src0_reg_nr = reg.nr;
258 }
259 else {
260 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
261 insn->bits2.da16.src0_reg_nr = reg.nr;
262 }
263 }
264 else {
265 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
266
267 if (insn->header.access_mode == BRW_ALIGN_1) {
268 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
269 }
270 else {
271 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
272 }
273 }
274
275 if (insn->header.access_mode == BRW_ALIGN_1) {
276 if (reg.width == BRW_WIDTH_1 &&
277 insn->header.execution_size == BRW_EXECUTE_1) {
278 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
279 insn->bits2.da1.src0_width = BRW_WIDTH_1;
280 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
281 }
282 else {
283 insn->bits2.da1.src0_horiz_stride = reg.hstride;
284 insn->bits2.da1.src0_width = reg.width;
285 insn->bits2.da1.src0_vert_stride = reg.vstride;
286 }
287 }
288 else {
289 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
294 /* This is an oddity of the fact we're using the same
295 * descriptions for registers in align_16 as align_1:
296 */
297 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299 else
300 insn->bits2.da16.src0_vert_stride = reg.vstride;
301 }
302 }
303 }
304
305
306 void brw_set_src1(struct brw_compile *p,
307 struct brw_instruction *insn,
308 struct brw_reg reg)
309 {
310 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
311
312 assert(reg.nr < 128);
313
314 gen7_convert_mrf_to_grf(p, &reg);
315
316 validate_reg(insn, reg);
317
318 insn->bits1.da1.src1_reg_file = reg.file;
319 insn->bits1.da1.src1_reg_type = reg.type;
320 insn->bits3.da1.src1_abs = reg.abs;
321 insn->bits3.da1.src1_negate = reg.negate;
322
323 /* Only src1 can be immediate in two-argument instructions.
324 */
325 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
326
327 if (reg.file == BRW_IMMEDIATE_VALUE) {
328 insn->bits3.ud = reg.dw1.ud;
329 }
330 else {
331 /* This is a hardware restriction, which may or may not be lifted
332 * in the future:
333 */
334 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
335 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
336
337 if (insn->header.access_mode == BRW_ALIGN_1) {
338 insn->bits3.da1.src1_subreg_nr = reg.subnr;
339 insn->bits3.da1.src1_reg_nr = reg.nr;
340 }
341 else {
342 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
343 insn->bits3.da16.src1_reg_nr = reg.nr;
344 }
345
346 if (insn->header.access_mode == BRW_ALIGN_1) {
347 if (reg.width == BRW_WIDTH_1 &&
348 insn->header.execution_size == BRW_EXECUTE_1) {
349 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
350 insn->bits3.da1.src1_width = BRW_WIDTH_1;
351 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
352 }
353 else {
354 insn->bits3.da1.src1_horiz_stride = reg.hstride;
355 insn->bits3.da1.src1_width = reg.width;
356 insn->bits3.da1.src1_vert_stride = reg.vstride;
357 }
358 }
359 else {
360 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
361 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
362 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
363 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
364
365 /* This is an oddity of the fact we're using the same
366 * descriptions for registers in align_16 as align_1:
367 */
368 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
369 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
370 else
371 insn->bits3.da16.src1_vert_stride = reg.vstride;
372 }
373 }
374 }
375
376 /**
377 * Set the Message Descriptor and Extended Message Descriptor fields
378 * for SEND messages.
379 *
380 * \note This zeroes out the Function Control bits, so it must be called
381 * \b before filling out any message-specific data. Callers can
382 * choose not to fill in irrelevant bits; they will be zero.
383 */
384 static void
385 brw_set_message_descriptor(struct brw_compile *p,
386 struct brw_instruction *inst,
387 enum brw_message_target sfid,
388 unsigned msg_length,
389 unsigned response_length,
390 bool header_present,
391 bool end_of_thread)
392 {
393 struct intel_context *intel = &p->brw->intel;
394
395 brw_set_src1(p, inst, brw_imm_d(0));
396
397 if (intel->gen >= 5) {
398 inst->bits3.generic_gen5.header_present = header_present;
399 inst->bits3.generic_gen5.response_length = response_length;
400 inst->bits3.generic_gen5.msg_length = msg_length;
401 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
402
403 if (intel->gen >= 6) {
404 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
405 inst->header.destreg__conditionalmod = sfid;
406 } else {
407 /* Set Extended Message Descriptor (ex_desc) */
408 inst->bits2.send_gen5.sfid = sfid;
409 inst->bits2.send_gen5.end_of_thread = end_of_thread;
410 }
411 } else {
412 inst->bits3.generic.response_length = response_length;
413 inst->bits3.generic.msg_length = msg_length;
414 inst->bits3.generic.msg_target = sfid;
415 inst->bits3.generic.end_of_thread = end_of_thread;
416 }
417 }
418
419 static void brw_set_math_message( struct brw_compile *p,
420 struct brw_instruction *insn,
421 GLuint function,
422 GLuint integer_type,
423 bool low_precision,
424 bool saturate,
425 GLuint dataType )
426 {
427 struct brw_context *brw = p->brw;
428 struct intel_context *intel = &brw->intel;
429 unsigned msg_length;
430 unsigned response_length;
431
432 /* Infer message length from the function */
433 switch (function) {
434 case BRW_MATH_FUNCTION_POW:
435 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
436 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
437 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
438 msg_length = 2;
439 break;
440 default:
441 msg_length = 1;
442 break;
443 }
444
445 /* Infer response length from the function */
446 switch (function) {
447 case BRW_MATH_FUNCTION_SINCOS:
448 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
449 response_length = 2;
450 break;
451 default:
452 response_length = 1;
453 break;
454 }
455
456 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
457 msg_length, response_length, false, false);
458 if (intel->gen == 5) {
459 insn->bits3.math_gen5.function = function;
460 insn->bits3.math_gen5.int_type = integer_type;
461 insn->bits3.math_gen5.precision = low_precision;
462 insn->bits3.math_gen5.saturate = saturate;
463 insn->bits3.math_gen5.data_type = dataType;
464 insn->bits3.math_gen5.snapshot = 0;
465 } else {
466 insn->bits3.math.function = function;
467 insn->bits3.math.int_type = integer_type;
468 insn->bits3.math.precision = low_precision;
469 insn->bits3.math.saturate = saturate;
470 insn->bits3.math.data_type = dataType;
471 }
472 }
473
474
475 static void brw_set_ff_sync_message(struct brw_compile *p,
476 struct brw_instruction *insn,
477 bool allocate,
478 GLuint response_length,
479 bool end_of_thread)
480 {
481 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
482 1, response_length, true, end_of_thread);
483 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
484 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
485 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
486 insn->bits3.urb_gen5.allocate = allocate;
487 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
488 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
489 }
490
491 static void brw_set_urb_message( struct brw_compile *p,
492 struct brw_instruction *insn,
493 bool allocate,
494 bool used,
495 GLuint msg_length,
496 GLuint response_length,
497 bool end_of_thread,
498 bool complete,
499 GLuint offset,
500 GLuint swizzle_control )
501 {
502 struct brw_context *brw = p->brw;
503 struct intel_context *intel = &brw->intel;
504
505 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
506 msg_length, response_length, true, end_of_thread);
507 if (intel->gen == 7) {
508 insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
509 insn->bits3.urb_gen7.offset = offset;
510 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
511 insn->bits3.urb_gen7.swizzle_control = swizzle_control;
512 /* per_slot_offset = 0 makes it ignore offsets in message header */
513 insn->bits3.urb_gen7.per_slot_offset = 0;
514 insn->bits3.urb_gen7.complete = complete;
515 } else if (intel->gen >= 5) {
516 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
517 insn->bits3.urb_gen5.offset = offset;
518 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
519 insn->bits3.urb_gen5.allocate = allocate;
520 insn->bits3.urb_gen5.used = used; /* ? */
521 insn->bits3.urb_gen5.complete = complete;
522 } else {
523 insn->bits3.urb.opcode = 0; /* ? */
524 insn->bits3.urb.offset = offset;
525 insn->bits3.urb.swizzle_control = swizzle_control;
526 insn->bits3.urb.allocate = allocate;
527 insn->bits3.urb.used = used; /* ? */
528 insn->bits3.urb.complete = complete;
529 }
530 }
531
532 void
533 brw_set_dp_write_message(struct brw_compile *p,
534 struct brw_instruction *insn,
535 GLuint binding_table_index,
536 GLuint msg_control,
537 GLuint msg_type,
538 GLuint msg_length,
539 bool header_present,
540 GLuint last_render_target,
541 GLuint response_length,
542 GLuint end_of_thread,
543 GLuint send_commit_msg)
544 {
545 struct brw_context *brw = p->brw;
546 struct intel_context *intel = &brw->intel;
547 unsigned sfid;
548
549 if (intel->gen >= 7) {
550 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
551 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
552 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
553 else
554 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
555 } else if (intel->gen == 6) {
556 /* Use the render cache for all write messages. */
557 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
558 } else {
559 sfid = BRW_SFID_DATAPORT_WRITE;
560 }
561
562 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
563 header_present, end_of_thread);
564
565 if (intel->gen >= 7) {
566 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
567 insn->bits3.gen7_dp.msg_control = msg_control;
568 insn->bits3.gen7_dp.last_render_target = last_render_target;
569 insn->bits3.gen7_dp.msg_type = msg_type;
570 } else if (intel->gen == 6) {
571 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
572 insn->bits3.gen6_dp.msg_control = msg_control;
573 insn->bits3.gen6_dp.last_render_target = last_render_target;
574 insn->bits3.gen6_dp.msg_type = msg_type;
575 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
576 } else if (intel->gen == 5) {
577 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
578 insn->bits3.dp_write_gen5.msg_control = msg_control;
579 insn->bits3.dp_write_gen5.last_render_target = last_render_target;
580 insn->bits3.dp_write_gen5.msg_type = msg_type;
581 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
582 } else {
583 insn->bits3.dp_write.binding_table_index = binding_table_index;
584 insn->bits3.dp_write.msg_control = msg_control;
585 insn->bits3.dp_write.last_render_target = last_render_target;
586 insn->bits3.dp_write.msg_type = msg_type;
587 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
588 }
589 }
590
591 void
592 brw_set_dp_read_message(struct brw_compile *p,
593 struct brw_instruction *insn,
594 GLuint binding_table_index,
595 GLuint msg_control,
596 GLuint msg_type,
597 GLuint target_cache,
598 GLuint msg_length,
599 GLuint response_length)
600 {
601 struct brw_context *brw = p->brw;
602 struct intel_context *intel = &brw->intel;
603 unsigned sfid;
604
605 if (intel->gen >= 7) {
606 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
607 } else if (intel->gen == 6) {
608 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
609 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
610 else
611 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
612 } else {
613 sfid = BRW_SFID_DATAPORT_READ;
614 }
615
616 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
617 true, false);
618
619 if (intel->gen >= 7) {
620 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
621 insn->bits3.gen7_dp.msg_control = msg_control;
622 insn->bits3.gen7_dp.last_render_target = 0;
623 insn->bits3.gen7_dp.msg_type = msg_type;
624 } else if (intel->gen == 6) {
625 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
626 insn->bits3.gen6_dp.msg_control = msg_control;
627 insn->bits3.gen6_dp.last_render_target = 0;
628 insn->bits3.gen6_dp.msg_type = msg_type;
629 insn->bits3.gen6_dp.send_commit_msg = 0;
630 } else if (intel->gen == 5) {
631 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
632 insn->bits3.dp_read_gen5.msg_control = msg_control;
633 insn->bits3.dp_read_gen5.msg_type = msg_type;
634 insn->bits3.dp_read_gen5.target_cache = target_cache;
635 } else if (intel->is_g4x) {
636 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
637 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
638 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
639 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
640 } else {
641 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
642 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
643 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
644 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
645 }
646 }
647
648 static void brw_set_sampler_message(struct brw_compile *p,
649 struct brw_instruction *insn,
650 GLuint binding_table_index,
651 GLuint sampler,
652 GLuint msg_type,
653 GLuint response_length,
654 GLuint msg_length,
655 GLuint header_present,
656 GLuint simd_mode,
657 GLuint return_format)
658 {
659 struct brw_context *brw = p->brw;
660 struct intel_context *intel = &brw->intel;
661
662 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
663 response_length, header_present, false);
664
665 if (intel->gen >= 7) {
666 insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
667 insn->bits3.sampler_gen7.sampler = sampler;
668 insn->bits3.sampler_gen7.msg_type = msg_type;
669 insn->bits3.sampler_gen7.simd_mode = simd_mode;
670 } else if (intel->gen >= 5) {
671 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
672 insn->bits3.sampler_gen5.sampler = sampler;
673 insn->bits3.sampler_gen5.msg_type = msg_type;
674 insn->bits3.sampler_gen5.simd_mode = simd_mode;
675 } else if (intel->is_g4x) {
676 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
677 insn->bits3.sampler_g4x.sampler = sampler;
678 insn->bits3.sampler_g4x.msg_type = msg_type;
679 } else {
680 insn->bits3.sampler.binding_table_index = binding_table_index;
681 insn->bits3.sampler.sampler = sampler;
682 insn->bits3.sampler.msg_type = msg_type;
683 insn->bits3.sampler.return_format = return_format;
684 }
685 }
686
687
688 #define next_insn brw_next_insn
689 struct brw_instruction *
690 brw_next_insn(struct brw_compile *p, GLuint opcode)
691 {
692 struct brw_instruction *insn;
693
694 assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
695
696 insn = &p->store[p->nr_insn++];
697 memcpy(insn, p->current, sizeof(*insn));
698
699 /* Reset this one-shot flag:
700 */
701
702 if (p->current->header.destreg__conditionalmod) {
703 p->current->header.destreg__conditionalmod = 0;
704 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
705 }
706
707 insn->header.opcode = opcode;
708 return insn;
709 }
710
711 static struct brw_instruction *brw_alu1( struct brw_compile *p,
712 GLuint opcode,
713 struct brw_reg dest,
714 struct brw_reg src )
715 {
716 struct brw_instruction *insn = next_insn(p, opcode);
717 brw_set_dest(p, insn, dest);
718 brw_set_src0(p, insn, src);
719 return insn;
720 }
721
722 static struct brw_instruction *brw_alu2(struct brw_compile *p,
723 GLuint opcode,
724 struct brw_reg dest,
725 struct brw_reg src0,
726 struct brw_reg src1 )
727 {
728 struct brw_instruction *insn = next_insn(p, opcode);
729 brw_set_dest(p, insn, dest);
730 brw_set_src0(p, insn, src0);
731 brw_set_src1(p, insn, src1);
732 return insn;
733 }
734
735
736 /***********************************************************************
737 * Convenience routines.
738 */
/* Template: defines brw_<op>(p, dest, src0), a one-source emitter that
 * forwards to brw_alu1() with BRW_OPCODE_<op>.
 */
#define ALU1(op)                                            \
struct brw_instruction *                                    \
brw_##op(struct brw_compile *p,                             \
         struct brw_reg dest,                               \
         struct brw_reg src0)                               \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##op, dest, src0);         \
}
746
/* Template: defines brw_<op>(p, dest, src0, src1), a two-source emitter
 * that forwards to brw_alu2() with BRW_OPCODE_<op>.
 */
#define ALU2(op)                                            \
struct brw_instruction *                                    \
brw_##op(struct brw_compile *p,                             \
         struct brw_reg dest,                               \
         struct brw_reg src0,                               \
         struct brw_reg src1)                               \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##op, dest, src0, src1);   \
}
755
/* Template: defines brw_<op>(p, dest, src) for a rounding opcode.
 *
 * Rounding operations (other than RNDD) require two instructions - the
 * first stores a rounded value (possibly the wrong way) in the dest
 * register, but also sets a per-channel "increment bit" in the flag
 * register.  A predicated add of 1.0 fixes dest to contain the desired
 * result.
 *
 * Sandybridge and later appear to round correctly without an ADD, so the
 * fix-up is only emitted before gen6.
 */
#define ROUND(op)                                                       \
void brw_##op(struct brw_compile *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   struct brw_instruction *round_inst = next_insn(p, BRW_OPCODE_##op);  \
                                                                        \
   brw_set_dest(p, round_inst, dest);                                   \
   brw_set_src0(p, round_inst, src);                                    \
                                                                        \
   if (p->brw->intel.gen < 6) {                                         \
      struct brw_instruction *fixup;                                    \
                                                                        \
      /* turn on round-increments */                                    \
      round_inst->header.destreg__conditionalmod = BRW_CONDITIONAL_R;   \
      fixup = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                  \
      fixup->header.predicate_control = BRW_PREDICATE_NORMAL;           \
   }                                                                    \
}
780
781
/* Emitters generated from the ALU1/ALU2 templates: each line defines a
 * brw_<OP>() function that emits BRW_OPCODE_<OP> with one (ALU1) or two
 * (ALU2) source operands.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)


/* Rounding emitters generated from the ROUND template; before gen6 each
 * also emits a predicated ADD of 1.0 as a fix-up.
 */
ROUND(RNDZ)
ROUND(RNDE)
808
809
810 struct brw_instruction *brw_ADD(struct brw_compile *p,
811 struct brw_reg dest,
812 struct brw_reg src0,
813 struct brw_reg src1)
814 {
815 /* 6.2.2: add */
816 if (src0.type == BRW_REGISTER_TYPE_F ||
817 (src0.file == BRW_IMMEDIATE_VALUE &&
818 src0.type == BRW_REGISTER_TYPE_VF)) {
819 assert(src1.type != BRW_REGISTER_TYPE_UD);
820 assert(src1.type != BRW_REGISTER_TYPE_D);
821 }
822
823 if (src1.type == BRW_REGISTER_TYPE_F ||
824 (src1.file == BRW_IMMEDIATE_VALUE &&
825 src1.type == BRW_REGISTER_TYPE_VF)) {
826 assert(src0.type != BRW_REGISTER_TYPE_UD);
827 assert(src0.type != BRW_REGISTER_TYPE_D);
828 }
829
830 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
831 }
832
833 struct brw_instruction *brw_MUL(struct brw_compile *p,
834 struct brw_reg dest,
835 struct brw_reg src0,
836 struct brw_reg src1)
837 {
838 /* 6.32.38: mul */
839 if (src0.type == BRW_REGISTER_TYPE_D ||
840 src0.type == BRW_REGISTER_TYPE_UD ||
841 src1.type == BRW_REGISTER_TYPE_D ||
842 src1.type == BRW_REGISTER_TYPE_UD) {
843 assert(dest.type != BRW_REGISTER_TYPE_F);
844 }
845
846 if (src0.type == BRW_REGISTER_TYPE_F ||
847 (src0.file == BRW_IMMEDIATE_VALUE &&
848 src0.type == BRW_REGISTER_TYPE_VF)) {
849 assert(src1.type != BRW_REGISTER_TYPE_UD);
850 assert(src1.type != BRW_REGISTER_TYPE_D);
851 }
852
853 if (src1.type == BRW_REGISTER_TYPE_F ||
854 (src1.file == BRW_IMMEDIATE_VALUE &&
855 src1.type == BRW_REGISTER_TYPE_VF)) {
856 assert(src0.type != BRW_REGISTER_TYPE_UD);
857 assert(src0.type != BRW_REGISTER_TYPE_D);
858 }
859
860 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
861 src0.nr != BRW_ARF_ACCUMULATOR);
862 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
863 src1.nr != BRW_ARF_ACCUMULATOR);
864
865 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
866 }
867
868
869 void brw_NOP(struct brw_compile *p)
870 {
871 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
872 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
873 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
874 brw_set_src1(p, insn, brw_imm_ud(0x0));
875 }
876
877
878
879
880
881 /***********************************************************************
882 * Comparisons, if/else/endif
883 */
884
885 struct brw_instruction *brw_JMPI(struct brw_compile *p,
886 struct brw_reg dest,
887 struct brw_reg src0,
888 struct brw_reg src1)
889 {
890 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
891
892 insn->header.execution_size = 1;
893 insn->header.compression_control = BRW_COMPRESSION_NONE;
894 insn->header.mask_control = BRW_MASK_DISABLE;
895
896 p->current->header.predicate_control = BRW_PREDICATE_NONE;
897
898 return insn;
899 }
900
901 static void
902 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
903 {
904 p->if_stack[p->if_stack_depth] = inst;
905
906 p->if_stack_depth++;
907 if (p->if_stack_array_size <= p->if_stack_depth) {
908 p->if_stack_array_size *= 2;
909 p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
910 p->if_stack_array_size);
911 }
912 }
913
914 /* EU takes the value from the flag register and pushes it onto some
915 * sort of a stack (presumably merging with any flag value already on
916 * the stack). Within an if block, the flags at the top of the stack
917 * control execution on each channel of the unit, eg. on each of the
918 * 16 pixel values in our wm programs.
919 *
920 * When the matching 'else' instruction is reached (presumably by
921 * countdown of the instruction count patched in by our ELSE/ENDIF
922 * functions), the relevent flags are inverted.
923 *
924 * When the matching 'endif' instruction is reached, the flags are
925 * popped off. If the stack is now empty, normal execution resumes.
926 */
927 struct brw_instruction *
928 brw_IF(struct brw_compile *p, GLuint execute_size)
929 {
930 struct intel_context *intel = &p->brw->intel;
931 struct brw_instruction *insn;
932
933 insn = next_insn(p, BRW_OPCODE_IF);
934
935 /* Override the defaults for this instruction:
936 */
937 if (intel->gen < 6) {
938 brw_set_dest(p, insn, brw_ip_reg());
939 brw_set_src0(p, insn, brw_ip_reg());
940 brw_set_src1(p, insn, brw_imm_d(0x0));
941 } else if (intel->gen == 6) {
942 brw_set_dest(p, insn, brw_imm_w(0));
943 insn->bits1.branch_gen6.jump_count = 0;
944 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
945 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
946 } else {
947 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
948 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
949 brw_set_src1(p, insn, brw_imm_ud(0));
950 insn->bits3.break_cont.jip = 0;
951 insn->bits3.break_cont.uip = 0;
952 }
953
954 insn->header.execution_size = execute_size;
955 insn->header.compression_control = BRW_COMPRESSION_NONE;
956 insn->header.predicate_control = BRW_PREDICATE_NORMAL;
957 insn->header.mask_control = BRW_MASK_ENABLE;
958 if (!p->single_program_flow)
959 insn->header.thread_control = BRW_THREAD_SWITCH;
960
961 p->current->header.predicate_control = BRW_PREDICATE_NONE;
962
963 push_if_stack(p, insn);
964 return insn;
965 }
966
967 /* This function is only used for gen6-style IF instructions with an
968 * embedded comparison (conditional modifier). It is not used on gen7.
969 */
970 struct brw_instruction *
971 gen6_IF(struct brw_compile *p, uint32_t conditional,
972 struct brw_reg src0, struct brw_reg src1)
973 {
974 struct brw_instruction *insn;
975
976 insn = next_insn(p, BRW_OPCODE_IF);
977
978 brw_set_dest(p, insn, brw_imm_w(0));
979 if (p->compressed) {
980 insn->header.execution_size = BRW_EXECUTE_16;
981 } else {
982 insn->header.execution_size = BRW_EXECUTE_8;
983 }
984 insn->bits1.branch_gen6.jump_count = 0;
985 brw_set_src0(p, insn, src0);
986 brw_set_src1(p, insn, src1);
987
988 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
989 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
990 insn->header.destreg__conditionalmod = conditional;
991
992 if (!p->single_program_flow)
993 insn->header.thread_control = BRW_THREAD_SWITCH;
994
995 push_if_stack(p, insn);
996 return insn;
997 }
998
999 /**
1000 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1001 */
1002 static void
1003 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1004 struct brw_instruction *if_inst,
1005 struct brw_instruction *else_inst)
1006 {
1007 /* The next instruction (where the ENDIF would be, if it existed) */
1008 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1009
1010 assert(p->single_program_flow);
1011 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1012 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1013 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1014
1015 /* Convert IF to an ADD instruction that moves the instruction pointer
1016 * to the first instruction of the ELSE block. If there is no ELSE
1017 * block, point to where ENDIF would be. Reverse the predicate.
1018 *
1019 * There's no need to execute an ENDIF since we don't need to do any
1020 * stack operations, and if we're currently executing, we just want to
1021 * continue normally.
1022 */
1023 if_inst->header.opcode = BRW_OPCODE_ADD;
1024 if_inst->header.predicate_inverse = 1;
1025
1026 if (else_inst != NULL) {
1027 /* Convert ELSE to an ADD instruction that points where the ENDIF
1028 * would be.
1029 */
1030 else_inst->header.opcode = BRW_OPCODE_ADD;
1031
1032 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1033 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1034 } else {
1035 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1036 }
1037 }
1038
1039 /**
1040 * Patch IF and ELSE instructions with appropriate jump targets.
1041 */
1042 static void
1043 patch_IF_ELSE(struct brw_compile *p,
1044 struct brw_instruction *if_inst,
1045 struct brw_instruction *else_inst,
1046 struct brw_instruction *endif_inst)
1047 {
1048 struct intel_context *intel = &p->brw->intel;
1049
1050 /* We shouldn't be patching IF and ELSE instructions in single program flow
1051 * mode when gen < 6, because in single program flow mode on those
1052 * platforms, we convert flow control instructions to conditional ADDs that
1053 * operate on IP (see brw_ENDIF).
1054 *
1055 * However, on Gen6, writing to IP doesn't work in single program flow mode
1056 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1057 * not be updated by non-flow control instructions."). And on later
1058 * platforms, there is no significant benefit to converting control flow
1059 * instructions to conditional ADDs. So we do patch IF and ELSE
1060 * instructions in single program flow mode on those platforms.
1061 */
1062 if (intel->gen < 6)
1063 assert(!p->single_program_flow);
1064
1065 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1066 assert(endif_inst != NULL);
1067 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1068
1069 unsigned br = 1;
1070 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1071 * requires 2 chunks.
1072 */
1073 if (intel->gen >= 5)
1074 br = 2;
1075
1076 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1077 endif_inst->header.execution_size = if_inst->header.execution_size;
1078
1079 if (else_inst == NULL) {
1080 /* Patch IF -> ENDIF */
1081 if (intel->gen < 6) {
1082 /* Turn it into an IFF, which means no mask stack operations for
1083 * all-false and jumping past the ENDIF.
1084 */
1085 if_inst->header.opcode = BRW_OPCODE_IFF;
1086 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1087 if_inst->bits3.if_else.pop_count = 0;
1088 if_inst->bits3.if_else.pad0 = 0;
1089 } else if (intel->gen == 6) {
1090 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1091 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1092 } else {
1093 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1094 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1095 }
1096 } else {
1097 else_inst->header.execution_size = if_inst->header.execution_size;
1098
1099 /* Patch IF -> ELSE */
1100 if (intel->gen < 6) {
1101 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1102 if_inst->bits3.if_else.pop_count = 0;
1103 if_inst->bits3.if_else.pad0 = 0;
1104 } else if (intel->gen == 6) {
1105 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1106 }
1107
1108 /* Patch ELSE -> ENDIF */
1109 if (intel->gen < 6) {
1110 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1111 * matching ENDIF.
1112 */
1113 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1114 else_inst->bits3.if_else.pop_count = 1;
1115 else_inst->bits3.if_else.pad0 = 0;
1116 } else if (intel->gen == 6) {
1117 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1118 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1119 } else {
1120 /* The IF instruction's JIP should point just past the ELSE */
1121 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1122 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1123 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1124 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1125 }
1126 }
1127 }
1128
1129 void
1130 brw_ELSE(struct brw_compile *p)
1131 {
1132 struct intel_context *intel = &p->brw->intel;
1133 struct brw_instruction *insn;
1134
1135 insn = next_insn(p, BRW_OPCODE_ELSE);
1136
1137 if (intel->gen < 6) {
1138 brw_set_dest(p, insn, brw_ip_reg());
1139 brw_set_src0(p, insn, brw_ip_reg());
1140 brw_set_src1(p, insn, brw_imm_d(0x0));
1141 } else if (intel->gen == 6) {
1142 brw_set_dest(p, insn, brw_imm_w(0));
1143 insn->bits1.branch_gen6.jump_count = 0;
1144 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1145 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1146 } else {
1147 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1148 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1149 brw_set_src1(p, insn, brw_imm_ud(0));
1150 insn->bits3.break_cont.jip = 0;
1151 insn->bits3.break_cont.uip = 0;
1152 }
1153
1154 insn->header.compression_control = BRW_COMPRESSION_NONE;
1155 insn->header.mask_control = BRW_MASK_ENABLE;
1156 if (!p->single_program_flow)
1157 insn->header.thread_control = BRW_THREAD_SWITCH;
1158
1159 push_if_stack(p, insn);
1160 }
1161
1162 void
1163 brw_ENDIF(struct brw_compile *p)
1164 {
1165 struct intel_context *intel = &p->brw->intel;
1166 struct brw_instruction *insn;
1167 struct brw_instruction *else_inst = NULL;
1168 struct brw_instruction *if_inst = NULL;
1169
1170 /* Pop the IF and (optional) ELSE instructions from the stack */
1171 p->if_stack_depth--;
1172 if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
1173 else_inst = p->if_stack[p->if_stack_depth];
1174 p->if_stack_depth--;
1175 }
1176 if_inst = p->if_stack[p->if_stack_depth];
1177
1178 /* In single program flow mode, we can express IF and ELSE instructions
1179 * equivalently as ADD instructions that operate on IP. On platforms prior
1180 * to Gen6, flow control instructions cause an implied thread switch, so
1181 * this is a significant savings.
1182 *
1183 * However, on Gen6, writing to IP doesn't work in single program flow mode
1184 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1185 * not be updated by non-flow control instructions."). And on later
1186 * platforms, there is no significant benefit to converting control flow
1187 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1188 * Gen5.
1189 */
1190 if (intel->gen < 6 && p->single_program_flow) {
1191 /* ENDIF is useless; don't bother emitting it. */
1192 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1193 return;
1194 }
1195
1196 insn = next_insn(p, BRW_OPCODE_ENDIF);
1197
1198 if (intel->gen < 6) {
1199 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1200 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1201 brw_set_src1(p, insn, brw_imm_d(0x0));
1202 } else if (intel->gen == 6) {
1203 brw_set_dest(p, insn, brw_imm_w(0));
1204 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1205 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1206 } else {
1207 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1208 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1209 brw_set_src1(p, insn, brw_imm_ud(0));
1210 }
1211
1212 insn->header.compression_control = BRW_COMPRESSION_NONE;
1213 insn->header.mask_control = BRW_MASK_ENABLE;
1214 insn->header.thread_control = BRW_THREAD_SWITCH;
1215
1216 /* Also pop item off the stack in the endif instruction: */
1217 if (intel->gen < 6) {
1218 insn->bits3.if_else.jump_count = 0;
1219 insn->bits3.if_else.pop_count = 1;
1220 insn->bits3.if_else.pad0 = 0;
1221 } else if (intel->gen == 6) {
1222 insn->bits1.branch_gen6.jump_count = 2;
1223 } else {
1224 insn->bits3.break_cont.jip = 2;
1225 }
1226 patch_IF_ELSE(p, if_inst, else_inst, insn);
1227 }
1228
1229 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1230 {
1231 struct intel_context *intel = &p->brw->intel;
1232 struct brw_instruction *insn;
1233
1234 insn = next_insn(p, BRW_OPCODE_BREAK);
1235 if (intel->gen >= 6) {
1236 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1237 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1238 brw_set_src1(p, insn, brw_imm_d(0x0));
1239 } else {
1240 brw_set_dest(p, insn, brw_ip_reg());
1241 brw_set_src0(p, insn, brw_ip_reg());
1242 brw_set_src1(p, insn, brw_imm_d(0x0));
1243 insn->bits3.if_else.pad0 = 0;
1244 insn->bits3.if_else.pop_count = pop_count;
1245 }
1246 insn->header.compression_control = BRW_COMPRESSION_NONE;
1247 insn->header.execution_size = BRW_EXECUTE_8;
1248
1249 return insn;
1250 }
1251
1252 struct brw_instruction *gen6_CONT(struct brw_compile *p,
1253 struct brw_instruction *do_insn)
1254 {
1255 struct brw_instruction *insn;
1256
1257 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1258 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1259 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1260 brw_set_dest(p, insn, brw_ip_reg());
1261 brw_set_src0(p, insn, brw_ip_reg());
1262 brw_set_src1(p, insn, brw_imm_d(0x0));
1263
1264 insn->header.compression_control = BRW_COMPRESSION_NONE;
1265 insn->header.execution_size = BRW_EXECUTE_8;
1266 return insn;
1267 }
1268
1269 struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1270 {
1271 struct brw_instruction *insn;
1272 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1273 brw_set_dest(p, insn, brw_ip_reg());
1274 brw_set_src0(p, insn, brw_ip_reg());
1275 brw_set_src1(p, insn, brw_imm_d(0x0));
1276 insn->header.compression_control = BRW_COMPRESSION_NONE;
1277 insn->header.execution_size = BRW_EXECUTE_8;
1278 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1279 insn->bits3.if_else.pad0 = 0;
1280 insn->bits3.if_else.pop_count = pop_count;
1281 return insn;
1282 }
1283
1284 /* DO/WHILE loop:
1285 *
1286 * The DO/WHILE is just an unterminated loop -- break or continue are
1287 * used for control within the loop. We have a few ways they can be
1288 * done.
1289 *
1290 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1291 * jip and no DO instruction.
1292 *
1293 * For non-uniform control flow pre-gen6, there's a DO instruction to
1294 * push the mask, and a WHILE to jump back, and BREAK to get out and
1295 * pop the mask.
1296 *
1297 * For gen6, there's no more mask stack, so no need for DO. WHILE
1298 * just points back to the first instruction of the loop.
1299 */
1300 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1301 {
1302 struct intel_context *intel = &p->brw->intel;
1303
1304 if (intel->gen >= 6 || p->single_program_flow) {
1305 return &p->store[p->nr_insn];
1306 } else {
1307 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1308
1309 /* Override the defaults for this instruction:
1310 */
1311 brw_set_dest(p, insn, brw_null_reg());
1312 brw_set_src0(p, insn, brw_null_reg());
1313 brw_set_src1(p, insn, brw_null_reg());
1314
1315 insn->header.compression_control = BRW_COMPRESSION_NONE;
1316 insn->header.execution_size = execute_size;
1317 insn->header.predicate_control = BRW_PREDICATE_NONE;
1318 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1319 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1320
1321 return insn;
1322 }
1323 }
1324
1325
1326
1327 struct brw_instruction *brw_WHILE(struct brw_compile *p,
1328 struct brw_instruction *do_insn)
1329 {
1330 struct intel_context *intel = &p->brw->intel;
1331 struct brw_instruction *insn;
1332 GLuint br = 1;
1333
1334 if (intel->gen >= 5)
1335 br = 2;
1336
1337 if (intel->gen >= 7) {
1338 insn = next_insn(p, BRW_OPCODE_WHILE);
1339
1340 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1341 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1342 brw_set_src1(p, insn, brw_imm_ud(0));
1343 insn->bits3.break_cont.jip = br * (do_insn - insn);
1344
1345 insn->header.execution_size = BRW_EXECUTE_8;
1346 } else if (intel->gen == 6) {
1347 insn = next_insn(p, BRW_OPCODE_WHILE);
1348
1349 brw_set_dest(p, insn, brw_imm_w(0));
1350 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1351 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1352 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1353
1354 insn->header.execution_size = BRW_EXECUTE_8;
1355 } else {
1356 if (p->single_program_flow) {
1357 insn = next_insn(p, BRW_OPCODE_ADD);
1358
1359 brw_set_dest(p, insn, brw_ip_reg());
1360 brw_set_src0(p, insn, brw_ip_reg());
1361 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1362 insn->header.execution_size = BRW_EXECUTE_1;
1363 } else {
1364 insn = next_insn(p, BRW_OPCODE_WHILE);
1365
1366 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1367
1368 brw_set_dest(p, insn, brw_ip_reg());
1369 brw_set_src0(p, insn, brw_ip_reg());
1370 brw_set_src1(p, insn, brw_imm_d(0));
1371
1372 insn->header.execution_size = do_insn->header.execution_size;
1373 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1374 insn->bits3.if_else.pop_count = 0;
1375 insn->bits3.if_else.pad0 = 0;
1376 }
1377 }
1378 insn->header.compression_control = BRW_COMPRESSION_NONE;
1379 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1380
1381 return insn;
1382 }
1383
1384
1385 /* FORWARD JUMPS:
1386 */
1387 void brw_land_fwd_jump(struct brw_compile *p,
1388 struct brw_instruction *jmp_insn)
1389 {
1390 struct intel_context *intel = &p->brw->intel;
1391 struct brw_instruction *landing = &p->store[p->nr_insn];
1392 GLuint jmpi = 1;
1393
1394 if (intel->gen >= 5)
1395 jmpi = 2;
1396
1397 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1398 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1399
1400 jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1401 }
1402
1403
1404
1405 /* To integrate with the above, it makes sense that the comparison
1406 * instruction should populate the flag register. It might be simpler
1407 * just to use the flag reg for most WM tasks?
1408 */
1409 void brw_CMP(struct brw_compile *p,
1410 struct brw_reg dest,
1411 GLuint conditional,
1412 struct brw_reg src0,
1413 struct brw_reg src1)
1414 {
1415 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1416
1417 insn->header.destreg__conditionalmod = conditional;
1418 brw_set_dest(p, insn, dest);
1419 brw_set_src0(p, insn, src0);
1420 brw_set_src1(p, insn, src1);
1421
1422 /* guess_execution_size(insn, src0); */
1423
1424
1425 /* Make it so that future instructions will use the computed flag
1426 * value until brw_set_predicate_control_flag_value() is called
1427 * again.
1428 */
1429 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1430 dest.nr == 0) {
1431 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1432 p->flag_value = 0xff;
1433 }
1434 }
1435
1436 /* Issue 'wait' instruction for n1, host could program MMIO
1437 to wake up thread. */
1438 void brw_WAIT (struct brw_compile *p)
1439 {
1440 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1441 struct brw_reg src = brw_notification_1_reg();
1442
1443 brw_set_dest(p, insn, src);
1444 brw_set_src0(p, insn, src);
1445 brw_set_src1(p, insn, brw_null_reg());
1446 insn->header.execution_size = 0; /* must */
1447 insn->header.predicate_control = 0;
1448 insn->header.compression_control = 0;
1449 }
1450
1451
1452 /***********************************************************************
1453 * Helpers for the various SEND message types:
1454 */
1455
1456 /** Extended math function, float[8].
1457 */
1458 void brw_math( struct brw_compile *p,
1459 struct brw_reg dest,
1460 GLuint function,
1461 GLuint saturate,
1462 GLuint msg_reg_nr,
1463 struct brw_reg src,
1464 GLuint data_type,
1465 GLuint precision )
1466 {
1467 struct intel_context *intel = &p->brw->intel;
1468
1469 if (intel->gen >= 6) {
1470 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1471
1472 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1473 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1474
1475 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1476 if (intel->gen == 6)
1477 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1478
1479 /* Source modifiers are ignored for extended math instructions on Gen6. */
1480 if (intel->gen == 6) {
1481 assert(!src.negate);
1482 assert(!src.abs);
1483 }
1484
1485 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1486 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1487 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1488 assert(src.type != BRW_REGISTER_TYPE_F);
1489 } else {
1490 assert(src.type == BRW_REGISTER_TYPE_F);
1491 }
1492
1493 /* Math is the same ISA format as other opcodes, except that CondModifier
1494 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1495 */
1496 insn->header.destreg__conditionalmod = function;
1497 insn->header.saturate = saturate;
1498
1499 brw_set_dest(p, insn, dest);
1500 brw_set_src0(p, insn, src);
1501 brw_set_src1(p, insn, brw_null_reg());
1502 } else {
1503 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1504
1505 /* Example code doesn't set predicate_control for send
1506 * instructions.
1507 */
1508 insn->header.predicate_control = 0;
1509 insn->header.destreg__conditionalmod = msg_reg_nr;
1510
1511 brw_set_dest(p, insn, dest);
1512 brw_set_src0(p, insn, src);
1513 brw_set_math_message(p,
1514 insn,
1515 function,
1516 src.type == BRW_REGISTER_TYPE_D,
1517 precision,
1518 saturate,
1519 data_type);
1520 }
1521 }
1522
1523 /** Extended math function, float[8].
1524 */
1525 void brw_math2(struct brw_compile *p,
1526 struct brw_reg dest,
1527 GLuint function,
1528 struct brw_reg src0,
1529 struct brw_reg src1)
1530 {
1531 struct intel_context *intel = &p->brw->intel;
1532 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1533
1534 assert(intel->gen >= 6);
1535 (void) intel;
1536
1537
1538 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1539 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1540 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1541
1542 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1543 if (intel->gen == 6) {
1544 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1545 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1546 }
1547
1548 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1549 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1550 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1551 assert(src0.type != BRW_REGISTER_TYPE_F);
1552 assert(src1.type != BRW_REGISTER_TYPE_F);
1553 } else {
1554 assert(src0.type == BRW_REGISTER_TYPE_F);
1555 assert(src1.type == BRW_REGISTER_TYPE_F);
1556 }
1557
1558 /* Source modifiers are ignored for extended math instructions on Gen6. */
1559 if (intel->gen == 6) {
1560 assert(!src0.negate);
1561 assert(!src0.abs);
1562 assert(!src1.negate);
1563 assert(!src1.abs);
1564 }
1565
1566 /* Math is the same ISA format as other opcodes, except that CondModifier
1567 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1568 */
1569 insn->header.destreg__conditionalmod = function;
1570
1571 brw_set_dest(p, insn, dest);
1572 brw_set_src0(p, insn, src0);
1573 brw_set_src1(p, insn, src1);
1574 }
1575
1576 /**
1577 * Extended math function, float[16].
1578 * Use 2 send instructions.
1579 */
1580 void brw_math_16( struct brw_compile *p,
1581 struct brw_reg dest,
1582 GLuint function,
1583 GLuint saturate,
1584 GLuint msg_reg_nr,
1585 struct brw_reg src,
1586 GLuint precision )
1587 {
1588 struct intel_context *intel = &p->brw->intel;
1589 struct brw_instruction *insn;
1590
1591 if (intel->gen >= 6) {
1592 insn = next_insn(p, BRW_OPCODE_MATH);
1593
1594 /* Math is the same ISA format as other opcodes, except that CondModifier
1595 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1596 */
1597 insn->header.destreg__conditionalmod = function;
1598 insn->header.saturate = saturate;
1599
1600 /* Source modifiers are ignored for extended math instructions. */
1601 assert(!src.negate);
1602 assert(!src.abs);
1603
1604 brw_set_dest(p, insn, dest);
1605 brw_set_src0(p, insn, src);
1606 brw_set_src1(p, insn, brw_null_reg());
1607 return;
1608 }
1609
1610 /* First instruction:
1611 */
1612 brw_push_insn_state(p);
1613 brw_set_predicate_control_flag_value(p, 0xff);
1614 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1615
1616 insn = next_insn(p, BRW_OPCODE_SEND);
1617 insn->header.destreg__conditionalmod = msg_reg_nr;
1618
1619 brw_set_dest(p, insn, dest);
1620 brw_set_src0(p, insn, src);
1621 brw_set_math_message(p,
1622 insn,
1623 function,
1624 BRW_MATH_INTEGER_UNSIGNED,
1625 precision,
1626 saturate,
1627 BRW_MATH_DATA_VECTOR);
1628
1629 /* Second instruction:
1630 */
1631 insn = next_insn(p, BRW_OPCODE_SEND);
1632 insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1633 insn->header.destreg__conditionalmod = msg_reg_nr+1;
1634
1635 brw_set_dest(p, insn, offset(dest,1));
1636 brw_set_src0(p, insn, src);
1637 brw_set_math_message(p,
1638 insn,
1639 function,
1640 BRW_MATH_INTEGER_UNSIGNED,
1641 precision,
1642 saturate,
1643 BRW_MATH_DATA_VECTOR);
1644
1645 brw_pop_insn_state(p);
1646 }
1647
1648
1649 /**
1650 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1651 * using a constant offset per channel.
1652 *
1653 * The offset must be aligned to oword size (16 bytes). Used for
1654 * register spilling.
1655 */
1656 void brw_oword_block_write_scratch(struct brw_compile *p,
1657 struct brw_reg mrf,
1658 int num_regs,
1659 GLuint offset)
1660 {
1661 struct intel_context *intel = &p->brw->intel;
1662 uint32_t msg_control, msg_type;
1663 int mlen;
1664
1665 if (intel->gen >= 6)
1666 offset /= 16;
1667
1668 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1669
1670 if (num_regs == 1) {
1671 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1672 mlen = 2;
1673 } else {
1674 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1675 mlen = 3;
1676 }
1677
1678 /* Set up the message header. This is g0, with g0.2 filled with
1679 * the offset. We don't want to leave our offset around in g0 or
1680 * it'll screw up texture samples, so set it up inside the message
1681 * reg.
1682 */
1683 {
1684 brw_push_insn_state(p);
1685 brw_set_mask_control(p, BRW_MASK_DISABLE);
1686 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1687
1688 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1689
1690 /* set message header global offset field (reg 0, element 2) */
1691 brw_MOV(p,
1692 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1693 mrf.nr,
1694 2), BRW_REGISTER_TYPE_UD),
1695 brw_imm_ud(offset));
1696
1697 brw_pop_insn_state(p);
1698 }
1699
1700 {
1701 struct brw_reg dest;
1702 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1703 int send_commit_msg;
1704 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1705 BRW_REGISTER_TYPE_UW);
1706
1707 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1708 insn->header.compression_control = BRW_COMPRESSION_NONE;
1709 src_header = vec16(src_header);
1710 }
1711 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1712 insn->header.destreg__conditionalmod = mrf.nr;
1713
1714 /* Until gen6, writes followed by reads from the same location
1715 * are not guaranteed to be ordered unless write_commit is set.
1716 * If set, then a no-op write is issued to the destination
1717 * register to set a dependency, and a read from the destination
1718 * can be used to ensure the ordering.
1719 *
1720 * For gen6, only writes between different threads need ordering
1721 * protection. Our use of DP writes is all about register
1722 * spilling within a thread.
1723 */
1724 if (intel->gen >= 6) {
1725 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1726 send_commit_msg = 0;
1727 } else {
1728 dest = src_header;
1729 send_commit_msg = 1;
1730 }
1731
1732 brw_set_dest(p, insn, dest);
1733 if (intel->gen >= 6) {
1734 brw_set_src0(p, insn, mrf);
1735 } else {
1736 brw_set_src0(p, insn, brw_null_reg());
1737 }
1738
1739 if (intel->gen >= 6)
1740 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1741 else
1742 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1743
1744 brw_set_dp_write_message(p,
1745 insn,
1746 255, /* binding table index (255=stateless) */
1747 msg_control,
1748 msg_type,
1749 mlen,
1750 true, /* header_present */
1751 0, /* not a render target */
1752 send_commit_msg, /* response_length */
1753 0, /* eot */
1754 send_commit_msg);
1755 }
1756 }
1757
1758
1759 /**
1760 * Read a block of owords (half a GRF each) from the scratch buffer
1761 * using a constant index per channel.
1762 *
1763 * Offset must be aligned to oword size (16 bytes). Used for register
1764 * spilling.
1765 */
1766 void
1767 brw_oword_block_read_scratch(struct brw_compile *p,
1768 struct brw_reg dest,
1769 struct brw_reg mrf,
1770 int num_regs,
1771 GLuint offset)
1772 {
1773 struct intel_context *intel = &p->brw->intel;
1774 uint32_t msg_control;
1775 int rlen;
1776
1777 if (intel->gen >= 6)
1778 offset /= 16;
1779
1780 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1781 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1782
1783 if (num_regs == 1) {
1784 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1785 rlen = 1;
1786 } else {
1787 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1788 rlen = 2;
1789 }
1790
1791 {
1792 brw_push_insn_state(p);
1793 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1794 brw_set_mask_control(p, BRW_MASK_DISABLE);
1795
1796 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1797
1798 /* set message header global offset field (reg 0, element 2) */
1799 brw_MOV(p,
1800 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1801 mrf.nr,
1802 2), BRW_REGISTER_TYPE_UD),
1803 brw_imm_ud(offset));
1804
1805 brw_pop_insn_state(p);
1806 }
1807
1808 {
1809 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1810
1811 assert(insn->header.predicate_control == 0);
1812 insn->header.compression_control = BRW_COMPRESSION_NONE;
1813 insn->header.destreg__conditionalmod = mrf.nr;
1814
1815 brw_set_dest(p, insn, dest); /* UW? */
1816 if (intel->gen >= 6) {
1817 brw_set_src0(p, insn, mrf);
1818 } else {
1819 brw_set_src0(p, insn, brw_null_reg());
1820 }
1821
1822 brw_set_dp_read_message(p,
1823 insn,
1824 255, /* binding table index (255=stateless) */
1825 msg_control,
1826 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1827 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1828 1, /* msg_length */
1829 rlen);
1830 }
1831 }
1832
1833 /**
1834 * Read a float[4] vector from the data port Data Cache (const buffer).
1835 * Location (in buffer) should be a multiple of 16.
1836 * Used for fetching shader constants.
1837 */
1838 void brw_oword_block_read(struct brw_compile *p,
1839 struct brw_reg dest,
1840 struct brw_reg mrf,
1841 uint32_t offset,
1842 uint32_t bind_table_index)
1843 {
1844 struct intel_context *intel = &p->brw->intel;
1845
1846 /* On newer hardware, offset is in units of owords. */
1847 if (intel->gen >= 6)
1848 offset /= 16;
1849
1850 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1851
1852 brw_push_insn_state(p);
1853 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1854 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1855 brw_set_mask_control(p, BRW_MASK_DISABLE);
1856
1857 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1858
1859 /* set message header global offset field (reg 0, element 2) */
1860 brw_MOV(p,
1861 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1862 mrf.nr,
1863 2), BRW_REGISTER_TYPE_UD),
1864 brw_imm_ud(offset));
1865
1866 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1867 insn->header.destreg__conditionalmod = mrf.nr;
1868
1869 /* cast dest to a uword[8] vector */
1870 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1871
1872 brw_set_dest(p, insn, dest);
1873 if (intel->gen >= 6) {
1874 brw_set_src0(p, insn, mrf);
1875 } else {
1876 brw_set_src0(p, insn, brw_null_reg());
1877 }
1878
1879 brw_set_dp_read_message(p,
1880 insn,
1881 bind_table_index,
1882 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1883 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1884 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1885 1, /* msg_length */
1886 1); /* response_length (1 reg, 2 owords!) */
1887
1888 brw_pop_insn_state(p);
1889 }
1890
1891 /**
1892 * Read a set of dwords from the data port Data Cache (const buffer).
1893 *
1894 * Location (in buffer) appears as UD offsets in the register after
1895 * the provided mrf header reg.
1896 */
1897 void brw_dword_scattered_read(struct brw_compile *p,
1898 struct brw_reg dest,
1899 struct brw_reg mrf,
1900 uint32_t bind_table_index)
1901 {
1902 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1903
1904 brw_push_insn_state(p);
1905 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1906 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1907 brw_set_mask_control(p, BRW_MASK_DISABLE);
1908 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1909 brw_pop_insn_state(p);
1910
1911 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1912 insn->header.destreg__conditionalmod = mrf.nr;
1913
1914 /* cast dest to a uword[8] vector */
1915 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1916
1917 brw_set_dest(p, insn, dest);
1918 brw_set_src0(p, insn, brw_null_reg());
1919
1920 brw_set_dp_read_message(p,
1921 insn,
1922 bind_table_index,
1923 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1924 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1925 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1926 2, /* msg_length */
1927 1); /* response_length */
1928 }
1929
1930
1931
1932 /**
1933 * Read float[4] constant(s) from VS constant buffer.
1934 * For relative addressing, two float[4] constants will be read into 'dest'.
1935 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1936 */
1937 void brw_dp_READ_4_vs(struct brw_compile *p,
1938 struct brw_reg dest,
1939 GLuint location,
1940 GLuint bind_table_index)
1941 {
1942 struct intel_context *intel = &p->brw->intel;
1943 struct brw_instruction *insn;
1944 GLuint msg_reg_nr = 1;
1945
1946 if (intel->gen >= 6)
1947 location /= 16;
1948
1949 /* Setup MRF[1] with location/offset into const buffer */
1950 brw_push_insn_state(p);
1951 brw_set_access_mode(p, BRW_ALIGN_1);
1952 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1953 brw_set_mask_control(p, BRW_MASK_DISABLE);
1954 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1955 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1956 BRW_REGISTER_TYPE_UD),
1957 brw_imm_ud(location));
1958 brw_pop_insn_state(p);
1959
1960 insn = next_insn(p, BRW_OPCODE_SEND);
1961
1962 insn->header.predicate_control = BRW_PREDICATE_NONE;
1963 insn->header.compression_control = BRW_COMPRESSION_NONE;
1964 insn->header.destreg__conditionalmod = msg_reg_nr;
1965 insn->header.mask_control = BRW_MASK_DISABLE;
1966
1967 brw_set_dest(p, insn, dest);
1968 if (intel->gen >= 6) {
1969 brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1970 } else {
1971 brw_set_src0(p, insn, brw_null_reg());
1972 }
1973
1974 brw_set_dp_read_message(p,
1975 insn,
1976 bind_table_index,
1977 0,
1978 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1979 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1980 1, /* msg_length */
1981 1); /* response_length (1 Oword) */
1982 }
1983
1984 /**
1985 * Read a float[4] constant per vertex from VS constant buffer, with
1986 * relative addressing.
1987 */
1988 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1989 struct brw_reg dest,
1990 struct brw_reg addr_reg,
1991 GLuint offset,
1992 GLuint bind_table_index)
1993 {
1994 struct intel_context *intel = &p->brw->intel;
1995 struct brw_reg src = brw_vec8_grf(0, 0);
1996 int msg_type;
1997
1998 /* Setup MRF[1] with offset into const buffer */
1999 brw_push_insn_state(p);
2000 brw_set_access_mode(p, BRW_ALIGN_1);
2001 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2002 brw_set_mask_control(p, BRW_MASK_DISABLE);
2003 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2004
2005 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2006 * fields ignored.
2007 */
2008 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2009 addr_reg, brw_imm_d(offset));
2010 brw_pop_insn_state(p);
2011
2012 gen6_resolve_implied_move(p, &src, 0);
2013 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2014
2015 insn->header.predicate_control = BRW_PREDICATE_NONE;
2016 insn->header.compression_control = BRW_COMPRESSION_NONE;
2017 insn->header.destreg__conditionalmod = 0;
2018 insn->header.mask_control = BRW_MASK_DISABLE;
2019
2020 brw_set_dest(p, insn, dest);
2021 brw_set_src0(p, insn, src);
2022
2023 if (intel->gen >= 6)
2024 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2025 else if (intel->gen == 5 || intel->is_g4x)
2026 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2027 else
2028 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2029
2030 brw_set_dp_read_message(p,
2031 insn,
2032 bind_table_index,
2033 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2034 msg_type,
2035 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2036 2, /* msg_length */
2037 1); /* response_length */
2038 }
2039
2040
2041
2042 void brw_fb_WRITE(struct brw_compile *p,
2043 int dispatch_width,
2044 GLuint msg_reg_nr,
2045 struct brw_reg src0,
2046 GLuint binding_table_index,
2047 GLuint msg_length,
2048 GLuint response_length,
2049 bool eot,
2050 bool header_present)
2051 {
2052 struct intel_context *intel = &p->brw->intel;
2053 struct brw_instruction *insn;
2054 GLuint msg_control, msg_type;
2055 struct brw_reg dest;
2056
2057 if (dispatch_width == 16)
2058 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2059 else
2060 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2061
2062 if (intel->gen >= 6 && binding_table_index == 0) {
2063 insn = next_insn(p, BRW_OPCODE_SENDC);
2064 } else {
2065 insn = next_insn(p, BRW_OPCODE_SEND);
2066 }
2067 /* The execution mask is ignored for render target writes. */
2068 insn->header.predicate_control = 0;
2069 insn->header.compression_control = BRW_COMPRESSION_NONE;
2070
2071 if (intel->gen >= 6) {
2072 /* headerless version, just submit color payload */
2073 src0 = brw_message_reg(msg_reg_nr);
2074
2075 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2076 } else {
2077 insn->header.destreg__conditionalmod = msg_reg_nr;
2078
2079 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2080 }
2081
2082 if (dispatch_width == 16)
2083 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2084 else
2085 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2086
2087 brw_set_dest(p, insn, dest);
2088 brw_set_src0(p, insn, src0);
2089 brw_set_dp_write_message(p,
2090 insn,
2091 binding_table_index,
2092 msg_control,
2093 msg_type,
2094 msg_length,
2095 header_present,
2096 1, /* last render target write */
2097 response_length,
2098 eot,
2099 0 /* send_commit_msg */);
2100 }
2101
2102
2103 /**
2104 * Texture sample instruction.
2105 * Note: the msg_type plus msg_length values determine exactly what kind
2106 * of sampling operation is performed. See volume 4, page 161 of docs.
2107 */
2108 void brw_SAMPLE(struct brw_compile *p,
2109 struct brw_reg dest,
2110 GLuint msg_reg_nr,
2111 struct brw_reg src0,
2112 GLuint binding_table_index,
2113 GLuint sampler,
2114 GLuint writemask,
2115 GLuint msg_type,
2116 GLuint response_length,
2117 GLuint msg_length,
2118 GLuint header_present,
2119 GLuint simd_mode,
2120 GLuint return_format)
2121 {
2122 struct intel_context *intel = &p->brw->intel;
2123 bool need_stall = 0;
2124
2125 if (writemask == 0) {
2126 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2127 return;
2128 }
2129
2130 /* Hardware doesn't do destination dependency checking on send
2131 * instructions properly. Add a workaround which generates the
2132 * dependency by other means. In practice it seems like this bug
2133 * only crops up for texture samples, and only where registers are
2134 * written by the send and then written again later without being
2135 * read in between. Luckily for us, we already track that
2136 * information and use it to modify the writemask for the
2137 * instruction, so that is a guide for whether a workaround is
2138 * needed.
2139 */
2140 if (writemask != WRITEMASK_XYZW) {
2141 GLuint dst_offset = 0;
2142 GLuint i, newmask = 0, len = 0;
2143
2144 for (i = 0; i < 4; i++) {
2145 if (writemask & (1<<i))
2146 break;
2147 dst_offset += 2;
2148 }
2149 for (; i < 4; i++) {
2150 if (!(writemask & (1<<i)))
2151 break;
2152 newmask |= 1<<i;
2153 len++;
2154 }
2155
2156 if (newmask != writemask) {
2157 need_stall = 1;
2158 /* printf("need stall %x %x\n", newmask , writemask); */
2159 }
2160 else {
2161 bool dispatch_16 = false;
2162
2163 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2164
2165 guess_execution_size(p, p->current, dest);
2166 if (p->current->header.execution_size == BRW_EXECUTE_16)
2167 dispatch_16 = true;
2168
2169 newmask = ~newmask & WRITEMASK_XYZW;
2170
2171 brw_push_insn_state(p);
2172
2173 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2174 brw_set_mask_control(p, BRW_MASK_DISABLE);
2175
2176 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2177 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2178 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2179
2180 brw_pop_insn_state(p);
2181
2182 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2183 dest = offset(dest, dst_offset);
2184
2185 /* For 16-wide dispatch, masked channels are skipped in the
2186 * response. For 8-wide, masked channels still take up slots,
2187 * and are just not written to.
2188 */
2189 if (dispatch_16)
2190 response_length = len * 2;
2191 }
2192 }
2193
2194 {
2195 struct brw_instruction *insn;
2196
2197 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2198
2199 insn = next_insn(p, BRW_OPCODE_SEND);
2200 insn->header.predicate_control = 0; /* XXX */
2201 insn->header.compression_control = BRW_COMPRESSION_NONE;
2202 if (intel->gen < 6)
2203 insn->header.destreg__conditionalmod = msg_reg_nr;
2204
2205 brw_set_dest(p, insn, dest);
2206 brw_set_src0(p, insn, src0);
2207 brw_set_sampler_message(p, insn,
2208 binding_table_index,
2209 sampler,
2210 msg_type,
2211 response_length,
2212 msg_length,
2213 header_present,
2214 simd_mode,
2215 return_format);
2216 }
2217
2218 if (need_stall) {
2219 struct brw_reg reg = vec8(offset(dest, response_length-1));
2220
2221 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2222 */
2223 brw_push_insn_state(p);
2224 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2225 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2226 retype(reg, BRW_REGISTER_TYPE_UD));
2227 brw_pop_insn_state(p);
2228 }
2229
2230 }
2231
2232 /* All these variables are pretty confusing - we might be better off
2233 * using bitmasks and macros for this, in the old style. Or perhaps
2234 * just having the caller instantiate the fields in dword3 itself.
2235 */
2236 void brw_urb_WRITE(struct brw_compile *p,
2237 struct brw_reg dest,
2238 GLuint msg_reg_nr,
2239 struct brw_reg src0,
2240 bool allocate,
2241 bool used,
2242 GLuint msg_length,
2243 GLuint response_length,
2244 bool eot,
2245 bool writes_complete,
2246 GLuint offset,
2247 GLuint swizzle)
2248 {
2249 struct intel_context *intel = &p->brw->intel;
2250 struct brw_instruction *insn;
2251
2252 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2253
2254 if (intel->gen == 7) {
2255 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2256 brw_push_insn_state(p);
2257 brw_set_access_mode(p, BRW_ALIGN_1);
2258 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2259 BRW_REGISTER_TYPE_UD),
2260 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2261 brw_imm_ud(0xff00));
2262 brw_pop_insn_state(p);
2263 }
2264
2265 insn = next_insn(p, BRW_OPCODE_SEND);
2266
2267 assert(msg_length < BRW_MAX_MRF);
2268
2269 brw_set_dest(p, insn, dest);
2270 brw_set_src0(p, insn, src0);
2271 brw_set_src1(p, insn, brw_imm_d(0));
2272
2273 if (intel->gen < 6)
2274 insn->header.destreg__conditionalmod = msg_reg_nr;
2275
2276 brw_set_urb_message(p,
2277 insn,
2278 allocate,
2279 used,
2280 msg_length,
2281 response_length,
2282 eot,
2283 writes_complete,
2284 offset,
2285 swizzle);
2286 }
2287
2288 static int
2289 brw_find_next_block_end(struct brw_compile *p, int start)
2290 {
2291 int ip;
2292
2293 for (ip = start + 1; ip < p->nr_insn; ip++) {
2294 struct brw_instruction *insn = &p->store[ip];
2295
2296 switch (insn->header.opcode) {
2297 case BRW_OPCODE_ENDIF:
2298 case BRW_OPCODE_ELSE:
2299 case BRW_OPCODE_WHILE:
2300 return ip;
2301 }
2302 }
2303 assert(!"not reached");
2304 return start + 1;
2305 }
2306
2307 /* There is no DO instruction on gen6, so to find the end of the loop
2308 * we have to see if the loop is jumping back before our start
2309 * instruction.
2310 */
2311 static int
2312 brw_find_loop_end(struct brw_compile *p, int start)
2313 {
2314 struct intel_context *intel = &p->brw->intel;
2315 int ip;
2316 int br = 2;
2317
2318 for (ip = start + 1; ip < p->nr_insn; ip++) {
2319 struct brw_instruction *insn = &p->store[ip];
2320
2321 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2322 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2323 : insn->bits3.break_cont.jip;
2324 if (ip + jip / br <= start)
2325 return ip;
2326 }
2327 }
2328 assert(!"not reached");
2329 return start + 1;
2330 }
2331
2332 /* After program generation, go back and update the UIP and JIP of
2333 * BREAK and CONT instructions to their correct locations.
2334 */
2335 void
2336 brw_set_uip_jip(struct brw_compile *p)
2337 {
2338 struct intel_context *intel = &p->brw->intel;
2339 int ip;
2340 int br = 2;
2341
2342 if (intel->gen < 6)
2343 return;
2344
2345 for (ip = 0; ip < p->nr_insn; ip++) {
2346 struct brw_instruction *insn = &p->store[ip];
2347
2348 switch (insn->header.opcode) {
2349 case BRW_OPCODE_BREAK:
2350 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2351 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2352 insn->bits3.break_cont.uip =
2353 br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2354 break;
2355 case BRW_OPCODE_CONTINUE:
2356 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2357 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2358
2359 assert(insn->bits3.break_cont.uip != 0);
2360 assert(insn->bits3.break_cont.jip != 0);
2361 break;
2362 }
2363 }
2364 }
2365
2366 void brw_ff_sync(struct brw_compile *p,
2367 struct brw_reg dest,
2368 GLuint msg_reg_nr,
2369 struct brw_reg src0,
2370 bool allocate,
2371 GLuint response_length,
2372 bool eot)
2373 {
2374 struct intel_context *intel = &p->brw->intel;
2375 struct brw_instruction *insn;
2376
2377 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2378
2379 insn = next_insn(p, BRW_OPCODE_SEND);
2380 brw_set_dest(p, insn, dest);
2381 brw_set_src0(p, insn, src0);
2382 brw_set_src1(p, insn, brw_imm_d(0));
2383
2384 if (intel->gen < 6)
2385 insn->header.destreg__conditionalmod = msg_reg_nr;
2386
2387 brw_set_ff_sync_message(p,
2388 insn,
2389 allocate,
2390 response_length,
2391 eot);
2392 }
2393
2394 /**
2395 * Emit the SEND instruction necessary to generate stream output data on Gen6
2396 * (for transform feedback).
2397 *
2398 * If send_commit_msg is true, this is the last piece of stream output data
2399 * from this thread, so send the data as a committed write. According to the
2400 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2401 *
2402 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2403 * writes are complete by sending the final write as a committed write."
2404 */
2405 void
2406 brw_svb_write(struct brw_compile *p,
2407 struct brw_reg dest,
2408 GLuint msg_reg_nr,
2409 struct brw_reg src0,
2410 GLuint binding_table_index,
2411 bool send_commit_msg)
2412 {
2413 struct brw_instruction *insn;
2414
2415 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2416
2417 insn = next_insn(p, BRW_OPCODE_SEND);
2418 brw_set_dest(p, insn, dest);
2419 brw_set_src0(p, insn, src0);
2420 brw_set_src1(p, insn, brw_imm_d(0));
2421 brw_set_dp_write_message(p, insn,
2422 binding_table_index,
2423 0, /* msg_control: ignored */
2424 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2425 1, /* msg_length */
2426 true, /* header_present */
2427 0, /* last_render_target: ignored */
2428 send_commit_msg, /* response_length */
2429 0, /* end_of_thread */
2430 send_commit_msg); /* send_commit_msg */
2431 }