i965/fs: Add support for translating ir_triop_fma into MAD.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 *
 * \param src         source operand of the upcoming SEND; rewritten in
 *                    place to name the message register.
 * \param msg_reg_nr  message register to stage the payload in.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct brw_context *brw = p->brw;
   /* Pre-gen6 hardware performs the move implicitly; nothing to do. */
   if (brw->gen < 6)
      return;

   /* Already a message register: no staging copy needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit the copy with masking and compression disabled so the whole
       * payload register is written regardless of the current state.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the staged message register. */
   *src = brw_message_reg(msg_reg_nr);
}
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102
/**
 * Encode the destination operand of \p insn from \p dest.
 *
 * Handles direct and register-indirect addressing, and both align1 and
 * align16 access modes (whose bit layouts differ).  Also derives the
 * instruction's execution size from the destination register width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Stride 0 is not a valid destination stride; promote it to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subregister numbers are in 16-byte units. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 promotion as the direct align1 case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
159
160 extern int reg_type_size[];
161
162 static void
163 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164 {
165 int hstride_for_reg[] = {0, 1, 2, 4};
166 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167 int width_for_reg[] = {1, 2, 4, 8, 16};
168 int execsize_for_reg[] = {1, 2, 4, 8, 16};
169 int width, hstride, vstride, execsize;
170
171 if (reg.file == BRW_IMMEDIATE_VALUE) {
172 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
173 * mean the destination has to be 128-bit aligned and the
174 * destination horiz stride has to be a word.
175 */
176 if (reg.type == BRW_REGISTER_TYPE_V) {
177 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179 }
180
181 return;
182 }
183
184 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185 reg.file == BRW_ARF_NULL)
186 return;
187
188 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189 hstride = hstride_for_reg[reg.hstride];
190
191 if (reg.vstride == 0xf) {
192 vstride = -1;
193 } else {
194 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195 vstride = vstride_for_reg[reg.vstride];
196 }
197
198 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199 width = width_for_reg[reg.width];
200
201 assert(insn->header.execution_size >= 0 &&
202 insn->header.execution_size < Elements(execsize_for_reg));
203 execsize = execsize_for_reg[insn->header.execution_size];
204
205 /* Restrictions from 3.3.10: Register Region Restrictions. */
206 /* 3. */
207 assert(execsize >= width);
208
209 /* 4. */
210 if (execsize == width && hstride != 0) {
211 assert(vstride == -1 || vstride == width * hstride);
212 }
213
214 /* 5. */
215 if (execsize == width && hstride == 0) {
216 /* no restriction on vstride. */
217 }
218
219 /* 6. */
220 if (width == 1) {
221 assert(hstride == 0);
222 }
223
224 /* 7. */
225 if (execsize == 1 && width == 1) {
226 assert(hstride == 0);
227 assert(vstride == 0);
228 }
229
230 /* 8. */
231 if (vstride == 0 && hstride == 0) {
232 assert(width == 1);
233 }
234
235 /* 10. Check destination issues. */
236 }
237
238 void
239 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240 struct brw_reg reg)
241 {
242 struct brw_context *brw = p->brw;
243
244 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
245 assert(reg.nr < 128);
246
247 gen7_convert_mrf_to_grf(p, &reg);
248
249 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
250 insn->header.opcode == BRW_OPCODE_SENDC)) {
251 /* Any source modifiers or regions will be ignored, since this just
252 * identifies the MRF/GRF to start reading the message contents from.
253 * Check for some likely failures.
254 */
255 assert(!reg.negate);
256 assert(!reg.abs);
257 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
258 }
259
260 validate_reg(insn, reg);
261
262 insn->bits1.da1.src0_reg_file = reg.file;
263 insn->bits1.da1.src0_reg_type = reg.type;
264 insn->bits2.da1.src0_abs = reg.abs;
265 insn->bits2.da1.src0_negate = reg.negate;
266 insn->bits2.da1.src0_address_mode = reg.address_mode;
267
268 if (reg.file == BRW_IMMEDIATE_VALUE) {
269 insn->bits3.ud = reg.dw1.ud;
270
271 /* Required to set some fields in src1 as well:
272 */
273 insn->bits1.da1.src1_reg_file = 0; /* arf */
274 insn->bits1.da1.src1_reg_type = reg.type;
275 }
276 else
277 {
278 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279 if (insn->header.access_mode == BRW_ALIGN_1) {
280 insn->bits2.da1.src0_subreg_nr = reg.subnr;
281 insn->bits2.da1.src0_reg_nr = reg.nr;
282 }
283 else {
284 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
285 insn->bits2.da16.src0_reg_nr = reg.nr;
286 }
287 }
288 else {
289 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
290
291 if (insn->header.access_mode == BRW_ALIGN_1) {
292 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
293 }
294 else {
295 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
296 }
297 }
298
299 if (insn->header.access_mode == BRW_ALIGN_1) {
300 if (reg.width == BRW_WIDTH_1 &&
301 insn->header.execution_size == BRW_EXECUTE_1) {
302 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
303 insn->bits2.da1.src0_width = BRW_WIDTH_1;
304 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
305 }
306 else {
307 insn->bits2.da1.src0_horiz_stride = reg.hstride;
308 insn->bits2.da1.src0_width = reg.width;
309 insn->bits2.da1.src0_vert_stride = reg.vstride;
310 }
311 }
312 else {
313 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
314 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
315 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
316 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
317
318 /* This is an oddity of the fact we're using the same
319 * descriptions for registers in align_16 as align_1:
320 */
321 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
322 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
323 else
324 insn->bits2.da16.src0_vert_stride = reg.vstride;
325 }
326 }
327 }
328
329
330 void brw_set_src1(struct brw_compile *p,
331 struct brw_instruction *insn,
332 struct brw_reg reg)
333 {
334 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
335
336 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
337 assert(reg.nr < 128);
338
339 gen7_convert_mrf_to_grf(p, &reg);
340
341 validate_reg(insn, reg);
342
343 insn->bits1.da1.src1_reg_file = reg.file;
344 insn->bits1.da1.src1_reg_type = reg.type;
345 insn->bits3.da1.src1_abs = reg.abs;
346 insn->bits3.da1.src1_negate = reg.negate;
347
348 /* Only src1 can be immediate in two-argument instructions.
349 */
350 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
351
352 if (reg.file == BRW_IMMEDIATE_VALUE) {
353 insn->bits3.ud = reg.dw1.ud;
354 }
355 else {
356 /* This is a hardware restriction, which may or may not be lifted
357 * in the future:
358 */
359 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
360 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
361
362 if (insn->header.access_mode == BRW_ALIGN_1) {
363 insn->bits3.da1.src1_subreg_nr = reg.subnr;
364 insn->bits3.da1.src1_reg_nr = reg.nr;
365 }
366 else {
367 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
368 insn->bits3.da16.src1_reg_nr = reg.nr;
369 }
370
371 if (insn->header.access_mode == BRW_ALIGN_1) {
372 if (reg.width == BRW_WIDTH_1 &&
373 insn->header.execution_size == BRW_EXECUTE_1) {
374 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
375 insn->bits3.da1.src1_width = BRW_WIDTH_1;
376 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
377 }
378 else {
379 insn->bits3.da1.src1_horiz_stride = reg.hstride;
380 insn->bits3.da1.src1_width = reg.width;
381 insn->bits3.da1.src1_vert_stride = reg.vstride;
382 }
383 }
384 else {
385 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
386 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
387 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
388 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
389
390 /* This is an oddity of the fact we're using the same
391 * descriptions for registers in align_16 as align_1:
392 */
393 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
394 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
395 else
396 insn->bits3.da16.src1_vert_stride = reg.vstride;
397 }
398 }
399 }
400
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \param sfid            shared function (message target) addressed by the send.
 * \param msg_length      payload length, in registers.
 * \param response_length writeback length, in registers.
 * \param header_present  true if the payload begins with a message header.
 * \param end_of_thread   true if this send terminates the thread.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Clear the descriptor (bits3) by encoding an immediate 0 as src1. */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
443
444 static void brw_set_math_message( struct brw_compile *p,
445 struct brw_instruction *insn,
446 GLuint function,
447 GLuint integer_type,
448 bool low_precision,
449 GLuint dataType )
450 {
451 struct brw_context *brw = p->brw;
452 unsigned msg_length;
453 unsigned response_length;
454
455 /* Infer message length from the function */
456 switch (function) {
457 case BRW_MATH_FUNCTION_POW:
458 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
459 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461 msg_length = 2;
462 break;
463 default:
464 msg_length = 1;
465 break;
466 }
467
468 /* Infer response length from the function */
469 switch (function) {
470 case BRW_MATH_FUNCTION_SINCOS:
471 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
472 response_length = 2;
473 break;
474 default:
475 response_length = 1;
476 break;
477 }
478
479
480 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
481 msg_length, response_length, false, false);
482 if (brw->gen == 5) {
483 insn->bits3.math_gen5.function = function;
484 insn->bits3.math_gen5.int_type = integer_type;
485 insn->bits3.math_gen5.precision = low_precision;
486 insn->bits3.math_gen5.saturate = insn->header.saturate;
487 insn->bits3.math_gen5.data_type = dataType;
488 insn->bits3.math_gen5.snapshot = 0;
489 } else {
490 insn->bits3.math.function = function;
491 insn->bits3.math.int_type = integer_type;
492 insn->bits3.math.precision = low_precision;
493 insn->bits3.math.saturate = insn->header.saturate;
494 insn->bits3.math.data_type = dataType;
495 }
496 insn->header.saturate = 0;
497 }
498
499
500 static void brw_set_ff_sync_message(struct brw_compile *p,
501 struct brw_instruction *insn,
502 bool allocate,
503 GLuint response_length,
504 bool end_of_thread)
505 {
506 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
507 1, response_length, true, end_of_thread);
508 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
509 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
510 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
511 insn->bits3.urb_gen5.allocate = allocate;
512 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
513 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
514 }
515
/* Fill in the descriptor for a URB write message, using the layout
 * appropriate for the target generation (gen7, gen5/6, or gen4).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 GLuint msg_length,
				 GLuint response_length,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
553
/**
 * Fill in the descriptor for a data port write message.
 *
 * The shared function (SFID) the message is routed to depends on the
 * generation: gen7 splits render-target writes onto the render cache,
 * gen6 routes all writes through the render cache, and older parts use
 * the dedicated dataport-write SFID.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* The function-control bit layout also varies per generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
611
/**
 * Fill in the descriptor for a data port read message.
 *
 * The target SFID depends on the generation: gen7 uses the data cache,
 * gen6 picks the render or sampler cache based on \p target_cache, and
 * older parts use the dedicated dataport-read SFID.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* The function-control bit layout varies per generation; the trailing
    * comments give the bit positions of each field where noted.
    */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
668
/**
 * Fill in the descriptor for a sampler message, using the layout
 * appropriate for the target generation (gen7, gen5/6, g4x, or gen4).
 *
 * \note return_format is only encoded on original gen4; later layouts
 *       have no such field.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
707
708
709 #define next_insn brw_next_insn
710 struct brw_instruction *
711 brw_next_insn(struct brw_compile *p, GLuint opcode)
712 {
713 struct brw_instruction *insn;
714
715 if (p->nr_insn + 1 > p->store_size) {
716 if (0)
717 printf("incresing the store size to %d\n", p->store_size << 1);
718 p->store_size <<= 1;
719 p->store = reralloc(p->mem_ctx, p->store,
720 struct brw_instruction, p->store_size);
721 if (!p->store)
722 assert(!"realloc eu store memeory failed");
723 }
724
725 p->next_insn_offset += 16;
726 insn = &p->store[p->nr_insn++];
727 memcpy(insn, p->current, sizeof(*insn));
728
729 /* Reset this one-shot flag:
730 */
731
732 if (p->current->header.destreg__conditionalmod) {
733 p->current->header.destreg__conditionalmod = 0;
734 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
735 }
736
737 insn->header.opcode = opcode;
738 return insn;
739 }
740
741 static struct brw_instruction *brw_alu1( struct brw_compile *p,
742 GLuint opcode,
743 struct brw_reg dest,
744 struct brw_reg src )
745 {
746 struct brw_instruction *insn = next_insn(p, opcode);
747 brw_set_dest(p, insn, dest);
748 brw_set_src0(p, insn, src);
749 return insn;
750 }
751
752 static struct brw_instruction *brw_alu2(struct brw_compile *p,
753 GLuint opcode,
754 struct brw_reg dest,
755 struct brw_reg src0,
756 struct brw_reg src1 )
757 {
758 struct brw_instruction *insn = next_insn(p, opcode);
759 brw_set_dest(p, insn, dest);
760 brw_set_src0(p, insn, src0);
761 brw_set_src1(p, insn, src1);
762 return insn;
763 }
764
765 static int
766 get_3src_subreg_nr(struct brw_reg reg)
767 {
768 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
769 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
770 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
771 } else {
772 return reg.subnr / 4;
773 }
774 }
775
/**
 * Emit a three-source instruction (MAD, LRP, BFE, BFI2).
 *
 * 3-src instructions use a dedicated compact encoding: align16 access
 * mode only, GRF sources only (GRF or MRF destination), direct
 * addressing only, and a single shared type field for all operands.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit here: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
	 break;
      case BRW_REGISTER_TYPE_D:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
	 break;
      case BRW_REGISTER_TYPE_UD:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
	 break;
      }
   }

   return insn;
}
859
860
/***********************************************************************
 * Convenience routines.
 */

/* Define brw_<OP>() emitting a one-source instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP>() emitting a two-source instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP>() emitting a three-source instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Same as ALU3, but asserts all four operands are float-typed. */
#define ALU3F(OP)                                               \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
929
930
/* Instantiate the convenience emitters for each opcode. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

ROUND(RNDZ)
ROUND(RNDE)
967
968
969 struct brw_instruction *brw_ADD(struct brw_compile *p,
970 struct brw_reg dest,
971 struct brw_reg src0,
972 struct brw_reg src1)
973 {
974 /* 6.2.2: add */
975 if (src0.type == BRW_REGISTER_TYPE_F ||
976 (src0.file == BRW_IMMEDIATE_VALUE &&
977 src0.type == BRW_REGISTER_TYPE_VF)) {
978 assert(src1.type != BRW_REGISTER_TYPE_UD);
979 assert(src1.type != BRW_REGISTER_TYPE_D);
980 }
981
982 if (src1.type == BRW_REGISTER_TYPE_F ||
983 (src1.file == BRW_IMMEDIATE_VALUE &&
984 src1.type == BRW_REGISTER_TYPE_VF)) {
985 assert(src0.type != BRW_REGISTER_TYPE_UD);
986 assert(src0.type != BRW_REGISTER_TYPE_D);
987 }
988
989 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
990 }
991
992 struct brw_instruction *brw_AVG(struct brw_compile *p,
993 struct brw_reg dest,
994 struct brw_reg src0,
995 struct brw_reg src1)
996 {
997 assert(dest.type == src0.type);
998 assert(src0.type == src1.type);
999 switch (src0.type) {
1000 case BRW_REGISTER_TYPE_B:
1001 case BRW_REGISTER_TYPE_UB:
1002 case BRW_REGISTER_TYPE_W:
1003 case BRW_REGISTER_TYPE_UW:
1004 case BRW_REGISTER_TYPE_D:
1005 case BRW_REGISTER_TYPE_UD:
1006 break;
1007 default:
1008 assert(!"Bad type for brw_AVG");
1009 }
1010
1011 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1012 }
1013
/* Emit a MUL instruction, asserting the PRM's type restrictions for mul
 * ("6.32.38: mul"): no float destination with 32-bit integer sources, no
 * float source paired with a 32-bit integer source, and neither source
 * may be the accumulator.
 */
struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      /* Integer multiply may not write a float destination. */
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      /* Float (or VF-immediate) src0 forbids 32-bit integer src1. */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      /* Float (or VF-immediate) src1 forbids 32-bit integer src0. */
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL may not read the accumulator as a source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1048
1049
1050 void brw_NOP(struct brw_compile *p)
1051 {
1052 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1053 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1054 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1055 brw_set_src1(p, insn, brw_imm_ud(0x0));
1056 }
1057
1058
1059
1060
1061
1062 /***********************************************************************
1063 * Comparisons, if/else/endif
1064 */
1065
1066 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1067 struct brw_reg dest,
1068 struct brw_reg src0,
1069 struct brw_reg src1)
1070 {
1071 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1072
1073 insn->header.execution_size = 1;
1074 insn->header.compression_control = BRW_COMPRESSION_NONE;
1075 insn->header.mask_control = BRW_MASK_DISABLE;
1076
1077 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1078
1079 return insn;
1080 }
1081
1082 static void
1083 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1084 {
1085 p->if_stack[p->if_stack_depth] = inst - p->store;
1086
1087 p->if_stack_depth++;
1088 if (p->if_stack_array_size <= p->if_stack_depth) {
1089 p->if_stack_array_size *= 2;
1090 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1091 p->if_stack_array_size);
1092 }
1093 }
1094
1095 static struct brw_instruction *
1096 pop_if_stack(struct brw_compile *p)
1097 {
1098 p->if_stack_depth--;
1099 return &p->store[p->if_stack[p->if_stack_depth]];
1100 }
1101
1102 static void
1103 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1104 {
1105 if (p->loop_stack_array_size < p->loop_stack_depth) {
1106 p->loop_stack_array_size *= 2;
1107 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1108 p->loop_stack_array_size);
1109 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1110 p->loop_stack_array_size);
1111 }
1112
1113 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1114 p->loop_stack_depth++;
1115 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1116 }
1117
1118 static struct brw_instruction *
1119 get_inner_do_insn(struct brw_compile *p)
1120 {
1121 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1122 }
1123
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Gen4-5: IF operates on IP with an embedded jump count, patched
       * later by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: the branch offset lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: IF carries JIP/UIP fields, filled in by patch_IF_ELSE(). */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Flow-control instructions get a thread switch, except in single
    * program flow mode.
    */
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The predicate is consumed by the IF itself; don't let it leak into
    * subsequent instructions.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1177
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The branch offset lives in the destination immediate; it is zero
    * here and patched later via patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The comparison is folded into the IF via the conditional modifier. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1209
1210 /**
1211 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1212 */
1213 static void
1214 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1215 struct brw_instruction *if_inst,
1216 struct brw_instruction *else_inst)
1217 {
1218 /* The next instruction (where the ENDIF would be, if it existed) */
1219 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1220
1221 assert(p->single_program_flow);
1222 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1223 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1224 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1225
1226 /* Convert IF to an ADD instruction that moves the instruction pointer
1227 * to the first instruction of the ELSE block. If there is no ELSE
1228 * block, point to where ENDIF would be. Reverse the predicate.
1229 *
1230 * There's no need to execute an ENDIF since we don't need to do any
1231 * stack operations, and if we're currently executing, we just want to
1232 * continue normally.
1233 */
1234 if_inst->header.opcode = BRW_OPCODE_ADD;
1235 if_inst->header.predicate_inverse = 1;
1236
1237 if (else_inst != NULL) {
1238 /* Convert ELSE to an ADD instruction that points where the ENDIF
1239 * would be.
1240 */
1241 else_inst->header.opcode = BRW_OPCODE_ADD;
1242
1243 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1244 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1245 } else {
1246 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1247 }
1248 }
1249
1250 /**
1251 * Patch IF and ELSE instructions with appropriate jump targets.
1252 */
1253 static void
1254 patch_IF_ELSE(struct brw_compile *p,
1255 struct brw_instruction *if_inst,
1256 struct brw_instruction *else_inst,
1257 struct brw_instruction *endif_inst)
1258 {
1259 struct brw_context *brw = p->brw;
1260
1261 /* We shouldn't be patching IF and ELSE instructions in single program flow
1262 * mode when gen < 6, because in single program flow mode on those
1263 * platforms, we convert flow control instructions to conditional ADDs that
1264 * operate on IP (see brw_ENDIF).
1265 *
1266 * However, on Gen6, writing to IP doesn't work in single program flow mode
1267 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1268 * not be updated by non-flow control instructions."). And on later
1269 * platforms, there is no significant benefit to converting control flow
1270 * instructions to conditional ADDs. So we do patch IF and ELSE
1271 * instructions in single program flow mode on those platforms.
1272 */
1273 if (brw->gen < 6)
1274 assert(!p->single_program_flow);
1275
1276 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1277 assert(endif_inst != NULL);
1278 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1279
1280 unsigned br = 1;
1281 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1282 * requires 2 chunks.
1283 */
1284 if (brw->gen >= 5)
1285 br = 2;
1286
1287 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1288 endif_inst->header.execution_size = if_inst->header.execution_size;
1289
1290 if (else_inst == NULL) {
1291 /* Patch IF -> ENDIF */
1292 if (brw->gen < 6) {
1293 /* Turn it into an IFF, which means no mask stack operations for
1294 * all-false and jumping past the ENDIF.
1295 */
1296 if_inst->header.opcode = BRW_OPCODE_IFF;
1297 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1298 if_inst->bits3.if_else.pop_count = 0;
1299 if_inst->bits3.if_else.pad0 = 0;
1300 } else if (brw->gen == 6) {
1301 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1302 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1303 } else {
1304 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1305 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1306 }
1307 } else {
1308 else_inst->header.execution_size = if_inst->header.execution_size;
1309
1310 /* Patch IF -> ELSE */
1311 if (brw->gen < 6) {
1312 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1313 if_inst->bits3.if_else.pop_count = 0;
1314 if_inst->bits3.if_else.pad0 = 0;
1315 } else if (brw->gen == 6) {
1316 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1317 }
1318
1319 /* Patch ELSE -> ENDIF */
1320 if (brw->gen < 6) {
1321 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1322 * matching ENDIF.
1323 */
1324 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1325 else_inst->bits3.if_else.pop_count = 1;
1326 else_inst->bits3.if_else.pad0 = 0;
1327 } else if (brw->gen == 6) {
1328 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1329 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1330 } else {
1331 /* The IF instruction's JIP should point just past the ELSE */
1332 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1333 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1334 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1335 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1336 }
1337 }
1338 }
1339
/* Emit an ELSE instruction.
 *
 * Jump targets are left zero here; they are filled in by patch_IF_ELSE()
 * when brw_ENDIF() is reached.  The ELSE is pushed onto the if-stack so
 * brw_ENDIF() can find it.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Gen4-5: ELSE operates on IP with an embedded jump count. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: branch offset lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: ELSE carries JIP/UIP fields. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1372
/* Close the innermost IF (and optional ELSE) block.
 *
 * Pops the IF/ELSE from the if-stack, emits an ENDIF (unless we're in
 * pre-gen6 SPF mode, where the IF/ELSE become conditional ADDs instead),
 * and patches the jump targets of the whole construct.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      /* Gen4-5: ENDIF reads/writes g0 and pops the mask stack (below). */
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1451
/* Emit a BREAK instruction.
 *
 * Jump offsets are left zero: pre-gen6 BREAKs are patched later by
 * brw_patch_break_cont(); gen6+ BREAKs are handled by brw_set_uip_jip()
 * (see the comment above brw_patch_break_cont()).  Pre-gen6, the pop
 * count lets the EU unwind the if-stack entries of any IFs nested inside
 * the loop.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1474
1475 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1476 {
1477 struct brw_instruction *insn;
1478
1479 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1480 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1481 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1482 brw_set_dest(p, insn, brw_ip_reg());
1483 brw_set_src0(p, insn, brw_ip_reg());
1484 brw_set_src1(p, insn, brw_imm_d(0x0));
1485
1486 insn->header.compression_control = BRW_COMPRESSION_NONE;
1487 insn->header.execution_size = BRW_EXECUTE_8;
1488 return insn;
1489 }
1490
/* Emit a pre-gen6 CONTINUE instruction operating on IP.
 *
 * The jump count is left zero and patched by brw_patch_break_cont() when
 * the enclosing WHILE is emitted.  The pop count unwinds the if-stack
 * entries of any IFs nested inside the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1505
/* Emit a HALT instruction (gen6+ encoding).
 *
 * The UIP/JIP offsets in src1 are zero here and updated later by the
 * caller.  NOTE(review): the compressed path leaves compression_control
 * at its current default rather than forcing it — presumably intentional
 * for SIMD16; confirm against callers.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1523
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; record the position of the loop's
       * first instruction so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1567
1568 /**
1569 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1570 * instruction here.
1571 *
1572 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1573 * nesting, since it can always just point to the end of the block/current loop.
1574 */
1575 static void
1576 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1577 {
1578 struct brw_context *brw = p->brw;
1579 struct brw_instruction *do_inst = get_inner_do_insn(p);
1580 struct brw_instruction *inst;
1581 int br = (brw->gen == 5) ? 2 : 1;
1582
1583 for (inst = while_inst - 1; inst != do_inst; inst--) {
1584 /* If the jump count is != 0, that means that this instruction has already
1585 * been patched because it's part of a loop inside of the one we're
1586 * patching.
1587 */
1588 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1589 inst->bits3.if_else.jump_count == 0) {
1590 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1591 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1592 inst->bits3.if_else.jump_count == 0) {
1593 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1594 }
1595 }
1596 }
1597
/* Close the innermost DO/WHILE loop.
 *
 * Emits the WHILE that jumps back to the loop's DO (or, pre-gen6 in
 * single-program-flow mode, an ADD to IP), patches pre-gen6 BREAK/CONT
 * instructions inside the loop, and pops the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Gen5+ jump counts are in 64-bit chunks: 2 per 128-bit instruction. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* Gen7+: the backward branch lives in the JIP field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      /* Gen6: the backward branch lives in the destination immediate. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: a plain ADD to IP jumps back (16 bytes/instruction). */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Point any unpatched BREAK/CONT in the loop at this WHILE. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   /* The loop is closed; pop it off the loop stack. */
   p->loop_stack_depth--;

   return insn;
}
1661
1662
1663 /* FORWARD JUMPS:
1664 */
1665 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1666 {
1667 struct brw_context *brw = p->brw;
1668 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1669 GLuint jmpi = 1;
1670
1671 if (brw->gen >= 5)
1672 jmpi = 2;
1673
1674 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1675 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1676
1677 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1678 }
1679
1680
1681
1682 /* To integrate with the above, it makes sense that the comparison
1683 * instruction should populate the flag register. It might be simpler
1684 * just to use the flag reg for most WM tasks?
1685 */
/* Emit a CMP instruction with the given conditional modifier, writing
 * the per-channel result to \p dest and updating the flag register.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1727
1728 /* Issue 'wait' instruction for n1, host could program MMIO
1729 to wake up thread. */
1730 void brw_WAIT (struct brw_compile *p)
1731 {
1732 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1733 struct brw_reg src = brw_notification_1_reg();
1734
1735 brw_set_dest(p, insn, src);
1736 brw_set_src0(p, insn, src);
1737 brw_set_src1(p, insn, brw_null_reg());
1738 insn->header.execution_size = 0; /* must */
1739 insn->header.predicate_control = 0;
1740 insn->header.compression_control = 0;
1741 }
1742
1743
1744 /***********************************************************************
1745 * Helpers for the various SEND message types:
1746 */
1747
/** Extended math function, float[8].
 *
 * On gen6+ this emits a native MATH instruction; on earlier gens the
 * request is routed through a SEND to the shared math unit using message
 * register \p msg_reg_nr (which is unused on gen6+).
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* The INT DIV functions take integer sources; all others float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1812
/** Extended math function, float[8].
 *
 * Two-source variant of brw_math() for the gen6+ native MATH
 * instruction (e.g. POW, INT DIV) — no pre-gen6 SEND path here.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* The INT DIV functions take integer sources; all others float. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1862
1863
1864 /**
1865 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1866 * using a constant offset per channel.
1867 *
1868 * The offset must be aligned to oword size (16 bytes). Used for
1869 * register spilling.
1870 */
1871 void brw_oword_block_write_scratch(struct brw_compile *p,
1872 struct brw_reg mrf,
1873 int num_regs,
1874 GLuint offset)
1875 {
1876 struct brw_context *brw = p->brw;
1877 uint32_t msg_control, msg_type;
1878 int mlen;
1879
1880 if (brw->gen >= 6)
1881 offset /= 16;
1882
1883 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1884
1885 if (num_regs == 1) {
1886 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1887 mlen = 2;
1888 } else {
1889 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1890 mlen = 3;
1891 }
1892
1893 /* Set up the message header. This is g0, with g0.2 filled with
1894 * the offset. We don't want to leave our offset around in g0 or
1895 * it'll screw up texture samples, so set it up inside the message
1896 * reg.
1897 */
1898 {
1899 brw_push_insn_state(p);
1900 brw_set_mask_control(p, BRW_MASK_DISABLE);
1901 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1902
1903 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1904
1905 /* set message header global offset field (reg 0, element 2) */
1906 brw_MOV(p,
1907 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1908 mrf.nr,
1909 2), BRW_REGISTER_TYPE_UD),
1910 brw_imm_ud(offset));
1911
1912 brw_pop_insn_state(p);
1913 }
1914
1915 {
1916 struct brw_reg dest;
1917 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1918 int send_commit_msg;
1919 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1920 BRW_REGISTER_TYPE_UW);
1921
1922 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1923 insn->header.compression_control = BRW_COMPRESSION_NONE;
1924 src_header = vec16(src_header);
1925 }
1926 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1927 insn->header.destreg__conditionalmod = mrf.nr;
1928
1929 /* Until gen6, writes followed by reads from the same location
1930 * are not guaranteed to be ordered unless write_commit is set.
1931 * If set, then a no-op write is issued to the destination
1932 * register to set a dependency, and a read from the destination
1933 * can be used to ensure the ordering.
1934 *
1935 * For gen6, only writes between different threads need ordering
1936 * protection. Our use of DP writes is all about register
1937 * spilling within a thread.
1938 */
1939 if (brw->gen >= 6) {
1940 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1941 send_commit_msg = 0;
1942 } else {
1943 dest = src_header;
1944 send_commit_msg = 1;
1945 }
1946
1947 brw_set_dest(p, insn, dest);
1948 if (brw->gen >= 6) {
1949 brw_set_src0(p, insn, mrf);
1950 } else {
1951 brw_set_src0(p, insn, brw_null_reg());
1952 }
1953
1954 if (brw->gen >= 6)
1955 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1956 else
1957 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1958
1959 brw_set_dp_write_message(p,
1960 insn,
1961 255, /* binding table index (255=stateless) */
1962 msg_control,
1963 msg_type,
1964 mlen,
1965 true, /* header_present */
1966 0, /* not a render target */
1967 send_commit_msg, /* response_length */
1968 0, /* eot */
1969 send_commit_msg);
1970 }
1971 }
1972
1973
1974 /**
1975 * Read a block of owords (half a GRF each) from the scratch buffer
1976 * using a constant index per channel.
1977 *
1978 * Offset must be aligned to oword size (16 bytes). Used for register
1979 * spilling.
1980 */
1981 void
1982 brw_oword_block_read_scratch(struct brw_compile *p,
1983 struct brw_reg dest,
1984 struct brw_reg mrf,
1985 int num_regs,
1986 GLuint offset)
1987 {
1988 struct brw_context *brw = p->brw;
1989 uint32_t msg_control;
1990 int rlen;
1991
1992 if (brw->gen >= 6)
1993 offset /= 16;
1994
1995 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1996 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1997
1998 if (num_regs == 1) {
1999 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2000 rlen = 1;
2001 } else {
2002 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2003 rlen = 2;
2004 }
2005
2006 {
2007 brw_push_insn_state(p);
2008 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2009 brw_set_mask_control(p, BRW_MASK_DISABLE);
2010
2011 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2012
2013 /* set message header global offset field (reg 0, element 2) */
2014 brw_MOV(p,
2015 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2016 mrf.nr,
2017 2), BRW_REGISTER_TYPE_UD),
2018 brw_imm_ud(offset));
2019
2020 brw_pop_insn_state(p);
2021 }
2022
2023 {
2024 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2025
2026 assert(insn->header.predicate_control == 0);
2027 insn->header.compression_control = BRW_COMPRESSION_NONE;
2028 insn->header.destreg__conditionalmod = mrf.nr;
2029
2030 brw_set_dest(p, insn, dest); /* UW? */
2031 if (brw->gen >= 6) {
2032 brw_set_src0(p, insn, mrf);
2033 } else {
2034 brw_set_src0(p, insn, brw_null_reg());
2035 }
2036
2037 brw_set_dp_read_message(p,
2038 insn,
2039 255, /* binding table index (255=stateless) */
2040 msg_control,
2041 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2042 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2043 1, /* msg_length */
2044 true, /* header_present */
2045 rlen);
2046 }
2047 }
2048
2049 /**
2050 * Read a float[4] vector from the data port Data Cache (const buffer).
2051 * Location (in buffer) should be a multiple of 16.
2052 * Used for fetching shader constants.
2053 */
2054 void brw_oword_block_read(struct brw_compile *p,
2055 struct brw_reg dest,
2056 struct brw_reg mrf,
2057 uint32_t offset,
2058 uint32_t bind_table_index)
2059 {
2060 struct brw_context *brw = p->brw;
2061
2062 /* On newer hardware, offset is in units of owords. */
2063 if (brw->gen >= 6)
2064 offset /= 16;
2065
2066 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2067
2068 brw_push_insn_state(p);
2069 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2070 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2071 brw_set_mask_control(p, BRW_MASK_DISABLE);
2072
2073 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2074
2075 /* set message header global offset field (reg 0, element 2) */
2076 brw_MOV(p,
2077 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2078 mrf.nr,
2079 2), BRW_REGISTER_TYPE_UD),
2080 brw_imm_ud(offset));
2081
2082 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2083 insn->header.destreg__conditionalmod = mrf.nr;
2084
2085 /* cast dest to a uword[8] vector */
2086 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2087
2088 brw_set_dest(p, insn, dest);
2089 if (brw->gen >= 6) {
2090 brw_set_src0(p, insn, mrf);
2091 } else {
2092 brw_set_src0(p, insn, brw_null_reg());
2093 }
2094
2095 brw_set_dp_read_message(p,
2096 insn,
2097 bind_table_index,
2098 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2099 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2100 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2101 1, /* msg_length */
2102 true, /* header_present */
2103 1); /* response_length (1 reg, 2 owords!) */
2104
2105 brw_pop_insn_state(p);
2106 }
2107
2108
2109 void brw_fb_WRITE(struct brw_compile *p,
2110 int dispatch_width,
2111 GLuint msg_reg_nr,
2112 struct brw_reg src0,
2113 GLuint msg_control,
2114 GLuint binding_table_index,
2115 GLuint msg_length,
2116 GLuint response_length,
2117 bool eot,
2118 bool header_present)
2119 {
2120 struct brw_context *brw = p->brw;
2121 struct brw_instruction *insn;
2122 GLuint msg_type;
2123 struct brw_reg dest;
2124
2125 if (dispatch_width == 16)
2126 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2127 else
2128 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2129
2130 if (brw->gen >= 6) {
2131 insn = next_insn(p, BRW_OPCODE_SENDC);
2132 } else {
2133 insn = next_insn(p, BRW_OPCODE_SEND);
2134 }
2135 /* The execution mask is ignored for render target writes. */
2136 insn->header.predicate_control = 0;
2137 insn->header.compression_control = BRW_COMPRESSION_NONE;
2138
2139 if (brw->gen >= 6) {
2140 /* headerless version, just submit color payload */
2141 src0 = brw_message_reg(msg_reg_nr);
2142
2143 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2144 } else {
2145 insn->header.destreg__conditionalmod = msg_reg_nr;
2146
2147 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2148 }
2149
2150 brw_set_dest(p, insn, dest);
2151 brw_set_src0(p, insn, src0);
2152 brw_set_dp_write_message(p,
2153 insn,
2154 binding_table_index,
2155 msg_control,
2156 msg_type,
2157 msg_length,
2158 header_present,
2159 eot, /* last render target write */
2160 response_length,
2161 eot,
2162 0 /* send_commit_msg */);
2163 }
2164
2165
2166 /**
2167 * Texture sample instruction.
2168 * Note: the msg_type plus msg_length values determine exactly what kind
2169 * of sampling operation is performed. See volume 4, page 161 of docs.
2170 */
2171 void brw_SAMPLE(struct brw_compile *p,
2172 struct brw_reg dest,
2173 GLuint msg_reg_nr,
2174 struct brw_reg src0,
2175 GLuint binding_table_index,
2176 GLuint sampler,
2177 GLuint msg_type,
2178 GLuint response_length,
2179 GLuint msg_length,
2180 GLuint header_present,
2181 GLuint simd_mode,
2182 GLuint return_format)
2183 {
2184 struct brw_context *brw = p->brw;
2185 struct brw_instruction *insn;
2186
2187 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2188
2189 insn = next_insn(p, BRW_OPCODE_SEND);
2190 insn->header.predicate_control = 0; /* XXX */
2191 insn->header.compression_control = BRW_COMPRESSION_NONE;
2192 if (brw->gen < 6)
2193 insn->header.destreg__conditionalmod = msg_reg_nr;
2194
2195 brw_set_dest(p, insn, dest);
2196 brw_set_src0(p, insn, src0);
2197 brw_set_sampler_message(p, insn,
2198 binding_table_index,
2199 sampler,
2200 msg_type,
2201 response_length,
2202 msg_length,
2203 header_present,
2204 simd_mode,
2205 return_format);
2206 }
2207
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   GLuint msg_length,
		   GLuint response_length,
		   GLuint offset,
		   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number is encoded in the instruction. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2258
2259 static int
2260 next_ip(struct brw_compile *p, int ip)
2261 {
2262 struct brw_instruction *insn = (void *)p->store + ip;
2263
2264 if (insn->header.cmpt_control)
2265 return ip + 8;
2266 else
2267 return ip + 16;
2268 }
2269
2270 static int
2271 brw_find_next_block_end(struct brw_compile *p, int start)
2272 {
2273 int ip;
2274 void *store = p->store;
2275
2276 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2277 struct brw_instruction *insn = store + ip;
2278
2279 switch (insn->header.opcode) {
2280 case BRW_OPCODE_ENDIF:
2281 case BRW_OPCODE_ELSE:
2282 case BRW_OPCODE_WHILE:
2283 case BRW_OPCODE_HALT:
2284 return ip;
2285 }
2286 }
2287
2288 return 0;
2289 }
2290
2291 /* There is no DO instruction on gen6, so to find the end of the loop
2292 * we have to see if the loop is jumping back before our start
2293 * instruction.
2294 */
2295 static int
2296 brw_find_loop_end(struct brw_compile *p, int start)
2297 {
2298 struct brw_context *brw = p->brw;
2299 int ip;
2300 int scale = 8;
2301 void *store = p->store;
2302
2303 /* Always start after the instruction (such as a WHILE) we're trying to fix
2304 * up.
2305 */
2306 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2307 struct brw_instruction *insn = store + ip;
2308
2309 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2310 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2311 : insn->bits3.break_cont.jip;
2312 if (ip + jip * scale <= start)
2313 return ip;
2314 }
2315 }
2316 assert(!"not reached");
2317 return start;
2318 }
2319
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int ip;
   int scale = 8;   /* JIP/UIP fields are in units of 8 bytes */
   void *store = p->store;

   /* UIP/JIP fixup only applies to the gen6+ flow-control encoding. */
   if (brw->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         /* JIP: jump to the end of the enclosing control-flow block. */
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* UIP: jump to the WHILE at the end of the loop. */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* With no block end in sight, jip of 2 (= 16 bytes with scale 8)
          * targets the immediately following instruction.
          */
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
	 if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2395
2396 void brw_ff_sync(struct brw_compile *p,
2397 struct brw_reg dest,
2398 GLuint msg_reg_nr,
2399 struct brw_reg src0,
2400 bool allocate,
2401 GLuint response_length,
2402 bool eot)
2403 {
2404 struct brw_context *brw = p->brw;
2405 struct brw_instruction *insn;
2406
2407 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2408
2409 insn = next_insn(p, BRW_OPCODE_SEND);
2410 brw_set_dest(p, insn, dest);
2411 brw_set_src0(p, insn, src0);
2412 brw_set_src1(p, insn, brw_imm_d(0));
2413
2414 if (brw->gen < 6)
2415 insn->header.destreg__conditionalmod = msg_reg_nr;
2416
2417 brw_set_ff_sync_message(p,
2418 insn,
2419 allocate,
2420 response_length,
2421 eot);
2422 }
2423
2424 /**
2425 * Emit the SEND instruction necessary to generate stream output data on Gen6
2426 * (for transform feedback).
2427 *
2428 * If send_commit_msg is true, this is the last piece of stream output data
2429 * from this thread, so send the data as a committed write. According to the
2430 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2431 *
2432 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2433 * writes are complete by sending the final write as a committed write."
2434 */
2435 void
2436 brw_svb_write(struct brw_compile *p,
2437 struct brw_reg dest,
2438 GLuint msg_reg_nr,
2439 struct brw_reg src0,
2440 GLuint binding_table_index,
2441 bool send_commit_msg)
2442 {
2443 struct brw_instruction *insn;
2444
2445 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2446
2447 insn = next_insn(p, BRW_OPCODE_SEND);
2448 brw_set_dest(p, insn, dest);
2449 brw_set_src0(p, insn, src0);
2450 brw_set_src1(p, insn, brw_imm_d(0));
2451 brw_set_dp_write_message(p, insn,
2452 binding_table_index,
2453 0, /* msg_control: ignored */
2454 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2455 1, /* msg_length */
2456 true, /* header_present */
2457 0, /* last_render_target: ignored */
2458 send_commit_msg, /* response_length */
2459 0, /* end_of_thread */
2460 send_commit_msg); /* send_commit_msg */
2461 }
2462
2463 /**
2464 * This instruction is generated as a single-channel align1 instruction by
2465 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2466 *
2467 * We can't use the typed atomic op in the FS because that has the execution
2468 * mask ANDed with the pixel mask, but we just want to write the one dword for
2469 * all the pixels.
2470 *
2471 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2472 * one u32. So we use the same untyped atomic write message as the pixel
2473 * shader.
2474 *
2475 * The untyped atomic operation requires a BUFFER surface type with RAW
2476 * format, and is only accessible through the legacy DATA_CACHE dataport
2477 * messages.
2478 */
2479 void brw_shader_time_add(struct brw_compile *p,
2480 struct brw_reg payload,
2481 uint32_t surf_index)
2482 {
2483 struct brw_context *brw = p->brw;
2484 assert(brw->gen >= 7);
2485
2486 brw_push_insn_state(p);
2487 brw_set_access_mode(p, BRW_ALIGN_1);
2488 brw_set_mask_control(p, BRW_MASK_DISABLE);
2489 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2490 brw_pop_insn_state(p);
2491
2492 /* We use brw_vec1_reg and unmasked because we want to increment the given
2493 * offset only once.
2494 */
2495 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2496 BRW_ARF_NULL, 0));
2497 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2498 payload.nr, 0));
2499
2500 uint32_t sfid, msg_type;
2501 if (brw->is_haswell) {
2502 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2503 msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2504 } else {
2505 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2506 msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2507 }
2508
2509 bool header_present = false;
2510 bool eot = false;
2511 uint32_t mlen = 2; /* offset, value */
2512 uint32_t rlen = 0;
2513 brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2514
2515 send->bits3.ud |= msg_type << 14;
2516 send->bits3.ud |= 0 << 13; /* no return data */
2517 send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2518 send->bits3.ud |= BRW_AOP_ADD << 8;
2519 send->bits3.ud |= surf_index << 0;
2520 }