/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "glsl/ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width;  /* note - definitions are compatible */
}


/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
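
/* Illustrative sketch (not part of the driver): a caller that wants a SEND
 * payload to start in m1 builds it in a GRF, resolves the implied move, and
 * only then emits the SEND.  The register choices here are hypothetical.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);  // payload built in g2
 *    gen6_resolve_implied_move(p, &payload, 1);    // Gen6+: emits MOV m1, g2
 *    // ... emit the SEND with src0 = payload (now m1) ...
 */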

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
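
/* Sketch of the remapping (assuming GEN7_MRF_HACK_START is 112, matching the
 * R112-R127 range quoted above): a notional m4 on Gen7 becomes g116.
 *
 *    struct brw_reg r = brw_message_reg(4);  // file = MRF, nr = 4
 *    gen7_convert_mrf_to_grf(p, &r);         // file = GRF, nr = 4 + 112 = 116
 */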


void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
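
/* Worked example of the rules above: the common <8;8,1> float region
 * (vstride = 8, width = 8, hstride = 1) at execsize 8 satisfies rule 3
 * (execsize 8 >= width 8) and rule 4 (vstride 8 == width 8 * hstride 1),
 * while a scalar <0;1,0> region satisfies rules 6-8 (width 1, hstride 0,
 * vstride 0).
 */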

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
                           insn->header.opcode == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else
   {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
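
/* Sketch of the required ordering (argument values are hypothetical): the
 * descriptor must be written first, because it zeroes the Function Control
 * bits that message-specific setup then fills in.
 *
 *    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 *                               2,       // msg_length
 *                               0,       // response_length
 *                               true,    // header_present
 *                               false);  // end_of_thread
 *    insn->bits3.urb_gen7.opcode = 0;    // message-specific bits afterwards
 */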

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   insn->header.saturate = 0;
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;  /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;  /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;  /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;  /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;  /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        bool header_present,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}


#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */
   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
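
/* Typical emission pattern (illustrative): allocate the instruction, then
 * fill in its operands with the helpers above.
 *
 *    struct brw_instruction *mov = brw_next_insn(p, BRW_OPCODE_MOV);
 *    brw_set_dest(p, mov, dst);
 *    brw_set_src0(p, mov, src);
 */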

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0)             \
{                                                                 \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);               \
}

#define ALU2(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0,             \
                                 struct brw_reg src1)             \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0,             \
                                 struct brw_reg src1,             \
                                 struct brw_reg src2)             \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                 \
void brw_##OP(struct brw_compile *p,                              \
              struct brw_reg dest,                                \
              struct brw_reg src)                                 \
{                                                                 \
   struct brw_instruction *rnd, *add;                             \
   rnd = next_insn(p, BRW_OPCODE_##OP);                           \
   brw_set_dest(p, rnd, dest);                                    \
   brw_set_src0(p, rnd, src);                                     \
                                                                  \
   if (p->brw->intel.gen < 6) {                                   \
      /* turn on round-increments */                              \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;    \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));              \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;       \
   }                                                              \
}
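
/* On pre-gen6 hardware, a ROUND op therefore expands to roughly this pair
 * (illustrative assembly, not exact syntax):
 *
 *    rndz.r    dst, src        // round; set per-channel "increment" flag bits
 *    (+f0) add dst, dst, 1.0   // predicated fix-up where the flag was set
 */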


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
ALU3(LRP)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}


/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}

static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
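
/* Typical structured control flow emission (illustrative; operand registers
 * are placeholders):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, a, b); // sets flag reg
 *    brw_IF(p, BRW_EXECUTE_8);   // predicated on the flag set above
 *       // ... "then" block ...
 *    brw_ELSE(p);
 *       // ... "else" block ...
 *    brw_ENDIF(p);               // patches the IF/ELSE jump targets
 */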

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump counts are in units of 64-bit chunks, so one 128-bit
    * instruction requires a jump distance of 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before referencing the
    * instruction store pointer via an index.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
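
/* Typical loop emission (illustrative):
 *
 *    brw_DO(p, BRW_EXECUTE_8);   // pre-gen6: emits DO; gen6+: marks loop top
 *       // ... loop body, possibly with brw_BREAK(p) / brw_CONT(p) ...
 *    brw_WHILE(p);               // jumps back and patches BREAK/CONT targets
 */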

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (intel->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}

struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}


/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
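
/* One plausible usage pattern (illustrative): record the JMPI's index, emit
 * the instructions to be skipped, then land the jump at the current end.
 *
 *    int jmp_idx = p->nr_insn;   // index the JMPI will occupy
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    // ... emit the skipped-over instructions ...
 *    brw_land_fwd_jump(p, jmp_idx);  // patches the JMPI's immediate
 */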
1643
1644
1645
1646 /* To integrate with the above, it makes sense that the comparison
1647 * instruction should populate the flag register. It might be simpler
1648 * just to use the flag reg for most WM tasks?
1649 */
1650 void brw_CMP(struct brw_compile *p,
1651 struct brw_reg dest,
1652 GLuint conditional,
1653 struct brw_reg src0,
1654 struct brw_reg src1)
1655 {
1656 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1657
1658 insn->header.destreg__conditionalmod = conditional;
1659 brw_set_dest(p, insn, dest);
1660 brw_set_src0(p, insn, src0);
1661 brw_set_src1(p, insn, src1);
1662
1663 /* guess_execution_size(insn, src0); */
1664
1665
1666 /* Make it so that future instructions will use the computed flag
1667 * value until brw_set_predicate_control_flag_value() is called
1668 * again.
1669 */
1670 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1671 dest.nr == 0) {
1672 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1673 p->flag_value = 0xff;
1674 }
1675 }
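
/* Illustrative use (not part of the original source): comparing two
 * caller-provided float registers "a" and "b", discarding the result
 * through the null register so that only the flag register is updated
 * and subsequent instructions become predicated:
 *
 *    brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_F),
 *            BRW_CONDITIONAL_GE, a, b);
 */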

/* Issue the 'wait' instruction for notification register n1; the host
 * can program an MMIO register to wake the thread up.
 */
void brw_WAIT(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must be 0 (i.e. BRW_EXECUTE_1) */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}


/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void brw_math(struct brw_compile *p,
              struct brw_reg dest,
              GLuint function,
              GLuint msg_reg_nr,
              struct brw_reg src,
              GLuint data_type,
              GLuint precision)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
      }

      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
          function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
                           insn,
                           function,
                           src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           data_type);
   }
}

/** Extended math function, float[8].
 */
void brw_math2(struct brw_compile *p,
               struct brw_reg dest,
               GLuint function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   (void) intel;

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
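
/* Illustrative use (not part of the original source): a SIMD8 POW on
 * Gen6+, where "dst", "base" and "exp" stand for caller-provided GRF
 * registers satisfying the asserts above:
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, base, exp);
 */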


/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

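   /* mlen counts the message header register plus the one or two data
    * registers being written.
    */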
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
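
/* Illustrative use (not part of the original source): spilling a single
 * GRF through message register m1 to the start of the scratch space
 * (offsets must be 16-byte aligned):
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, 0);
 */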


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest); /* UW? */
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}
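
/* Illustrative use (not part of the original source): unspilling that
 * register again into a caller-chosen GRF "dst", reusing m1 for the
 * message header:
 *
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 1, 0);
 */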

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}


void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

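   /* Gen6+ uses SENDC, which (per the PRM description of sendc) gates the
    * message on the render target dependency check, so overlapping writes
    * from earlier threads land first.
    */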
   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            eot, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                GLuint msg_reg_nr,
                struct brw_reg src0,
                GLuint binding_table_index,
                GLuint sampler,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
                GLuint header_present,
                GLuint simd_mode,
                GLuint return_format)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
                   bool used,
                   GLuint msg_length,
                   GLuint response_length,
                   bool eot,
                   bool writes_complete,
                   GLuint offset,
                   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
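      /* This header setup runs in WE_all (mask-disabled) mode so that the
       * channel-mask DWord is written even when some execution channels
       * are disabled.
       */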
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       allocate,
                       used,
                       msg_length,
                       response_length,
                       eot,
                       writes_complete,
                       offset,
                       swizzle);
}

static int
next_ip(struct brw_compile *p, int ip)
{
   struct brw_instruction *insn = (void *)p->store + ip;

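   /* Compacted instructions occupy 8 bytes; full instructions occupy 16. */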
   if (insn->header.cmpt_control)
      return ip + 8;
   else
      return ip + 16;
}

static int
brw_find_next_block_end(struct brw_compile *p, int start)
{
   int ip;
   void *store = p->store;

   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      switch (insn->header.opcode) {
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_HALT:
         return ip;
      }
   }

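   /* Returning 0 signals that no block-ending instruction was found. */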
   return 0;
}

/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                   : insn->bits3.break_cont.jip;
         if (ip + jip * scale <= start)
            return ip;
      }
   }
   assert(!"not reached");
   return start;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it. */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip +
             (intel->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

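      /* An ENDIF with no block-ending instruction after it jumps to the
       * next instruction: jip = 2 is one full instruction in the 8-byte
       * units used here.
       */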
      case BRW_OPCODE_ENDIF:
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same.  In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}

void brw_ff_sync(struct brw_compile *p,
                 struct brw_reg dest,
                 GLuint msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 GLuint response_length,
                 bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
              struct brw_reg dest,
              GLuint msg_reg_nr,
              struct brw_reg src0,
              GLuint binding_table_index,
              bool send_commit_msg)
{
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we just want to
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
                         int base_mrf,
                         uint32_t surf_index)
{
   struct intel_context *intel = &p->brw->intel;
   assert(intel->gen >= 7);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                      base_mrf, 0));

   uint32_t sfid, msg_type;
   if (intel->is_haswell) {
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
   } else {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
   }

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);

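   /* Fill in the remaining untyped-atomic fields of the extended message
    * descriptor by hand (bit positions per the Gen7 data cache dataport
    * layout).
    */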
   send->bits3.ud |= msg_type << 14;
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8;
   send->bits3.ud |= surf_index << 0;
}