i965: Add support for AVG instruction.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "glsl/ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width;   /* note - definitions are compatible */
}
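
/* For example, a width-8 register while the compiler is emitting
 * compressed (SIMD16) code yields BRW_EXECUTE_16; otherwise the register
 * width is used directly, relying on the BRW_WIDTH_* and BRW_EXECUTE_*
 * encodings lining up, as the comment above notes.
 */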

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
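
/* A minimal usage sketch (hypothetical values; assumes the caller has a
 * brw_compile context and a payload GRF to send from):
 *
 *    struct brw_reg src = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &src, 1);
 *    // src now refers to m1 (or, on gen7, the GRF standing in for m1),
 *    // ready to be used as the source of a following SEND.
 */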

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
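
/* Concretely, assuming GEN7_MRF_HACK_START is 112 (so the fake MRFs land
 * in the R112-R127 range quoted above), a write to m4 on gen7 actually
 * targets r116.  The specific value is an illustration; see the define
 * for the authoritative offset.
 */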


void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* even though it's ignored in align16, this still has to be '01' */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even though it's ignored in align16, this still has to be '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
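
/* Worked example of restriction 4 above: the common row-major region
 * <8;8,1> executed at size 8 has execsize == width == 8 and hstride == 1,
 * so vstride must equal width * hstride == 8 -- which it does.  A region
 * like <4;8,1> at the same execution size would trip the assertion.
 */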

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}
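
/* Note the scalar special case above: a source such as brw_vec1_grf(2, 3)
 * used in an execution-size-1 instruction is encoded with the region
 * <0;1,0>, replicating the single value rather than striding through the
 * register.
 */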


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
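
/* For example, brw_set_math_message() below calls this with
 * msg_length == 2 and response_length == 2 for
 * BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: two registers of
 * operands go out, and the quotient and remainder come back in two
 * registers.
 */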

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  bool saturate,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = saturate;
      insn->bits3.math.data_type = dataType;
   }
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used; /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0; /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used; /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              true, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
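
/* A sketch of typical usage (the values are illustrative, not taken from
 * a real caller -- message type, lengths, and SIMD mode must match the
 * surface and the generation being targeted):
 *
 *    struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, src);
 *    brw_set_sampler_message(p, insn,
 *                            surf_index,    // binding table index
 *                            sampler,       // sampler state index
 *                            msg_type,      // gen-specific sample opcode
 *                            4,             // response: one reg per channel (SIMD8)
 *                            1,             // message length
 *                            false,         // no header
 *                            simd_mode,
 *                            return_format);
 */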


#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}
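
/* Example: a replicated scalar (vstride 0) source with subnr 0 and the
 * single-value swizzle .zzzz gives 0 / 4 + 2 == 2, folding the swizzle's
 * channel select into the 3-src subregister number.
 */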

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0)       \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0,       \
                                 struct brw_reg src1)       \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0,       \
                                 struct brw_reg src1,       \
                                 struct brw_reg src2)       \
{                                                           \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                              \
void brw_##OP(struct brw_compile *p,                           \
              struct brw_reg dest,                             \
              struct brw_reg src)                              \
{                                                              \
   struct brw_instruction *rnd, *add;                          \
   rnd = next_insn(p, BRW_OPCODE_##OP);                        \
   brw_set_dest(p, rnd, dest);                                 \
   brw_set_src0(p, rnd, src);                                  \
                                                               \
   if (p->brw->intel.gen < 6) {                                \
      /* turn on round-increments */                           \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));           \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;    \
   }                                                           \
}
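
/* So on gen4/5, brw_RNDZ(p, dest, src) emits roughly:
 *
 *    rndz.r dest, src            (sets per-channel "increment" flag bits)
 *    (+f0) add dest, dest, 1.0   (predicated fixup)
 *
 * while on gen6+ only the first instruction is emitted.  The mnemonics
 * here are informal shorthand, not assembler syntax.
 */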


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
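
/* brw_AVG emits the hardware's component-wise integer average.  Per the
 * PRM's description of avg (as best recalled here), the result rounds
 * up: roughly dst = (src0 + src1 + 1) >> 1 per channel -- consult the
 * PRM for the authoritative semantics.
 */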

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}



/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}

static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   if (p->loop_stack_array_size < p->loop_stack_depth) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
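
/* The "* 16" above converts an instruction count into a byte offset:
 * each native instruction is 128 bits (16 bytes), and an ADD to the IP
 * register operates on byte addresses.
 */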

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is in units of 64-bit data chunks, so one 128-bit
    * instruction requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br * (endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br * (endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store (p->store), so call it first before deriving instruction store
    * pointers from stored indexes.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (intel->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
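
/* Note the asymmetry in the patched offsets: BREAK is pointed just past
 * the WHILE (the "+ 1"), so it leaves the loop entirely, while CONTINUE
 * is pointed at the WHILE itself so the loop condition is re-evaluated.
 */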

struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}


/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
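
/* A minimal usage sketch (hypothetical surrounding code): record the
 * index of the JMPI, emit the instructions to be skipped, then patch:
 *
 *    int jmp_idx = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... emit instructions to jump over ...
 *    brw_land_fwd_jump(p, jmp_idx);
 */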


/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_compile *p,
             struct brw_reg dest,
             GLuint conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* guess_execution_size(insn, src0); */

   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}

/* Issue a 'wait' instruction on notification register n1; the host can
 * program MMIO to wake the thread up.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1649
1650
1651 /***********************************************************************
1652 * Helpers for the various SEND message types:
1653 */
1654
1655 /** Extended math function, float[8].
1656 */
1657 void brw_math( struct brw_compile *p,
1658 struct brw_reg dest,
1659 GLuint function,
1660 GLuint saturate,
1661 GLuint msg_reg_nr,
1662 struct brw_reg src,
1663 GLuint data_type,
1664 GLuint precision )
1665 {
1666 struct intel_context *intel = &p->brw->intel;
1667
1668 if (intel->gen >= 6) {
1669 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1670
1671 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1672 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1673
1674 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1675 if (intel->gen == 6)
1676 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1677
1678 /* Source modifiers are ignored for extended math instructions on Gen6. */
1679 if (intel->gen == 6) {
1680 assert(!src.negate);
1681 assert(!src.abs);
1682 }
1683
1684 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1685 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1686 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1687 assert(src.type != BRW_REGISTER_TYPE_F);
1688 } else {
1689 assert(src.type == BRW_REGISTER_TYPE_F);
1690 }
1691
1692 /* Math is the same ISA format as other opcodes, except that CondModifier
1693 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1694 */
1695 insn->header.destreg__conditionalmod = function;
1696 insn->header.saturate = saturate;
1697
1698 brw_set_dest(p, insn, dest);
1699 brw_set_src0(p, insn, src);
1700 brw_set_src1(p, insn, brw_null_reg());
1701 } else {
1702 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1703
1704 /* Example code doesn't set predicate_control for send
1705 * instructions.
1706 */
1707 insn->header.predicate_control = 0;
1708 insn->header.destreg__conditionalmod = msg_reg_nr;
1709
1710 brw_set_dest(p, insn, dest);
1711 brw_set_src0(p, insn, src);
1712 brw_set_math_message(p,
1713 insn,
1714 function,
1715 src.type == BRW_REGISTER_TYPE_D,
1716 precision,
1717 saturate,
1718 data_type);
1719 }
1720 }
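
/* Usage sketch (illustrative; 'dst' and 'src' are hypothetical float GRFs):
 * a SIMD8 reciprocal. On pre-gen6 the msg_reg_nr argument (here 2) selects
 * the MRF used by the implied SEND; on gen6+ it is unused:
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE, 2,
 *             src, BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */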
1721
1722 /** Extended math function with two source operands, float[8]. Gen6+ only.
1723 */
1724 void brw_math2(struct brw_compile *p,
1725 struct brw_reg dest,
1726 GLuint function,
1727 struct brw_reg src0,
1728 struct brw_reg src1)
1729 {
1730 struct intel_context *intel = &p->brw->intel;
1731 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1732
1733 assert(intel->gen >= 6);
1735
1737 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1738 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1739 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1740
1741 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1742 if (intel->gen == 6) {
1743 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1744 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1745 }
1746
1747 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1748 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1749 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1750 assert(src0.type != BRW_REGISTER_TYPE_F);
1751 assert(src1.type != BRW_REGISTER_TYPE_F);
1752 } else {
1753 assert(src0.type == BRW_REGISTER_TYPE_F);
1754 assert(src1.type == BRW_REGISTER_TYPE_F);
1755 }
1756
1757 /* Source modifiers are ignored for extended math instructions on Gen6. */
1758 if (intel->gen == 6) {
1759 assert(!src0.negate);
1760 assert(!src0.abs);
1761 assert(!src1.negate);
1762 assert(!src1.abs);
1763 }
1764
1765 /* Math is the same ISA format as other opcodes, except that CondModifier
1766 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1767 */
1768 insn->header.destreg__conditionalmod = function;
1769
1770 brw_set_dest(p, insn, dest);
1771 brw_set_src0(p, insn, src0);
1772 brw_set_src1(p, insn, src1);
1773 }
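
/* Usage sketch (illustrative; 'dst', 'src0' and 'src1' are hypothetical
 * float GRFs): dst = pow(src0, src1) on gen6+:
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, src0, src1);
 */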
1774
1775 /**
1776 * Extended math function, float[16].
1777 * Uses two SEND instructions on pre-gen6; a single MATH instruction on gen6+.
1778 */
1779 void brw_math_16( struct brw_compile *p,
1780 struct brw_reg dest,
1781 GLuint function,
1782 GLuint saturate,
1783 GLuint msg_reg_nr,
1784 struct brw_reg src,
1785 GLuint precision )
1786 {
1787 struct intel_context *intel = &p->brw->intel;
1788 struct brw_instruction *insn;
1789
1790 if (intel->gen >= 6) {
1791 insn = next_insn(p, BRW_OPCODE_MATH);
1792
1793 /* Math is the same ISA format as other opcodes, except that CondModifier
1794 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1795 */
1796 insn->header.destreg__conditionalmod = function;
1797 insn->header.saturate = saturate;
1798
1799 /* Source modifiers are ignored for extended math instructions. */
1800 assert(!src.negate);
1801 assert(!src.abs);
1802
1803 brw_set_dest(p, insn, dest);
1804 brw_set_src0(p, insn, src);
1805 brw_set_src1(p, insn, brw_null_reg());
1806 return;
1807 }
1808
1809 /* First instruction:
1810 */
1811 brw_push_insn_state(p);
1812 brw_set_predicate_control_flag_value(p, 0xff);
1813 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1814
1815 insn = next_insn(p, BRW_OPCODE_SEND);
1816 insn->header.destreg__conditionalmod = msg_reg_nr;
1817
1818 brw_set_dest(p, insn, dest);
1819 brw_set_src0(p, insn, src);
1820 brw_set_math_message(p,
1821 insn,
1822 function,
1823 BRW_MATH_INTEGER_UNSIGNED,
1824 precision,
1825 saturate,
1826 BRW_MATH_DATA_VECTOR);
1827
1828 /* Second instruction:
1829 */
1830 insn = next_insn(p, BRW_OPCODE_SEND);
1831 insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1832 insn->header.destreg__conditionalmod = msg_reg_nr+1;
1833
1834 brw_set_dest(p, insn, offset(dest,1));
1835 brw_set_src0(p, insn, src);
1836 brw_set_math_message(p,
1837 insn,
1838 function,
1839 BRW_MATH_INTEGER_UNSIGNED,
1840 precision,
1841 saturate,
1842 BRW_MATH_DATA_VECTOR);
1843
1844 brw_pop_insn_state(p);
1845 }
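
/* Usage sketch (illustrative; 'dst' and 'src' are hypothetical float GRFs):
 * a SIMD16 EXP. On pre-gen6 this expands to two SIMD8 SENDs using MRFs 2
 * and 3:
 *
 *    brw_math_16(p, dst, BRW_MATH_FUNCTION_EXP, BRW_MATH_SATURATE_NONE,
 *                2, src, BRW_MATH_PRECISION_FULL);
 */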
1846
1847
1848 /**
1849 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1850 * using a constant offset per channel.
1851 *
1852 * The offset must be aligned to oword size (16 bytes). Used for
1853 * register spilling.
1854 */
1855 void brw_oword_block_write_scratch(struct brw_compile *p,
1856 struct brw_reg mrf,
1857 int num_regs,
1858 GLuint offset)
1859 {
1860 struct intel_context *intel = &p->brw->intel;
1861 uint32_t msg_control, msg_type;
1862 int mlen;
1863
1864 if (intel->gen >= 6)
1865 offset /= 16;
1866
1867 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1868
1869 if (num_regs == 1) {
1870 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1871 mlen = 2;
1872 } else {
1873 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1874 mlen = 3;
1875 }
1876
1877 /* Set up the message header. This is g0, with g0.2 filled with
1878 * the offset. We don't want to leave our offset around in g0 or
1879 * it'll screw up texture samples, so set it up inside the message
1880 * reg.
1881 */
1882 {
1883 brw_push_insn_state(p);
1884 brw_set_mask_control(p, BRW_MASK_DISABLE);
1885 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1886
1887 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1888
1889 /* set message header global offset field (reg 0, element 2) */
1890 brw_MOV(p,
1891 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1892 mrf.nr,
1893 2), BRW_REGISTER_TYPE_UD),
1894 brw_imm_ud(offset));
1895
1896 brw_pop_insn_state(p);
1897 }
1898
1899 {
1900 struct brw_reg dest;
1901 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1902 int send_commit_msg;
1903 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1904 BRW_REGISTER_TYPE_UW);
1905
1906 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1907 insn->header.compression_control = BRW_COMPRESSION_NONE;
1908 src_header = vec16(src_header);
1909 }
1910 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1911 insn->header.destreg__conditionalmod = mrf.nr;
1912
1913 /* Until gen6, writes followed by reads from the same location
1914 * are not guaranteed to be ordered unless write_commit is set.
1915 * If set, then a no-op write is issued to the destination
1916 * register to set a dependency, and a read from the destination
1917 * can be used to ensure the ordering.
1918 *
1919 * For gen6, only writes between different threads need ordering
1920 * protection. Our use of DP writes is all about register
1921 * spilling within a thread.
1922 */
1923 if (intel->gen >= 6) {
1924 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1925 send_commit_msg = 0;
1926 } else {
1927 dest = src_header;
1928 send_commit_msg = 1;
1929 }
1930
1931 brw_set_dest(p, insn, dest);
1932 if (intel->gen >= 6) {
1933 brw_set_src0(p, insn, mrf);
1934 } else {
1935 brw_set_src0(p, insn, brw_null_reg());
1936 }
1937
1938 if (intel->gen >= 6)
1939 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1940 else
1941 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1942
1943 brw_set_dp_write_message(p,
1944 insn,
1945 255, /* binding table index (255=stateless) */
1946 msg_control,
1947 msg_type,
1948 mlen,
1949 true, /* header_present */
1950 0, /* not a render target */
1951 send_commit_msg, /* response_length */
1952 0, /* eot */
1953 send_commit_msg);
1954 }
1955 }
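
/* Usage sketch (illustrative; 'value' and the MRF numbers are hypothetical):
 * spilling a single GRF. The data is expected in the register after the
 * header MRF, so it is moved to m2 before the send is emitted with its
 * header in m1:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(value, BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, spill_offset);
 */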
1956
1957
1958 /**
1959 * Read a block of owords (half a GRF each) from the scratch buffer
1960 * using a constant index per channel.
1961 *
1962 * Offset must be aligned to oword size (16 bytes). Used for register
1963 * spilling.
1964 */
1965 void
1966 brw_oword_block_read_scratch(struct brw_compile *p,
1967 struct brw_reg dest,
1968 struct brw_reg mrf,
1969 int num_regs,
1970 GLuint offset)
1971 {
1972 struct intel_context *intel = &p->brw->intel;
1973 uint32_t msg_control;
1974 int rlen;
1975
1976 if (intel->gen >= 6)
1977 offset /= 16;
1978
1979 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1980 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1981
1982 if (num_regs == 1) {
1983 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1984 rlen = 1;
1985 } else {
1986 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1987 rlen = 2;
1988 }
1989
1990 {
1991 brw_push_insn_state(p);
1992 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1993 brw_set_mask_control(p, BRW_MASK_DISABLE);
1994
1995 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1996
1997 /* set message header global offset field (reg 0, element 2) */
1998 brw_MOV(p,
1999 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2000 mrf.nr,
2001 2), BRW_REGISTER_TYPE_UD),
2002 brw_imm_ud(offset));
2003
2004 brw_pop_insn_state(p);
2005 }
2006
2007 {
2008 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2009
2010 assert(insn->header.predicate_control == 0);
2011 insn->header.compression_control = BRW_COMPRESSION_NONE;
2012 insn->header.destreg__conditionalmod = mrf.nr;
2013
2014 brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
2015 if (intel->gen >= 6) {
2016 brw_set_src0(p, insn, mrf);
2017 } else {
2018 brw_set_src0(p, insn, brw_null_reg());
2019 }
2020
2021 brw_set_dp_read_message(p,
2022 insn,
2023 255, /* binding table index (255=stateless) */
2024 msg_control,
2025 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2026 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2027 1, /* msg_length */
2028 rlen);
2029 }
2030 }
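
/* Usage sketch (illustrative; 'dst' and 'spill_offset' are hypothetical):
 * unspilling the GRF written above, reusing m1 for the message header:
 *
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 1,
 *                                 spill_offset);
 */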
2031
2032 /**
2033 * Read a float[4] vector from the data port Data Cache (const buffer).
2034 * Location (in buffer) should be a multiple of 16.
2035 * Used for fetching shader constants.
2036 */
2037 void brw_oword_block_read(struct brw_compile *p,
2038 struct brw_reg dest,
2039 struct brw_reg mrf,
2040 uint32_t offset,
2041 uint32_t bind_table_index)
2042 {
2043 struct intel_context *intel = &p->brw->intel;
2044
2045 /* On gen6+, the offset is in units of owords (16 bytes), not bytes. */
2046 if (intel->gen >= 6)
2047 offset /= 16;
2048
2049 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2050
2051 brw_push_insn_state(p);
2052 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2053 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2054 brw_set_mask_control(p, BRW_MASK_DISABLE);
2055
2056 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2057
2058 /* set message header global offset field (reg 0, element 2) */
2059 brw_MOV(p,
2060 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2061 mrf.nr,
2062 2), BRW_REGISTER_TYPE_UD),
2063 brw_imm_ud(offset));
2064
2065 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2066 insn->header.destreg__conditionalmod = mrf.nr;
2067
2068 /* cast dest to a uword[8] vector */
2069 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2070
2071 brw_set_dest(p, insn, dest);
2072 if (intel->gen >= 6) {
2073 brw_set_src0(p, insn, mrf);
2074 } else {
2075 brw_set_src0(p, insn, brw_null_reg());
2076 }
2077
2078 brw_set_dp_read_message(p,
2079 insn,
2080 bind_table_index,
2081 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2082 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2083 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2084 1, /* msg_length */
2085 1); /* response_length (1 reg, 2 owords!) */
2086
2087 brw_pop_insn_state(p);
2088 }
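
/* Usage sketch (illustrative; 'dst' and 'surf_index' are hypothetical):
 * pulling the float[4] constant at byte offset 32 of the bound surface:
 *
 *    brw_oword_block_read(p, dst, brw_message_reg(1), 32, surf_index);
 */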
2089
2090 /**
2091 * Read a set of dwords from the data port Data Cache (const buffer).
2092 *
2093 * Location (in buffer) appears as UD offsets in the register after
2094 * the provided mrf header reg.
2095 */
2096 void brw_dword_scattered_read(struct brw_compile *p,
2097 struct brw_reg dest,
2098 struct brw_reg mrf,
2099 uint32_t bind_table_index)
2100 {
2101 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2102
2103 brw_push_insn_state(p);
2104 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2105 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2106 brw_set_mask_control(p, BRW_MASK_DISABLE);
2107 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2108 brw_pop_insn_state(p);
2109
2110 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2111 insn->header.destreg__conditionalmod = mrf.nr;
2112
2113 /* cast dest to a uword[8] vector */
2114 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2115
2116 brw_set_dest(p, insn, dest);
2117 brw_set_src0(p, insn, brw_null_reg());
2118
2119 brw_set_dp_read_message(p,
2120 insn,
2121 bind_table_index,
2122 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
2123 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
2124 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2125 2, /* msg_length */
2126 1); /* response_length */
2127 }
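
/* Usage sketch (illustrative; 'dst', 'offsets' and 'surf_index' are
 * hypothetical): the per-channel dword offsets are expected in the register
 * after the header MRF (msg_length is 2), so they are moved to m2 first:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD), offsets);
 *    brw_dword_scattered_read(p, dst, brw_message_reg(1), surf_index);
 */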
2128
2129
2130
2131 /**
2132 * Read a float[4] constant from the VS constant buffer.
2133 * One float[4] constant is read into the lower half of 'dest'; for
2134 * relative addressing, see brw_dp_READ_4_vs_relative() below.
2135 */
2136 void brw_dp_READ_4_vs(struct brw_compile *p,
2137 struct brw_reg dest,
2138 GLuint location,
2139 GLuint bind_table_index)
2140 {
2141 struct intel_context *intel = &p->brw->intel;
2142 struct brw_instruction *insn;
2143 GLuint msg_reg_nr = 1;
2144
2145 if (intel->gen >= 6)
2146 location /= 16;
2147
2148 /* Set up MRF[1] with the location/offset into the const buffer */
2149 brw_push_insn_state(p);
2150 brw_set_access_mode(p, BRW_ALIGN_1);
2151 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2152 brw_set_mask_control(p, BRW_MASK_DISABLE);
2153 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2154 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2155 BRW_REGISTER_TYPE_UD),
2156 brw_imm_ud(location));
2157 brw_pop_insn_state(p);
2158
2159 insn = next_insn(p, BRW_OPCODE_SEND);
2160
2161 insn->header.predicate_control = BRW_PREDICATE_NONE;
2162 insn->header.compression_control = BRW_COMPRESSION_NONE;
2163 insn->header.destreg__conditionalmod = msg_reg_nr;
2164 insn->header.mask_control = BRW_MASK_DISABLE;
2165
2166 brw_set_dest(p, insn, dest);
2167 if (intel->gen >= 6) {
2168 brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2169 } else {
2170 brw_set_src0(p, insn, brw_null_reg());
2171 }
2172
2173 brw_set_dp_read_message(p,
2174 insn,
2175 bind_table_index,
2176 0,
2177 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2178 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2179 1, /* msg_length */
2180 1); /* response_length (1 Oword) */
2181 }
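
/* Usage sketch (illustrative; 'dst', 'i' and 'surf_index' are hypothetical):
 * fetching the float[4] constant at index i. Callers pass a byte offset,
 * which this helper converts to oword units on gen6+:
 *
 *    brw_dp_READ_4_vs(p, dst, 16 * i, surf_index);
 */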
2182
2183 /**
2184 * Read a float[4] constant per vertex from VS constant buffer, with
2185 * relative addressing.
2186 */
2187 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2188 struct brw_reg dest,
2189 struct brw_reg addr_reg,
2190 GLuint offset,
2191 GLuint bind_table_index)
2192 {
2193 struct intel_context *intel = &p->brw->intel;
2194 struct brw_reg src = brw_vec8_grf(0, 0);
2195 int msg_type;
2196
2197 /* Set up MRF[1] with the offset into the const buffer */
2198 brw_push_insn_state(p);
2199 brw_set_access_mode(p, BRW_ALIGN_1);
2200 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2201 brw_set_mask_control(p, BRW_MASK_DISABLE);
2202 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2203
2204 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2205 * fields ignored.
2206 */
2207 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2208 addr_reg, brw_imm_d(offset));
2209 brw_pop_insn_state(p);
2210
2211 gen6_resolve_implied_move(p, &src, 0);
2212 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2213
2214 insn->header.predicate_control = BRW_PREDICATE_NONE;
2215 insn->header.compression_control = BRW_COMPRESSION_NONE;
2216 insn->header.destreg__conditionalmod = 0;
2217 insn->header.mask_control = BRW_MASK_DISABLE;
2218
2219 brw_set_dest(p, insn, dest);
2220 brw_set_src0(p, insn, src);
2221
2222 if (intel->gen >= 6)
2223 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2224 else if (intel->gen == 5 || intel->is_g4x)
2225 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2226 else
2227 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2228
2229 brw_set_dp_read_message(p,
2230 insn,
2231 bind_table_index,
2232 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2233 msg_type,
2234 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2235 2, /* msg_length */
2236 1); /* response_length */
2237 }
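
/* Usage sketch (illustrative; 'dst', 'addr_reg', 'base' and 'surf_index'
 * are hypothetical): addr_reg supplies the per-vertex offsets, to which the
 * constant 'base' is added before the read:
 *
 *    brw_dp_READ_4_vs_relative(p, dst, addr_reg, base, surf_index);
 */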
2238
2239
2240
2241 void brw_fb_WRITE(struct brw_compile *p,
2242 int dispatch_width,
2243 GLuint msg_reg_nr,
2244 struct brw_reg src0,
2245 GLuint msg_control,
2246 GLuint binding_table_index,
2247 GLuint msg_length,
2248 GLuint response_length,
2249 bool eot,
2250 bool header_present)
2251 {
2252 struct intel_context *intel = &p->brw->intel;
2253 struct brw_instruction *insn;
2254 GLuint msg_type;
2255 struct brw_reg dest;
2256
2257 if (dispatch_width == 16)
2258 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2259 else
2260 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2261
2262 if (intel->gen >= 6 && binding_table_index == 0) {
2263 insn = next_insn(p, BRW_OPCODE_SENDC);
2264 } else {
2265 insn = next_insn(p, BRW_OPCODE_SEND);
2266 }
2267 /* The execution mask is ignored for render target writes. */
2268 insn->header.predicate_control = 0;
2269 insn->header.compression_control = BRW_COMPRESSION_NONE;
2270
2271 if (intel->gen >= 6) {
2272 /* headerless version, just submit color payload */
2273 src0 = brw_message_reg(msg_reg_nr);
2274
2275 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2276 } else {
2277 insn->header.destreg__conditionalmod = msg_reg_nr;
2278
2279 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2280 }
2281
2282 brw_set_dest(p, insn, dest);
2283 brw_set_src0(p, insn, src0);
2284 brw_set_dp_write_message(p,
2285 insn,
2286 binding_table_index,
2287 msg_control,
2288 msg_type,
2289 msg_length,
2290 header_present,
2291 eot, /* last render target write */
2292 response_length,
2293 eot,
2294 0 /* send_commit_msg */);
2295 }
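
/* Usage sketch (illustrative; all lengths and the binding-table slot are
 * hypothetical): the final SIMD8 render-target write of a fragment thread,
 * with a headerless 4-register color payload starting at m2:
 *
 *    brw_fb_WRITE(p, 8, 2, brw_message_reg(2),
 *                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01,
 *                 0, 4, 0, true, false);
 *
 * where binding_table_index = 0, msg_length = 4, response_length = 0,
 * eot = true and header_present = false.
 */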
2296
2297
2298 /**
2299 * Texture sample instruction.
2300 * Note: the msg_type plus msg_length values determine exactly what kind
2301 * of sampling operation is performed. See volume 4, page 161 of docs.
2302 */
2303 void brw_SAMPLE(struct brw_compile *p,
2304 struct brw_reg dest,
2305 GLuint msg_reg_nr,
2306 struct brw_reg src0,
2307 GLuint binding_table_index,
2308 GLuint sampler,
2309 GLuint writemask,
2310 GLuint msg_type,
2311 GLuint response_length,
2312 GLuint msg_length,
2313 GLuint header_present,
2314 GLuint simd_mode,
2315 GLuint return_format)
2316 {
2317 struct intel_context *intel = &p->brw->intel;
2318 bool need_stall = false;
2319
2320 if (writemask == 0) {
2321 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2322 return;
2323 }
2324
2325 /* Hardware doesn't do destination dependency checking on send
2326 * instructions properly. Add a workaround which generates the
2327 * dependency by other means. In practice it seems like this bug
2328 * only crops up for texture samples, and only where registers are
2329 * written by the send and then written again later without being
2330 * read in between. Luckily for us, we already track that
2331 * information and use it to modify the writemask for the
2332 * instruction, so that is a guide for whether a workaround is
2333 * needed.
2334 */
2335 if (writemask != WRITEMASK_XYZW) {
2336 GLuint dst_offset = 0;
2337 GLuint i, newmask = 0, len = 0;
2338
2339 for (i = 0; i < 4; i++) {
2340 if (writemask & (1<<i))
2341 break;
2342 dst_offset += 2;
2343 }
2344 for (; i < 4; i++) {
2345 if (!(writemask & (1<<i)))
2346 break;
2347 newmask |= 1<<i;
2348 len++;
2349 }
2350
2351 if (newmask != writemask) {
2352 need_stall = true;
2353 /* printf("need stall %x %x\n", newmask , writemask); */
2354 }
2355 else {
2356 bool dispatch_16 = false;
2357
2358 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2359
2360 guess_execution_size(p, p->current, dest);
2361 if (p->current->header.execution_size == BRW_EXECUTE_16)
2362 dispatch_16 = true;
2363
2364 newmask = ~newmask & WRITEMASK_XYZW;
2365
2366 brw_push_insn_state(p);
2367
2368 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2369 brw_set_mask_control(p, BRW_MASK_DISABLE);
2370
2371 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2372 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2373 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2374
2375 brw_pop_insn_state(p);
2376
2377 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2378 dest = offset(dest, dst_offset);
2379
2380 /* For 16-wide dispatch, masked channels are skipped in the
2381 * response. For 8-wide, masked channels still take up slots,
2382 * and are just not written to.
2383 */
2384 if (dispatch_16)
2385 response_length = len * 2;
2386 }
2387 }
2388
2389 {
2390 struct brw_instruction *insn;
2391
2392 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2393
2394 insn = next_insn(p, BRW_OPCODE_SEND);
2395 insn->header.predicate_control = 0; /* XXX */
2396 insn->header.compression_control = BRW_COMPRESSION_NONE;
2397 if (intel->gen < 6)
2398 insn->header.destreg__conditionalmod = msg_reg_nr;
2399
2400 brw_set_dest(p, insn, dest);
2401 brw_set_src0(p, insn, src0);
2402 brw_set_sampler_message(p, insn,
2403 binding_table_index,
2404 sampler,
2405 msg_type,
2406 response_length,
2407 msg_length,
2408 header_present,
2409 simd_mode,
2410 return_format);
2411 }
2412
2413 if (need_stall) {
2414 struct brw_reg reg = vec8(offset(dest, response_length-1));
2415
2416 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2417 */
2418 brw_push_insn_state(p);
2419 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2420 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2421 retype(reg, BRW_REGISTER_TYPE_UD));
2422 brw_pop_insn_state(p);
2423 }
2424
2425 }
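
/* Usage sketch (illustrative; the surface, sampler and registers are
 * hypothetical): a SIMD8 'sample' message with a 2-register payload
 * starting at m1 and a 4-register float result:
 *
 *    brw_SAMPLE(p, retype(dst, BRW_REGISTER_TYPE_UW), 1, src,
 *               surf_index, 0, WRITEMASK_XYZW,
 *               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
 *               4, 2, 0, BRW_SAMPLER_SIMD_MODE_SIMD8,
 *               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
 */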
2426
2427 /* All these variables are pretty confusing - we might be better off
2428 * using bitmasks and macros for this, in the old style. Or perhaps
2429 * just having the caller instantiate the fields in dword3 itself.
2430 */
2431 void brw_urb_WRITE(struct brw_compile *p,
2432 struct brw_reg dest,
2433 GLuint msg_reg_nr,
2434 struct brw_reg src0,
2435 bool allocate,
2436 bool used,
2437 GLuint msg_length,
2438 GLuint response_length,
2439 bool eot,
2440 bool writes_complete,
2441 GLuint offset,
2442 GLuint swizzle)
2443 {
2444 struct intel_context *intel = &p->brw->intel;
2445 struct brw_instruction *insn;
2446
2447 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2448
2449 if (intel->gen == 7) {
2450 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2451 brw_push_insn_state(p);
2452 brw_set_access_mode(p, BRW_ALIGN_1);
2453 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2454 BRW_REGISTER_TYPE_UD),
2455 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2456 brw_imm_ud(0xff00));
2457 brw_pop_insn_state(p);
2458 }
2459
2460 insn = next_insn(p, BRW_OPCODE_SEND);
2461
2462 assert(msg_length < BRW_MAX_MRF);
2463
2464 brw_set_dest(p, insn, dest);
2465 brw_set_src0(p, insn, src0);
2466 brw_set_src1(p, insn, brw_imm_d(0));
2467
2468 if (intel->gen < 6)
2469 insn->header.destreg__conditionalmod = msg_reg_nr;
2470
2471 brw_set_urb_message(p,
2472 insn,
2473 allocate,
2474 used,
2475 msg_length,
2476 response_length,
2477 eot,
2478 writes_complete,
2479 offset,
2480 swizzle);
2481 }
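
/* Usage sketch (illustrative; msg_length and the source register are
 * hypothetical): a vertex thread's final URB write, sending the VUE
 * starting at m1 and terminating the thread:
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 1, brw_vec8_grf(0, 0),
 *                  false, true, msg_length, 0,
 *                  true, true, 0, BRW_URB_SWIZZLE_INTERLEAVE);
 *
 * where allocate = false, used = true, response_length = 0, eot = true,
 * writes_complete = true and offset = 0.
 */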
2482
2483 static int
2484 brw_find_next_block_end(struct brw_compile *p, int start)
2485 {
2486 int ip;
2487
2488 for (ip = start + 1; ip < p->nr_insn; ip++) {
2489 struct brw_instruction *insn = &p->store[ip];
2490
2491 switch (insn->header.opcode) {
2492 case BRW_OPCODE_ENDIF:
2493 case BRW_OPCODE_ELSE:
2494 case BRW_OPCODE_WHILE:
2495 return ip;
2496 }
2497 }
2498 assert(!"not reached");
2499 return start + 1;
2500 }
2501
2502 /* There is no DO instruction on gen6 and later, so to find the end of the loop
2503 * we have to see if the loop is jumping back before our start
2504 * instruction.
2505 */
2506 static int
2507 brw_find_loop_end(struct brw_compile *p, int start)
2508 {
2509 struct intel_context *intel = &p->brw->intel;
2510 int ip;
2511 int br = 2;
2512
2513 for (ip = start + 1; ip < p->nr_insn; ip++) {
2514 struct brw_instruction *insn = &p->store[ip];
2515
2516 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2517 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2518 : insn->bits3.break_cont.jip;
2519 if (ip + jip / br <= start)
2520 return ip;
2521 }
2522 }
2523 assert(!"not reached");
2524 return start + 1;
2525 }
2526
2527 /* After program generation, go back and update the UIP and JIP of
2528 * BREAK and CONT instructions to their correct locations.
2529 */
2530 void
2531 brw_set_uip_jip(struct brw_compile *p)
2532 {
2533 struct intel_context *intel = &p->brw->intel;
2534 int ip;
2535 int br = 2;
2536
2537 if (intel->gen < 6)
2538 return;
2539
2540 for (ip = 0; ip < p->nr_insn; ip++) {
2541 struct brw_instruction *insn = &p->store[ip];
2542
2543 switch (insn->header.opcode) {
2544 case BRW_OPCODE_BREAK:
2545 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2546 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2547 insn->bits3.break_cont.uip =
2548 br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2549 break;
2550 case BRW_OPCODE_CONTINUE:
2551 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2552 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2553
2554 assert(insn->bits3.break_cont.uip != 0);
2555 assert(insn->bits3.break_cont.jip != 0);
2556 break;
2557 }
2558 }
2559 }
2560
2561 void brw_ff_sync(struct brw_compile *p,
2562 struct brw_reg dest,
2563 GLuint msg_reg_nr,
2564 struct brw_reg src0,
2565 bool allocate,
2566 GLuint response_length,
2567 bool eot)
2568 {
2569 struct intel_context *intel = &p->brw->intel;
2570 struct brw_instruction *insn;
2571
2572 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2573
2574 insn = next_insn(p, BRW_OPCODE_SEND);
2575 brw_set_dest(p, insn, dest);
2576 brw_set_src0(p, insn, src0);
2577 brw_set_src1(p, insn, brw_imm_d(0));
2578
2579 if (intel->gen < 6)
2580 insn->header.destreg__conditionalmod = msg_reg_nr;
2581
2582 brw_set_ff_sync_message(p,
2583 insn,
2584 allocate,
2585 response_length,
2586 eot);
2587 }
2588
2589 /**
2590 * Emit the SEND instruction necessary to generate stream output data on Gen6
2591 * (for transform feedback).
2592 *
2593 * If send_commit_msg is true, this is the last piece of stream output data
2594 * from this thread, so send the data as a committed write. According to the
2595 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2596 *
2597 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2598 * writes are complete by sending the final write as a committed write."
2599 */
2600 void
2601 brw_svb_write(struct brw_compile *p,
2602 struct brw_reg dest,
2603 GLuint msg_reg_nr,
2604 struct brw_reg src0,
2605 GLuint binding_table_index,
2606 bool send_commit_msg)
2607 {
2608 struct brw_instruction *insn;
2609
2610 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2611
2612 insn = next_insn(p, BRW_OPCODE_SEND);
2613 brw_set_dest(p, insn, dest);
2614 brw_set_src0(p, insn, src0);
2615 brw_set_src1(p, insn, brw_imm_d(0));
2616 brw_set_dp_write_message(p, insn,
2617 binding_table_index,
2618 0, /* msg_control: ignored */
2619 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2620 1, /* msg_length */
2621 true, /* header_present */
2622 0, /* last_render_target: ignored */
2623 send_commit_msg, /* response_length */
2624 0, /* end_of_thread */
2625 send_commit_msg); /* send_commit_msg */
2626 }
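
/* Usage sketch (illustrative; 'data_reg', 'commit_reg' and 'surf_index' are
 * hypothetical): writing one buffer's worth of stream output data. When this
 * is the thread's last write, the destination receives the commit return:
 *
 *    brw_svb_write(p, final ? commit_reg : brw_null_reg(), 1, data_reg,
 *                  surf_index, final);
 */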