/*
 * i965: Move brw_land_fwd_jump() to compilation unit of its use.
 * [mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
 */
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 unsigned msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102 /**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107 unsigned
108 brw_reg_type_to_hw_type(const struct brw_context *brw,
109 enum brw_reg_type type, unsigned file)
110 {
111 if (file == BRW_IMMEDIATE_VALUE) {
112 const static int imm_hw_types[] = {
113 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
115 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
117 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
118 [BRW_REGISTER_TYPE_UB] = -1,
119 [BRW_REGISTER_TYPE_B] = -1,
120 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
123 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
127 };
128 assert(type < ARRAY_SIZE(imm_hw_types));
129 assert(imm_hw_types[type] != -1);
130 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131 return imm_hw_types[type];
132 } else {
133 /* Non-immediate registers */
134 const static int hw_types[] = {
135 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
137 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
139 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
141 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
142 [BRW_REGISTER_TYPE_UV] = -1,
143 [BRW_REGISTER_TYPE_VF] = -1,
144 [BRW_REGISTER_TYPE_V] = -1,
145 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
149 };
150 assert(type < ARRAY_SIZE(hw_types));
151 assert(hw_types[type] != -1);
152 assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154 return hw_types[type];
155 }
156 }
157
/**
 * Encode \p dest as the destination operand of \p insn.
 *
 * Handles direct and register-indirect addressing in both Align1 and
 * Align16 access modes, and finally derives the instruction's execution
 * size from the destination width via guess_execution_size().
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* ARF and MRF register numbers encode special meanings, so only range
    * check GRF/immediate-style files.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A "scalar" (stride-0) destination is encoded as stride 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subreg is in units of 16 bytes and a writemask applies. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
	     dest.file == BRW_MESSAGE_REGISTER_FILE) {
	    assert(dest.dw1.bits.writemask != 0);
	 }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      /* Register-indirect destination addressing. */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 -> stride-1 fixup as the direct Align1 case. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
219
220 extern int reg_type_size[];
221
222 static void
223 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
224 {
225 int hstride_for_reg[] = {0, 1, 2, 4};
226 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
227 int width_for_reg[] = {1, 2, 4, 8, 16};
228 int execsize_for_reg[] = {1, 2, 4, 8, 16};
229 int width, hstride, vstride, execsize;
230
231 if (reg.file == BRW_IMMEDIATE_VALUE) {
232 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
233 * mean the destination has to be 128-bit aligned and the
234 * destination horiz stride has to be a word.
235 */
236 if (reg.type == BRW_REGISTER_TYPE_V) {
237 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
238 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
239 }
240
241 return;
242 }
243
244 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245 reg.file == BRW_ARF_NULL)
246 return;
247
248 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
249 hstride = hstride_for_reg[reg.hstride];
250
251 if (reg.vstride == 0xf) {
252 vstride = -1;
253 } else {
254 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
255 vstride = vstride_for_reg[reg.vstride];
256 }
257
258 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
259 width = width_for_reg[reg.width];
260
261 assert(insn->header.execution_size >= 0 &&
262 insn->header.execution_size < Elements(execsize_for_reg));
263 execsize = execsize_for_reg[insn->header.execution_size];
264
265 /* Restrictions from 3.3.10: Register Region Restrictions. */
266 /* 3. */
267 assert(execsize >= width);
268
269 /* 4. */
270 if (execsize == width && hstride != 0) {
271 assert(vstride == -1 || vstride == width * hstride);
272 }
273
274 /* 5. */
275 if (execsize == width && hstride == 0) {
276 /* no restriction on vstride. */
277 }
278
279 /* 6. */
280 if (width == 1) {
281 assert(hstride == 0);
282 }
283
284 /* 7. */
285 if (execsize == 1 && width == 1) {
286 assert(hstride == 0);
287 assert(vstride == 0);
288 }
289
290 /* 8. */
291 if (vstride == 0 && hstride == 0) {
292 assert(width == 1);
293 }
294
295 /* 10. Check destination issues. */
296 }
297
298 void
299 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
300 struct brw_reg reg)
301 {
302 struct brw_context *brw = p->brw;
303
304 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
305 assert(reg.nr < 128);
306
307 gen7_convert_mrf_to_grf(p, &reg);
308
309 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
310 insn->header.opcode == BRW_OPCODE_SENDC)) {
311 /* Any source modifiers or regions will be ignored, since this just
312 * identifies the MRF/GRF to start reading the message contents from.
313 * Check for some likely failures.
314 */
315 assert(!reg.negate);
316 assert(!reg.abs);
317 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
318 }
319
320 validate_reg(insn, reg);
321
322 insn->bits1.da1.src0_reg_file = reg.file;
323 insn->bits1.da1.src0_reg_type =
324 brw_reg_type_to_hw_type(brw, reg.type, reg.file);
325 insn->bits2.da1.src0_abs = reg.abs;
326 insn->bits2.da1.src0_negate = reg.negate;
327 insn->bits2.da1.src0_address_mode = reg.address_mode;
328
329 if (reg.file == BRW_IMMEDIATE_VALUE) {
330 insn->bits3.ud = reg.dw1.ud;
331
332 /* The Bspec's section titled "Non-present Operands" claims that if src0
333 * is an immediate that src1's type must be the same as that of src0.
334 *
335 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
336 * that do not follow this rule. E.g., from the IVB/HSW table:
337 *
338 * DataTypeIndex 18-Bit Mapping Mapped Meaning
339 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
340 *
341 * And from the SNB table:
342 *
343 * DataTypeIndex 18-Bit Mapping Mapped Meaning
344 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
345 *
346 * Neither of these cause warnings from the simulator when used,
347 * compacted or otherwise. In fact, all compaction mappings that have an
348 * immediate in src0 use a:ud for src1.
349 *
350 * The GM45 instruction compaction tables do not contain mapped meanings
351 * so it's not clear whether it has the restriction. We'll assume it was
352 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
353 */
354 insn->bits1.da1.src1_reg_file = 0; /* arf */
355 if (brw->gen < 6) {
356 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
357 } else {
358 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
359 }
360
361 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
362 * for immediate values. Presumably the hardware engineers realized
363 * that the only useful floating-point value that could be represented
364 * in this format is 0.0, which can also be represented as a VF-typed
365 * immediate, so they gave us the previously mentioned mapping on IVB+.
366 *
367 * Strangely, we do have a mapping for imm:f in src1, so we don't need
368 * to do this there.
369 *
370 * If we see a 0.0:F, change the type to VF so that it can be compacted.
371 */
372 if (insn->bits3.ud == 0x0 &&
373 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
374 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
375 }
376 }
377 else
378 {
379 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
380 if (insn->header.access_mode == BRW_ALIGN_1) {
381 insn->bits2.da1.src0_subreg_nr = reg.subnr;
382 insn->bits2.da1.src0_reg_nr = reg.nr;
383 }
384 else {
385 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
386 insn->bits2.da16.src0_reg_nr = reg.nr;
387 }
388 }
389 else {
390 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
391
392 if (insn->header.access_mode == BRW_ALIGN_1) {
393 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
394 }
395 else {
396 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
397 }
398 }
399
400 if (insn->header.access_mode == BRW_ALIGN_1) {
401 if (reg.width == BRW_WIDTH_1 &&
402 insn->header.execution_size == BRW_EXECUTE_1) {
403 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
404 insn->bits2.da1.src0_width = BRW_WIDTH_1;
405 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
406 }
407 else {
408 insn->bits2.da1.src0_horiz_stride = reg.hstride;
409 insn->bits2.da1.src0_width = reg.width;
410 insn->bits2.da1.src0_vert_stride = reg.vstride;
411 }
412 }
413 else {
414 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
415 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
416 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
417 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
418
419 /* This is an oddity of the fact we're using the same
420 * descriptions for registers in align_16 as align_1:
421 */
422 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
423 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
424 else
425 insn->bits2.da16.src0_vert_stride = reg.vstride;
426 }
427 }
428 }
429
430
431 void
432 brw_set_src1(struct brw_compile *p,
433 struct brw_instruction *insn,
434 struct brw_reg reg)
435 {
436 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
437
438 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
439 assert(reg.nr < 128);
440
441 gen7_convert_mrf_to_grf(p, &reg);
442
443 validate_reg(insn, reg);
444
445 insn->bits1.da1.src1_reg_file = reg.file;
446 insn->bits1.da1.src1_reg_type =
447 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
448 insn->bits3.da1.src1_abs = reg.abs;
449 insn->bits3.da1.src1_negate = reg.negate;
450
451 /* Only src1 can be immediate in two-argument instructions.
452 */
453 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
454
455 if (reg.file == BRW_IMMEDIATE_VALUE) {
456 insn->bits3.ud = reg.dw1.ud;
457 }
458 else {
459 /* This is a hardware restriction, which may or may not be lifted
460 * in the future:
461 */
462 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
463 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
464
465 if (insn->header.access_mode == BRW_ALIGN_1) {
466 insn->bits3.da1.src1_subreg_nr = reg.subnr;
467 insn->bits3.da1.src1_reg_nr = reg.nr;
468 }
469 else {
470 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
471 insn->bits3.da16.src1_reg_nr = reg.nr;
472 }
473
474 if (insn->header.access_mode == BRW_ALIGN_1) {
475 if (reg.width == BRW_WIDTH_1 &&
476 insn->header.execution_size == BRW_EXECUTE_1) {
477 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
478 insn->bits3.da1.src1_width = BRW_WIDTH_1;
479 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
480 }
481 else {
482 insn->bits3.da1.src1_horiz_stride = reg.hstride;
483 insn->bits3.da1.src1_width = reg.width;
484 insn->bits3.da1.src1_vert_stride = reg.vstride;
485 }
486 }
487 else {
488 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
489 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
490 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
491 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
492
493 /* This is an oddity of the fact we're using the same
494 * descriptions for registers in align_16 as align_1:
495 */
496 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
497 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
498 else
499 insn->bits3.da16.src1_vert_stride = reg.vstride;
500 }
501 }
502 }
503
504 /**
505 * Set the Message Descriptor and Extended Message Descriptor fields
506 * for SEND messages.
507 *
508 * \note This zeroes out the Function Control bits, so it must be called
509 * \b before filling out any message-specific data. Callers can
510 * choose not to fill in irrelevant bits; they will be zero.
511 */
512 static void
513 brw_set_message_descriptor(struct brw_compile *p,
514 struct brw_instruction *inst,
515 enum brw_message_target sfid,
516 unsigned msg_length,
517 unsigned response_length,
518 bool header_present,
519 bool end_of_thread)
520 {
521 struct brw_context *brw = p->brw;
522
523 brw_set_src1(p, inst, brw_imm_d(0));
524
525 if (brw->gen >= 5) {
526 inst->bits3.generic_gen5.header_present = header_present;
527 inst->bits3.generic_gen5.response_length = response_length;
528 inst->bits3.generic_gen5.msg_length = msg_length;
529 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
530
531 if (brw->gen >= 6) {
532 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
533 inst->header.destreg__conditionalmod = sfid;
534 } else {
535 /* Set Extended Message Descriptor (ex_desc) */
536 inst->bits2.send_gen5.sfid = sfid;
537 inst->bits2.send_gen5.end_of_thread = end_of_thread;
538 }
539 } else {
540 inst->bits3.generic.response_length = response_length;
541 inst->bits3.generic.msg_length = msg_length;
542 inst->bits3.generic.msg_target = sfid;
543 inst->bits3.generic.end_of_thread = end_of_thread;
544 }
545 }
546
547 static void brw_set_math_message( struct brw_compile *p,
548 struct brw_instruction *insn,
549 unsigned function,
550 unsigned integer_type,
551 bool low_precision,
552 unsigned dataType )
553 {
554 struct brw_context *brw = p->brw;
555 unsigned msg_length;
556 unsigned response_length;
557
558 /* Infer message length from the function */
559 switch (function) {
560 case BRW_MATH_FUNCTION_POW:
561 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
562 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
563 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
564 msg_length = 2;
565 break;
566 default:
567 msg_length = 1;
568 break;
569 }
570
571 /* Infer response length from the function */
572 switch (function) {
573 case BRW_MATH_FUNCTION_SINCOS:
574 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
575 response_length = 2;
576 break;
577 default:
578 response_length = 1;
579 break;
580 }
581
582
583 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
584 msg_length, response_length, false, false);
585 if (brw->gen == 5) {
586 insn->bits3.math_gen5.function = function;
587 insn->bits3.math_gen5.int_type = integer_type;
588 insn->bits3.math_gen5.precision = low_precision;
589 insn->bits3.math_gen5.saturate = insn->header.saturate;
590 insn->bits3.math_gen5.data_type = dataType;
591 insn->bits3.math_gen5.snapshot = 0;
592 } else {
593 insn->bits3.math.function = function;
594 insn->bits3.math.int_type = integer_type;
595 insn->bits3.math.precision = low_precision;
596 insn->bits3.math.saturate = insn->header.saturate;
597 insn->bits3.math.data_type = dataType;
598 }
599 insn->header.saturate = 0;
600 }
601
602
603 static void brw_set_ff_sync_message(struct brw_compile *p,
604 struct brw_instruction *insn,
605 bool allocate,
606 unsigned response_length,
607 bool end_of_thread)
608 {
609 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
610 1, response_length, true, end_of_thread);
611 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
612 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
613 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
614 insn->bits3.urb_gen5.allocate = allocate;
615 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
616 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
617 }
618
/* Fill in the message descriptor for a URB write SEND, choosing the
 * per-generation descriptor layout.  \p flags selects EOT, allocate,
 * complete, per-slot-offset and OWORD-vs-HWORD behavior.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
	 assert(msg_length == 2); /* header + one OWORD of data */
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes do not support the transpose swizzle. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
661
/* Fill in the message descriptor for a dataport write SEND, selecting
 * the SFID (which cache services the message) and the per-generation
 * descriptor layout.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 unsigned last_render_target,
			 unsigned response_length,
			 unsigned end_of_thread,
			 unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Fill in the generation-specific descriptor layout. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
719
/* Fill in the message descriptor for a dataport read SEND, selecting
 * the SFID from \p target_cache where the generation allows a choice,
 * and using the per-generation descriptor layout.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
			bool header_present,
			unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Gen7+: all dataport reads go through the data cache. */
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Fill in the generation-specific descriptor layout. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
776
/* Fill in the message descriptor for a sampler SEND using the
 * per-generation descriptor layout.  \p return_format is only encoded
 * on the original gen4 layout; later generations dropped the field.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
815
816
817 #define next_insn brw_next_insn
818 struct brw_instruction *
819 brw_next_insn(struct brw_compile *p, unsigned opcode)
820 {
821 struct brw_instruction *insn;
822
823 if (p->nr_insn + 1 > p->store_size) {
824 if (0) {
825 fprintf(stderr, "incresing the store size to %d\n",
826 p->store_size << 1);
827 }
828 p->store_size <<= 1;
829 p->store = reralloc(p->mem_ctx, p->store,
830 struct brw_instruction, p->store_size);
831 if (!p->store)
832 assert(!"realloc eu store memeory failed");
833 }
834
835 p->next_insn_offset += 16;
836 insn = &p->store[p->nr_insn++];
837 memcpy(insn, p->current, sizeof(*insn));
838
839 /* Reset this one-shot flag:
840 */
841
842 if (p->current->header.destreg__conditionalmod) {
843 p->current->header.destreg__conditionalmod = 0;
844 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
845 }
846
847 insn->header.opcode = opcode;
848 return insn;
849 }
850
851 static struct brw_instruction *brw_alu1( struct brw_compile *p,
852 unsigned opcode,
853 struct brw_reg dest,
854 struct brw_reg src )
855 {
856 struct brw_instruction *insn = next_insn(p, opcode);
857 brw_set_dest(p, insn, dest);
858 brw_set_src0(p, insn, src);
859 return insn;
860 }
861
862 static struct brw_instruction *brw_alu2(struct brw_compile *p,
863 unsigned opcode,
864 struct brw_reg dest,
865 struct brw_reg src0,
866 struct brw_reg src1 )
867 {
868 struct brw_instruction *insn = next_insn(p, opcode);
869 brw_set_dest(p, insn, dest);
870 brw_set_src0(p, insn, src0);
871 brw_set_src1(p, insn, src1);
872 return insn;
873 }
874
875 static int
876 get_3src_subreg_nr(struct brw_reg reg)
877 {
878 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
879 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
880 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
881 } else {
882 return reg.subnr / 4;
883 }
884 }
885
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 * 3-src instructions are Align16-only and use their own, denser
 * operand encoding (da3src).
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   /* The 3-src destination file field is a single bit: GRF or MRF. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   /* All three sources must be direct GRF accesses. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl broadcasts a scalar (vstride-0) source to all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   /* src1's subreg number straddles the bits2/bits3 dword boundary. */
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
969
970
971 /***********************************************************************
972 * Convenience routines.
973 */
/* Generate the public brw_<OP>() emitter for a one-source ALU opcode. */
#define ALU1(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0)           \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}
981
/* Generate the public brw_<OP>() emitter for a two-source ALU opcode. */
#define ALU2(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1)           \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
990
/* Generate the public brw_<OP>() emitter for a three-source ALU opcode. */
#define ALU3(OP)                                                    \
struct brw_instruction *brw_##OP(struct brw_compile *p,             \
                                 struct brw_reg dest,               \
                                 struct brw_reg src0,               \
                                 struct brw_reg src1,               \
                                 struct brw_reg src2)               \
{                                                                   \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);     \
}
1000
/* Like ALU3, but additionally asserts that all four operands are float
 * (used for MAD and LRP, which brw_alu3 relies on being all-float).
 */
#define ALU3F(OP)                                                   \
struct brw_instruction *brw_##OP(struct brw_compile *p,             \
                                 struct brw_reg dest,               \
                                 struct brw_reg src0,               \
                                 struct brw_reg src1,               \
                                 struct brw_reg src2)               \
{                                                                   \
   assert(dest.type == BRW_REGISTER_TYPE_F);                        \
   assert(src0.type == BRW_REGISTER_TYPE_F);                        \
   assert(src1.type == BRW_REGISTER_TYPE_F);                        \
   assert(src2.type == BRW_REGISTER_TYPE_F);                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);     \
}
1014
1015 /* Rounding operations (other than RNDD) require two instructions - the first
1016 * stores a rounded value (possibly the wrong way) in the dest register, but
1017 * also sets a per-channel "increment bit" in the flag register. A predicated
1018 * add of 1.0 fixes dest to contain the desired result.
1019 *
1020 * Sandybridge and later appear to round correctly without an ADD.
1021 */
/* Generate brw_<OP>() for the two-instruction rounding sequence described
 * in the comment above.
 */
#define ROUND(OP)                                                       \
void brw_##OP(struct brw_compile *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   struct brw_instruction *rnd, *add;                                   \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                 \
   brw_set_dest(p, rnd, dest);                                          \
   brw_set_src0(p, rnd, src);                                           \
                                                                        \
   if (p->brw->gen < 6) {                                               \
      /* turn on round-increments */                                    \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;          \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                    \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;             \
   }                                                                    \
}
1039
1040
/* Instantiate the public one-, two-, and three-source ALU emitters. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* Rounding modes other than RNDD need the predicated-ADD fixup. */
ROUND(RNDZ)
ROUND(RNDE)
1077
1078
1079 struct brw_instruction *brw_ADD(struct brw_compile *p,
1080 struct brw_reg dest,
1081 struct brw_reg src0,
1082 struct brw_reg src1)
1083 {
1084 /* 6.2.2: add */
1085 if (src0.type == BRW_REGISTER_TYPE_F ||
1086 (src0.file == BRW_IMMEDIATE_VALUE &&
1087 src0.type == BRW_REGISTER_TYPE_VF)) {
1088 assert(src1.type != BRW_REGISTER_TYPE_UD);
1089 assert(src1.type != BRW_REGISTER_TYPE_D);
1090 }
1091
1092 if (src1.type == BRW_REGISTER_TYPE_F ||
1093 (src1.file == BRW_IMMEDIATE_VALUE &&
1094 src1.type == BRW_REGISTER_TYPE_VF)) {
1095 assert(src0.type != BRW_REGISTER_TYPE_UD);
1096 assert(src0.type != BRW_REGISTER_TYPE_D);
1097 }
1098
1099 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1100 }
1101
1102 struct brw_instruction *brw_AVG(struct brw_compile *p,
1103 struct brw_reg dest,
1104 struct brw_reg src0,
1105 struct brw_reg src1)
1106 {
1107 assert(dest.type == src0.type);
1108 assert(src0.type == src1.type);
1109 switch (src0.type) {
1110 case BRW_REGISTER_TYPE_B:
1111 case BRW_REGISTER_TYPE_UB:
1112 case BRW_REGISTER_TYPE_W:
1113 case BRW_REGISTER_TYPE_UW:
1114 case BRW_REGISTER_TYPE_D:
1115 case BRW_REGISTER_TYPE_UD:
1116 break;
1117 default:
1118 assert(!"Bad type for brw_AVG");
1119 }
1120
1121 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1122 }
1123
1124 struct brw_instruction *brw_MUL(struct brw_compile *p,
1125 struct brw_reg dest,
1126 struct brw_reg src0,
1127 struct brw_reg src1)
1128 {
1129 /* 6.32.38: mul */
1130 if (src0.type == BRW_REGISTER_TYPE_D ||
1131 src0.type == BRW_REGISTER_TYPE_UD ||
1132 src1.type == BRW_REGISTER_TYPE_D ||
1133 src1.type == BRW_REGISTER_TYPE_UD) {
1134 assert(dest.type != BRW_REGISTER_TYPE_F);
1135 }
1136
1137 if (src0.type == BRW_REGISTER_TYPE_F ||
1138 (src0.file == BRW_IMMEDIATE_VALUE &&
1139 src0.type == BRW_REGISTER_TYPE_VF)) {
1140 assert(src1.type != BRW_REGISTER_TYPE_UD);
1141 assert(src1.type != BRW_REGISTER_TYPE_D);
1142 }
1143
1144 if (src1.type == BRW_REGISTER_TYPE_F ||
1145 (src1.file == BRW_IMMEDIATE_VALUE &&
1146 src1.type == BRW_REGISTER_TYPE_VF)) {
1147 assert(src0.type != BRW_REGISTER_TYPE_UD);
1148 assert(src0.type != BRW_REGISTER_TYPE_D);
1149 }
1150
1151 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1152 src0.nr != BRW_ARF_ACCUMULATOR);
1153 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1154 src1.nr != BRW_ARF_ACCUMULATOR);
1155
1156 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1157 }
1158
1159
1160 void brw_NOP(struct brw_compile *p)
1161 {
1162 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1163 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1164 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1165 brw_set_src1(p, insn, brw_imm_ud(0x0));
1166 }
1167
1168
1169
1170
1171
1172 /***********************************************************************
1173 * Comparisons, if/else/endif
1174 */
1175
/* Emit a JMPI (jump indexed) instruction.
 *
 * JMPI is a scalar operation: force execution size 1, no compression,
 * and disabled masking.  The default predicate state is also cleared so
 * it does not leak into subsequently emitted instructions.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1191
/* Push an IF/ELSE instruction onto the if-stack, growing the stack
 * array once it fills up.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   /* Store an index rather than a pointer: p->store may be reallocated
    * as more instructions are emitted (see brw_ENDIF's comment).
    */
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
1204
1205 static struct brw_instruction *
1206 pop_if_stack(struct brw_compile *p)
1207 {
1208 p->if_stack_depth--;
1209 return &p->store[p->if_stack[p->if_stack_depth]];
1210 }
1211
1212 static void
1213 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1214 {
1215 if (p->loop_stack_array_size < p->loop_stack_depth) {
1216 p->loop_stack_array_size *= 2;
1217 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1218 p->loop_stack_array_size);
1219 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1220 p->loop_stack_array_size);
1221 }
1222
1223 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1224 p->loop_stack_depth++;
1225 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1226 }
1227
1228 static struct brw_instruction *
1229 get_inner_do_insn(struct brw_compile *p)
1230 {
1231 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1232 }
1233
1234 /* EU takes the value from the flag register and pushes it onto some
1235 * sort of a stack (presumably merging with any flag value already on
1236 * the stack). Within an if block, the flags at the top of the stack
1237 * control execution on each channel of the unit, eg. on each of the
1238 * 16 pixel values in our wm programs.
1239 *
1240 * When the matching 'else' instruction is reached (presumably by
1241 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1243 *
1244 * When the matching 'endif' instruction is reached, the flags are
1245 * popped off. If the stack is now empty, normal execution resumes.
1246 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6 IF operates on IP; the jump target in src1 is patched
       * later by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6 stores the branch offset in jump_count; left zero here and
       * patched later.
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+ uses JIP/UIP offsets in bits3; left zero here and patched
       * later.
       */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the IF's predicate leak into subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1287
1288 /* This function is only used for gen6-style IF instructions with an
1289 * embedded comparison (conditional modifier). It is not used on gen7.
1290 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* jump_count is left zero here; it is patched by patch_IF_ELSE()
    * when the matching ENDIF is emitted.
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The embedded comparison goes in the conditional-modifier field. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1319
1320 /**
1321 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1322 */
/* In single-program-flow mode, rewrite an already-emitted IF (and
 * optional ELSE) as predicated ADDs on IP.  The offsets are in bytes;
 * each native instruction is 128 bits, hence the multiplications by 16.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1359
1360 /**
1361 * Patch IF and ELSE instructions with appropriate jump targets.
1362 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         /* Gen7+: both JIP and UIP point at the ENDIF. */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         /* ELSE also pops the mask entry pushed by the IF. */
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1449
/* Emit an ELSE instruction.  Its jump fields are left zero here and are
 * patched by patch_IF_ELSE() when the matching ENDIF is emitted; the
 * instruction is pushed onto the if-stack so brw_ENDIF can find it.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6 ELSE operates on IP. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1482
/* Close an IF/ELSE construct: pop the matching IF (and optional ELSE)
 * off the if-stack, emit an ENDIF (or convert the pair to predicated
 * ADDs in pre-gen6 SPF mode), and patch the jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1561
/* Emit a BREAK instruction.  Pre-gen6, the jump count is left zero and
 * patched by brw_patch_break_cont() when the enclosing WHILE is emitted;
 * the pop count unwinds the IFs currently open inside the loop.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop one mask entry per open IF inside the current loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1584
1585 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1586 {
1587 struct brw_instruction *insn;
1588
1589 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1590 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1591 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1592 brw_set_dest(p, insn, brw_ip_reg());
1593 brw_set_src0(p, insn, brw_ip_reg());
1594 brw_set_src1(p, insn, brw_imm_d(0x0));
1595
1596 insn->header.compression_control = BRW_COMPRESSION_NONE;
1597 insn->header.execution_size = BRW_EXECUTE_8;
1598 return insn;
1599 }
1600
/* Emit a pre-gen6 CONTINUE instruction.  The jump count is left zero
 * and patched by brw_patch_break_cont() when the enclosing WHILE is
 * emitted; the pop count unwinds the IFs open inside the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1615
/* Emit a gen6+ HALT instruction.  UIP and JIP live in src1 and are left
 * zero here, to be updated later.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1633
1634 /* DO/WHILE loop:
1635 *
1636 * The DO/WHILE is just an unterminated loop -- break or continue are
1637 * used for control within the loop. We have a few ways they can be
1638 * done.
1639 *
1640 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1641 * jip and no DO instruction.
1642 *
1643 * For non-uniform control flow pre-gen6, there's a DO instruction to
1644 * push the mask, and a WHILE to jump back, and BREAK to get out and
1645 * pop the mask.
1646 *
1647 * For gen6, there's no more mask stack, so no need for DO. WHILE
1648 * just points back to the first instruction of the loop.
1649 */
/* Open a DO/WHILE loop (see the comment above).  On gen6+ or in SPF
 * mode no DO instruction is emitted; the loop start is simply the next
 * instruction slot, which is pushed onto the loop stack for brw_WHILE.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1677
1678 /**
1679 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1680 * instruction here.
1681 *
1682 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1683 * nesting, since it can always just point to the end of the block/current loop.
1684 */
/* Walk backwards from the WHILE to the innermost DO and fill in the
 * jump counts of any unpatched BREAK/CONT instructions in between.
 * BREAK jumps just past the WHILE; CONT jumps to the WHILE itself.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Gen5 counts jumps in 64-bit chunks (2 per instruction). */
   int br = (brw->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1707
/* Close a DO/WHILE loop: emit the WHILE (or, in pre-gen6 SPF mode, an
 * ADD on IP), point it back at the loop start, patch any pending
 * BREAK/CONT instructions pre-gen6, and pop the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   /* Jump counts are per 64-bit chunk on gen5+ (2 per instruction). */
   unsigned br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* JIP is a backwards (negative) offset to the loop start. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         /* SPF mode: the loop is uniform, so the WHILE is just an ADD
          * on IP back to the loop start (offset in bytes, 16 per
          * instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         /* Fill in the jump targets of BREAK/CONT inside this loop. */
         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1771
1772 /* To integrate with the above, it makes sense that the comparison
1773 * instruction should populate the flag register. It might be simpler
1774 * just to use the flag reg for most WM tasks?
1775 */
void brw_CMP(struct brw_compile *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1817
1818 /* Issue 'wait' instruction for n1, host could program MMIO
1819 to wake up thread. */
/* Emit a WAIT on notification register n1; the thread sleeps until the
 * host wakes it (e.g. by programming MMIO).  The notification register
 * is used as both destination and source 0.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1832
1833
1834 /***********************************************************************
1835 * Helpers for the various SEND message types:
1836 */
1837
1838 /** Extended math function, float[8].
1839 */
/* Emit an extended-math operation.  On gen6+ this is a native MATH
 * instruction; earlier generations implement it as a SEND message to the
 * shared math unit.
 */
void brw_math( struct brw_compile *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned data_type,
               unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
      }

      /* The integer-divide functions take integer sources; everything
       * else takes float.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
          function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
                           insn,
                           function,
                           src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           data_type);
   }
}
1902
/** Extended math function with two source operands, float[8].
 *
 * Unlike brw_math(), this always emits the Gen6+ two-source MATH opcode
 * form (there is no pre-Gen6 SEND fallback here), with the function
 * selector packed into the CondModifier/ThreadCtrl fields.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   /* Math may only write the GRF (or, on Gen7+, the MRF) and only read
    * the GRF.
    */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* The integer division functions take integer sources; all other
    * math functions operate on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1952
1953
1954 /**
1955 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1956 * using a constant offset per channel.
1957 *
1958 * The offset must be aligned to oword size (16 bytes). Used for
1959 * register spilling.
1960 */
1961 void brw_oword_block_write_scratch(struct brw_compile *p,
1962 struct brw_reg mrf,
1963 int num_regs,
1964 unsigned offset)
1965 {
1966 struct brw_context *brw = p->brw;
1967 uint32_t msg_control, msg_type;
1968 int mlen;
1969
1970 if (brw->gen >= 6)
1971 offset /= 16;
1972
1973 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1974
1975 if (num_regs == 1) {
1976 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1977 mlen = 2;
1978 } else {
1979 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1980 mlen = 3;
1981 }
1982
1983 /* Set up the message header. This is g0, with g0.2 filled with
1984 * the offset. We don't want to leave our offset around in g0 or
1985 * it'll screw up texture samples, so set it up inside the message
1986 * reg.
1987 */
1988 {
1989 brw_push_insn_state(p);
1990 brw_set_mask_control(p, BRW_MASK_DISABLE);
1991 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1992
1993 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1994
1995 /* set message header global offset field (reg 0, element 2) */
1996 brw_MOV(p,
1997 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1998 mrf.nr,
1999 2), BRW_REGISTER_TYPE_UD),
2000 brw_imm_ud(offset));
2001
2002 brw_pop_insn_state(p);
2003 }
2004
2005 {
2006 struct brw_reg dest;
2007 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2008 int send_commit_msg;
2009 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2010 BRW_REGISTER_TYPE_UW);
2011
2012 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
2013 insn->header.compression_control = BRW_COMPRESSION_NONE;
2014 src_header = vec16(src_header);
2015 }
2016 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2017 insn->header.destreg__conditionalmod = mrf.nr;
2018
2019 /* Until gen6, writes followed by reads from the same location
2020 * are not guaranteed to be ordered unless write_commit is set.
2021 * If set, then a no-op write is issued to the destination
2022 * register to set a dependency, and a read from the destination
2023 * can be used to ensure the ordering.
2024 *
2025 * For gen6, only writes between different threads need ordering
2026 * protection. Our use of DP writes is all about register
2027 * spilling within a thread.
2028 */
2029 if (brw->gen >= 6) {
2030 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2031 send_commit_msg = 0;
2032 } else {
2033 dest = src_header;
2034 send_commit_msg = 1;
2035 }
2036
2037 brw_set_dest(p, insn, dest);
2038 if (brw->gen >= 6) {
2039 brw_set_src0(p, insn, mrf);
2040 } else {
2041 brw_set_src0(p, insn, brw_null_reg());
2042 }
2043
2044 if (brw->gen >= 6)
2045 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2046 else
2047 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2048
2049 brw_set_dp_write_message(p,
2050 insn,
2051 255, /* binding table index (255=stateless) */
2052 msg_control,
2053 msg_type,
2054 mlen,
2055 true, /* header_present */
2056 0, /* not a render target */
2057 send_commit_msg, /* response_length */
2058 0, /* eot */
2059 send_commit_msg);
2060 }
2061 }
2062
2063
2064 /**
2065 * Read a block of owords (half a GRF each) from the scratch buffer
2066 * using a constant index per channel.
2067 *
2068 * Offset must be aligned to oword size (16 bytes). Used for register
2069 * spilling.
2070 */
2071 void
2072 brw_oword_block_read_scratch(struct brw_compile *p,
2073 struct brw_reg dest,
2074 struct brw_reg mrf,
2075 int num_regs,
2076 unsigned offset)
2077 {
2078 struct brw_context *brw = p->brw;
2079 uint32_t msg_control;
2080 int rlen;
2081
2082 if (brw->gen >= 6)
2083 offset /= 16;
2084
2085 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2086 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2087
2088 if (num_regs == 1) {
2089 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2090 rlen = 1;
2091 } else {
2092 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2093 rlen = 2;
2094 }
2095
2096 {
2097 brw_push_insn_state(p);
2098 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2099 brw_set_mask_control(p, BRW_MASK_DISABLE);
2100
2101 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2102
2103 /* set message header global offset field (reg 0, element 2) */
2104 brw_MOV(p,
2105 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2106 mrf.nr,
2107 2), BRW_REGISTER_TYPE_UD),
2108 brw_imm_ud(offset));
2109
2110 brw_pop_insn_state(p);
2111 }
2112
2113 {
2114 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2115
2116 assert(insn->header.predicate_control == 0);
2117 insn->header.compression_control = BRW_COMPRESSION_NONE;
2118 insn->header.destreg__conditionalmod = mrf.nr;
2119
2120 brw_set_dest(p, insn, dest); /* UW? */
2121 if (brw->gen >= 6) {
2122 brw_set_src0(p, insn, mrf);
2123 } else {
2124 brw_set_src0(p, insn, brw_null_reg());
2125 }
2126
2127 brw_set_dp_read_message(p,
2128 insn,
2129 255, /* binding table index (255=stateless) */
2130 msg_control,
2131 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2132 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2133 1, /* msg_length */
2134 true, /* header_present */
2135 rlen);
2136 }
2137 }
2138
/**
 * Read a block of registers from the scratch buffer using the Gen7+
 * scratch block read message on the data cache data port.
 *
 * \param dest     first register to receive the data
 * \param num_regs number of full registers to read (1, 2 or 4)
 * \param offset   byte offset within the scratch buffer; divided by
 *                 REG_SIZE below, so it must be register-aligned
 */
void
gen7_block_read_scratch(struct brw_compile *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   /* Select the scratch-read message type in the extended descriptor. */
   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}
2180
2181 /**
2182 * Read a float[4] vector from the data port Data Cache (const buffer).
2183 * Location (in buffer) should be a multiple of 16.
2184 * Used for fetching shader constants.
2185 */
2186 void brw_oword_block_read(struct brw_compile *p,
2187 struct brw_reg dest,
2188 struct brw_reg mrf,
2189 uint32_t offset,
2190 uint32_t bind_table_index)
2191 {
2192 struct brw_context *brw = p->brw;
2193
2194 /* On newer hardware, offset is in units of owords. */
2195 if (brw->gen >= 6)
2196 offset /= 16;
2197
2198 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2199
2200 brw_push_insn_state(p);
2201 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2202 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2203 brw_set_mask_control(p, BRW_MASK_DISABLE);
2204
2205 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2206
2207 /* set message header global offset field (reg 0, element 2) */
2208 brw_MOV(p,
2209 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2210 mrf.nr,
2211 2), BRW_REGISTER_TYPE_UD),
2212 brw_imm_ud(offset));
2213
2214 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2215 insn->header.destreg__conditionalmod = mrf.nr;
2216
2217 /* cast dest to a uword[8] vector */
2218 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2219
2220 brw_set_dest(p, insn, dest);
2221 if (brw->gen >= 6) {
2222 brw_set_src0(p, insn, mrf);
2223 } else {
2224 brw_set_src0(p, insn, brw_null_reg());
2225 }
2226
2227 brw_set_dp_read_message(p,
2228 insn,
2229 bind_table_index,
2230 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2231 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2232 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2233 1, /* msg_length */
2234 true, /* header_present */
2235 1); /* response_length (1 reg, 2 owords!) */
2236
2237 brw_pop_insn_state(p);
2238 }
2239
2240
2241 void brw_fb_WRITE(struct brw_compile *p,
2242 int dispatch_width,
2243 unsigned msg_reg_nr,
2244 struct brw_reg src0,
2245 unsigned msg_control,
2246 unsigned binding_table_index,
2247 unsigned msg_length,
2248 unsigned response_length,
2249 bool eot,
2250 bool header_present)
2251 {
2252 struct brw_context *brw = p->brw;
2253 struct brw_instruction *insn;
2254 unsigned msg_type;
2255 struct brw_reg dest;
2256
2257 if (dispatch_width == 16)
2258 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2259 else
2260 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2261
2262 if (brw->gen >= 6) {
2263 insn = next_insn(p, BRW_OPCODE_SENDC);
2264 } else {
2265 insn = next_insn(p, BRW_OPCODE_SEND);
2266 }
2267 insn->header.compression_control = BRW_COMPRESSION_NONE;
2268
2269 if (brw->gen >= 6) {
2270 /* headerless version, just submit color payload */
2271 src0 = brw_message_reg(msg_reg_nr);
2272
2273 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2274 } else {
2275 insn->header.destreg__conditionalmod = msg_reg_nr;
2276
2277 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2278 }
2279
2280 brw_set_dest(p, insn, dest);
2281 brw_set_src0(p, insn, src0);
2282 brw_set_dp_write_message(p,
2283 insn,
2284 binding_table_index,
2285 msg_control,
2286 msg_type,
2287 msg_length,
2288 header_present,
2289 eot, /* last render target write */
2290 response_length,
2291 eot,
2292 0 /* send_commit_msg */);
2293 }
2294
2295
2296 /**
2297 * Texture sample instruction.
2298 * Note: the msg_type plus msg_length values determine exactly what kind
2299 * of sampling operation is performed. See volume 4, page 161 of docs.
2300 */
2301 void brw_SAMPLE(struct brw_compile *p,
2302 struct brw_reg dest,
2303 unsigned msg_reg_nr,
2304 struct brw_reg src0,
2305 unsigned binding_table_index,
2306 unsigned sampler,
2307 unsigned msg_type,
2308 unsigned response_length,
2309 unsigned msg_length,
2310 unsigned header_present,
2311 unsigned simd_mode,
2312 unsigned return_format)
2313 {
2314 struct brw_context *brw = p->brw;
2315 struct brw_instruction *insn;
2316
2317 if (msg_reg_nr != -1)
2318 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2319
2320 insn = next_insn(p, BRW_OPCODE_SEND);
2321 insn->header.predicate_control = 0; /* XXX */
2322
2323 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2324 *
2325 * "Instruction compression is not allowed for this instruction (that
2326 * is, send). The hardware behavior is undefined if this instruction is
2327 * set as compressed. However, compress control can be set to "SecHalf"
2328 * to affect the EMask generation."
2329 *
2330 * No similar wording is found in later PRMs, but there are examples
2331 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2332 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2333 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2334 */
2335 if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
2336 insn->header.compression_control = BRW_COMPRESSION_NONE;
2337
2338 if (brw->gen < 6)
2339 insn->header.destreg__conditionalmod = msg_reg_nr;
2340
2341 brw_set_dest(p, insn, dest);
2342 brw_set_src0(p, insn, src0);
2343 brw_set_sampler_message(p, insn,
2344 binding_table_index,
2345 sampler,
2346 msg_type,
2347 response_length,
2348 msg_length,
2349 header_present,
2350 simd_mode,
2351 return_format);
2352 }
2353
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into m<msg_reg_nr>.5, starting from g0.5.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
	        brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Prior to Gen6 the message register number lives in the header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2404
2405 static int
2406 brw_find_next_block_end(struct brw_compile *p, int start_offset)
2407 {
2408 int offset;
2409 void *store = p->store;
2410
2411 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2412 offset = next_offset(store, offset)) {
2413 struct brw_instruction *insn = store + offset;
2414
2415 switch (insn->header.opcode) {
2416 case BRW_OPCODE_ENDIF:
2417 case BRW_OPCODE_ELSE:
2418 case BRW_OPCODE_WHILE:
2419 case BRW_OPCODE_HALT:
2420 return offset;
2421 }
2422 }
2423
2424 return 0;
2425 }
2426
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the offset of the WHILE that closes the loop containing
 * start_offset; asserts if no such WHILE is found.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start_offset)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;   /* branch offsets are scaled by 8 bytes */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 encodes the backward jump in jump_count; Gen7+ in JIP. */
	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
				 : insn->bits3.break_cont.jip;
	 /* A WHILE whose target is at or before start_offset closes the
	  * loop we are inside of.
	  */
	 if (offset + jip * scale <= start_offset)
	    return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2456
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * No-op prior to Gen6.  Offsets are encoded in units of 8 bytes
 * relative to the instruction being patched.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;
   void *store = p->store;

   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
	 /* JIP: end of the innermost enclosing block. */
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
	 /* With no following block terminator, jump to the next
	  * instruction (one compacted-instruction unit of 2).
	  */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2533
2534 void brw_ff_sync(struct brw_compile *p,
2535 struct brw_reg dest,
2536 unsigned msg_reg_nr,
2537 struct brw_reg src0,
2538 bool allocate,
2539 unsigned response_length,
2540 bool eot)
2541 {
2542 struct brw_context *brw = p->brw;
2543 struct brw_instruction *insn;
2544
2545 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2546
2547 insn = next_insn(p, BRW_OPCODE_SEND);
2548 brw_set_dest(p, insn, dest);
2549 brw_set_src0(p, insn, src0);
2550 brw_set_src1(p, insn, brw_imm_d(0));
2551
2552 if (brw->gen < 6)
2553 insn->header.destreg__conditionalmod = msg_reg_nr;
2554
2555 brw_set_ff_sync_message(p,
2556 insn,
2557 allocate,
2558 response_length,
2559 eot);
2560 }
2561
2562 /**
2563 * Emit the SEND instruction necessary to generate stream output data on Gen6
2564 * (for transform feedback).
2565 *
2566 * If send_commit_msg is true, this is the last piece of stream output data
2567 * from this thread, so send the data as a committed write. According to the
2568 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2569 *
2570 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2571 * writes are complete by sending the final write as a committed write."
2572 */
2573 void
2574 brw_svb_write(struct brw_compile *p,
2575 struct brw_reg dest,
2576 unsigned msg_reg_nr,
2577 struct brw_reg src0,
2578 unsigned binding_table_index,
2579 bool send_commit_msg)
2580 {
2581 struct brw_instruction *insn;
2582
2583 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2584
2585 insn = next_insn(p, BRW_OPCODE_SEND);
2586 brw_set_dest(p, insn, dest);
2587 brw_set_src0(p, insn, src0);
2588 brw_set_src1(p, insn, brw_imm_d(0));
2589 brw_set_dp_write_message(p, insn,
2590 binding_table_index,
2591 0, /* msg_control: ignored */
2592 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2593 1, /* msg_length */
2594 true, /* header_present */
2595 0, /* last_render_target: ignored */
2596 send_commit_msg, /* response_length */
2597 0, /* end_of_thread */
2598 send_commit_msg); /* send_commit_msg */
2599 }
2600
/**
 * Fill in the message descriptor for an untyped atomic operation on the
 * data cache data port.
 *
 * Haswell routes this through data cache port 1 with separate align1 and
 * SIMD4x2 message types; other Gen7 parts use the legacy data cache SFID.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
				 msg_length, response_length,
				 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
	 if (insn->header.execution_size != BRW_EXECUTE_16)
	    insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

	 insn->bits3.gen7_dp.msg_type =
	    HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
	 insn->bits3.gen7_dp.msg_type =
	    HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }

   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
				 msg_length, response_length,
				 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
	 insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   insn->bits3.ud |= atomic_op << 8; /* atomic operation selector */
}
2644
2645 void
2646 brw_untyped_atomic(struct brw_compile *p,
2647 struct brw_reg dest,
2648 struct brw_reg mrf,
2649 unsigned atomic_op,
2650 unsigned bind_table_index,
2651 unsigned msg_length,
2652 unsigned response_length) {
2653 struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2654
2655 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2656 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2657 brw_set_src1(p, insn, brw_imm_d(0));
2658 brw_set_dp_untyped_atomic_message(
2659 p, insn, atomic_op, bind_table_index, msg_length, response_length,
2660 insn->header.access_mode == BRW_ALIGN_1);
2661 }
2662
/**
 * Fill in the message descriptor for an untyped surface read from the
 * data cache data port.
 *
 * The mask of 32-bit channels to drop is derived from response_length
 * relative to the dispatch width.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
                                        struct brw_instruction *insn,
                                        unsigned bind_table_index,
                                        unsigned msg_length,
                                        unsigned response_length,
                                        bool header_present)
{
   const unsigned dispatch_width =
      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
   /* Each channel occupies one register per 8 dispatched invocations. */
   const unsigned num_channels = response_length / (dispatch_width / 8);

   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
   }

   /* SIMD mode is only encoded in align1. */
   if (insn->header.access_mode == BRW_ALIGN_1) {
      if (dispatch_width == 16)
	 insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
      else
	 insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
   }

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;

   /* Set mask of 32-bit channels to drop. */
   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
}
2701
2702 void
2703 brw_untyped_surface_read(struct brw_compile *p,
2704 struct brw_reg dest,
2705 struct brw_reg mrf,
2706 unsigned bind_table_index,
2707 unsigned msg_length,
2708 unsigned response_length)
2709 {
2710 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2711
2712 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2713 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2714 brw_set_dp_untyped_surface_read_message(
2715 p, insn, bind_table_index, msg_length, response_length,
2716 insn->header.access_mode == BRW_ALIGN_1);
2717 }
2718
2719 /**
2720 * This instruction is generated as a single-channel align1 instruction by
2721 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2722 *
2723 * We can't use the typed atomic op in the FS because that has the execution
2724 * mask ANDed with the pixel mask, but we just want to write the one dword for
2725 * all the pixels.
2726 *
2727 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2728 * one u32. So we use the same untyped atomic write message as the pixel
2729 * shader.
2730 *
2731 * The untyped atomic operation requires a BUFFER surface type with RAW
2732 * format, and is only accessible through the legacy DATA_CACHE dataport
2733 * messages.
2734 */
2735 void brw_shader_time_add(struct brw_compile *p,
2736 struct brw_reg payload,
2737 uint32_t surf_index)
2738 {
2739 struct brw_context *brw = p->brw;
2740 assert(brw->gen >= 7);
2741
2742 brw_push_insn_state(p);
2743 brw_set_access_mode(p, BRW_ALIGN_1);
2744 brw_set_mask_control(p, BRW_MASK_DISABLE);
2745 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2746 brw_pop_insn_state(p);
2747
2748 /* We use brw_vec1_reg and unmasked because we want to increment the given
2749 * offset only once.
2750 */
2751 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2752 BRW_ARF_NULL, 0));
2753 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2754 payload.nr, 0));
2755 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2756 2 /* message length */,
2757 0 /* response length */,
2758 false /* header present */);
2759 }