c566aec39c47c73cd11589163d365357bd010420
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 unsigned msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102 /**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107 unsigned
108 brw_reg_type_to_hw_type(const struct brw_context *brw,
109 enum brw_reg_type type, unsigned file)
110 {
111 if (file == BRW_IMMEDIATE_VALUE) {
112 const static int imm_hw_types[] = {
113 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
115 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
117 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
118 [BRW_REGISTER_TYPE_UB] = -1,
119 [BRW_REGISTER_TYPE_B] = -1,
120 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
123 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
127 };
128 assert(type < ARRAY_SIZE(imm_hw_types));
129 assert(imm_hw_types[type] != -1);
130 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131 return imm_hw_types[type];
132 } else {
133 /* Non-immediate registers */
134 const static int hw_types[] = {
135 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
137 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
139 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
141 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
142 [BRW_REGISTER_TYPE_UV] = -1,
143 [BRW_REGISTER_TYPE_VF] = -1,
144 [BRW_REGISTER_TYPE_V] = -1,
145 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
149 };
150 assert(type < ARRAY_SIZE(hw_types));
151 assert(hw_types[type] != -1);
152 assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154 return hw_types[type];
155 }
156 }
157
/**
 * Set the destination register fields of \p insn from \p dest.
 *
 * Encodes the register file, type, address mode, register/subregister
 * numbers, writemask (Align16 only) and horizontal stride, then derives the
 * instruction's execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7, MRFs are emulated with high GRF numbers. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A stride-0 (scalar) destination is encoded as stride 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      } else {
	 /* Align16: the subregister field is in units of 16 bytes. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
	     dest.file == BRW_MESSAGE_REGISTER_FILE) {
	    assert(dest.dw1.bits.writemask != 0);
	 }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  * Although Dst.HorzStride is a don't care for Align16, HW needs
	  * this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   } else {
      /* Register-indirect destination. */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same scalar-destination stride fixup as the direct case. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      } else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
216
217 extern int reg_type_size[];
218
219 static void
220 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
221 {
222 int hstride_for_reg[] = {0, 1, 2, 4};
223 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
224 int width_for_reg[] = {1, 2, 4, 8, 16};
225 int execsize_for_reg[] = {1, 2, 4, 8, 16};
226 int width, hstride, vstride, execsize;
227
228 if (reg.file == BRW_IMMEDIATE_VALUE) {
229 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
230 * mean the destination has to be 128-bit aligned and the
231 * destination horiz stride has to be a word.
232 */
233 if (reg.type == BRW_REGISTER_TYPE_V) {
234 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
235 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
236 }
237
238 return;
239 }
240
241 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
242 reg.file == BRW_ARF_NULL)
243 return;
244
245 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
246 hstride = hstride_for_reg[reg.hstride];
247
248 if (reg.vstride == 0xf) {
249 vstride = -1;
250 } else {
251 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
252 vstride = vstride_for_reg[reg.vstride];
253 }
254
255 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
256 width = width_for_reg[reg.width];
257
258 assert(insn->header.execution_size >= 0 &&
259 insn->header.execution_size < Elements(execsize_for_reg));
260 execsize = execsize_for_reg[insn->header.execution_size];
261
262 /* Restrictions from 3.3.10: Register Region Restrictions. */
263 /* 3. */
264 assert(execsize >= width);
265
266 /* 4. */
267 if (execsize == width && hstride != 0) {
268 assert(vstride == -1 || vstride == width * hstride);
269 }
270
271 /* 5. */
272 if (execsize == width && hstride == 0) {
273 /* no restriction on vstride. */
274 }
275
276 /* 6. */
277 if (width == 1) {
278 assert(hstride == 0);
279 }
280
281 /* 7. */
282 if (execsize == 1 && width == 1) {
283 assert(hstride == 0);
284 assert(vstride == 0);
285 }
286
287 /* 8. */
288 if (vstride == 0 && hstride == 0) {
289 assert(width == 1);
290 }
291
292 /* 10. Check destination issues. */
293 }
294
/**
 * Return true if \p imm fits the 13-bit compacted-immediate encoding:
 * the low 12 bits are stored verbatim, and bits 31:12 are a single
 * replicated sign bit (so they must be all zeros or all ones).
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm & ~0xfffu;
   return high_bits == 0u || high_bits == 0xfffff000u;
}
304
305 void
306 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
307 struct brw_reg reg)
308 {
309 struct brw_context *brw = p->brw;
310
311 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
312 assert(reg.nr < 128);
313
314 gen7_convert_mrf_to_grf(p, &reg);
315
316 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
317 insn->header.opcode == BRW_OPCODE_SENDC)) {
318 /* Any source modifiers or regions will be ignored, since this just
319 * identifies the MRF/GRF to start reading the message contents from.
320 * Check for some likely failures.
321 */
322 assert(!reg.negate);
323 assert(!reg.abs);
324 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
325 }
326
327 validate_reg(insn, reg);
328
329 insn->bits1.da1.src0_reg_file = reg.file;
330 insn->bits1.da1.src0_reg_type =
331 brw_reg_type_to_hw_type(brw, reg.type, reg.file);
332 insn->bits2.da1.src0_abs = reg.abs;
333 insn->bits2.da1.src0_negate = reg.negate;
334 insn->bits2.da1.src0_address_mode = reg.address_mode;
335
336 if (reg.file == BRW_IMMEDIATE_VALUE) {
337 insn->bits3.ud = reg.dw1.ud;
338
339 /* The Bspec's section titled "Non-present Operands" claims that if src0
340 * is an immediate that src1's type must be the same as that of src0.
341 *
342 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
343 * that do not follow this rule. E.g., from the IVB/HSW table:
344 *
345 * DataTypeIndex 18-Bit Mapping Mapped Meaning
346 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
347 *
348 * And from the SNB table:
349 *
350 * DataTypeIndex 18-Bit Mapping Mapped Meaning
351 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
352 *
353 * Neither of these cause warnings from the simulator when used,
354 * compacted or otherwise. In fact, all compaction mappings that have an
355 * immediate in src0 use a:ud for src1.
356 *
357 * The GM45 instruction compaction tables do not contain mapped meanings
358 * so it's not clear whether it has the restriction. We'll assume it was
359 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
360 */
361 insn->bits1.da1.src1_reg_file = 0; /* arf */
362 if (brw->gen < 6) {
363 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
364 } else {
365 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
366 }
367
368 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
369 * for immediate values. Presumably the hardware engineers realized
370 * that the only useful floating-point value that could be represented
371 * in this format is 0.0, which can also be represented as a VF-typed
372 * immediate, so they gave us the previously mentioned mapping on IVB+.
373 *
374 * Strangely, we do have a mapping for imm:f in src1, so we don't need
375 * to do this there.
376 *
377 * If we see a 0.0:F, change the type to VF so that it can be compacted.
378 */
379 if (insn->bits3.ud == 0x0 &&
380 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
381 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
382 }
383
384 /* There are no mappings for dst:d | i:d, so if the immediate is suitable
385 * set the types to :UD so the instruction can be compacted.
386 */
387 if (is_compactable_immediate(insn->bits3.ud) &&
388 insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE &&
389 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D &&
390 insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) {
391 insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD;
392 insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD;
393 }
394 } else {
395 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
396 if (insn->header.access_mode == BRW_ALIGN_1) {
397 insn->bits2.da1.src0_subreg_nr = reg.subnr;
398 insn->bits2.da1.src0_reg_nr = reg.nr;
399 } else {
400 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
401 insn->bits2.da16.src0_reg_nr = reg.nr;
402 }
403 } else {
404 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
405
406 if (insn->header.access_mode == BRW_ALIGN_1) {
407 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
408 } else {
409 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
410 }
411 }
412
413 if (insn->header.access_mode == BRW_ALIGN_1) {
414 if (reg.width == BRW_WIDTH_1 &&
415 insn->header.execution_size == BRW_EXECUTE_1) {
416 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
417 insn->bits2.da1.src0_width = BRW_WIDTH_1;
418 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
419 } else {
420 insn->bits2.da1.src0_horiz_stride = reg.hstride;
421 insn->bits2.da1.src0_width = reg.width;
422 insn->bits2.da1.src0_vert_stride = reg.vstride;
423 }
424 } else {
425 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
426 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
427 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
428 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
429
430 /* This is an oddity of the fact we're using the same
431 * descriptions for registers in align_16 as align_1:
432 */
433 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
434 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
435 else
436 insn->bits2.da16.src0_vert_stride = reg.vstride;
437 }
438 }
439 }
440
441
442 void
443 brw_set_src1(struct brw_compile *p,
444 struct brw_instruction *insn,
445 struct brw_reg reg)
446 {
447 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
448
449 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
450 assert(reg.nr < 128);
451
452 gen7_convert_mrf_to_grf(p, &reg);
453
454 validate_reg(insn, reg);
455
456 insn->bits1.da1.src1_reg_file = reg.file;
457 insn->bits1.da1.src1_reg_type =
458 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
459 insn->bits3.da1.src1_abs = reg.abs;
460 insn->bits3.da1.src1_negate = reg.negate;
461
462 /* Only src1 can be immediate in two-argument instructions.
463 */
464 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
465
466 if (reg.file == BRW_IMMEDIATE_VALUE) {
467 insn->bits3.ud = reg.dw1.ud;
468 } else {
469 /* This is a hardware restriction, which may or may not be lifted
470 * in the future:
471 */
472 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
473 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
474
475 if (insn->header.access_mode == BRW_ALIGN_1) {
476 insn->bits3.da1.src1_subreg_nr = reg.subnr;
477 insn->bits3.da1.src1_reg_nr = reg.nr;
478 } else {
479 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
480 insn->bits3.da16.src1_reg_nr = reg.nr;
481 }
482
483 if (insn->header.access_mode == BRW_ALIGN_1) {
484 if (reg.width == BRW_WIDTH_1 &&
485 insn->header.execution_size == BRW_EXECUTE_1) {
486 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
487 insn->bits3.da1.src1_width = BRW_WIDTH_1;
488 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
489 } else {
490 insn->bits3.da1.src1_horiz_stride = reg.hstride;
491 insn->bits3.da1.src1_width = reg.width;
492 insn->bits3.da1.src1_vert_stride = reg.vstride;
493 }
494 } else {
495 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
496 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
497 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
498 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
499
500 /* This is an oddity of the fact we're using the same
501 * descriptions for registers in align_16 as align_1:
502 */
503 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
504 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
505 else
506 insn->bits3.da16.src1_vert_stride = reg.vstride;
507 }
508 }
509 }
510
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \param sfid             shared-function ID the message targets
 * \param msg_length       payload length, in registers
 * \param response_length  expected response length, in registers
 * \param header_present   whether the payload begins with a message header
 * \param end_of_thread    whether this message terminates the thread
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Zero the descriptor dword (src1 of the SEND). */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Pre-Gen5 layout: everything lives in the descriptor dword. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
553
/**
 * Fill out the Math-SFID message descriptor for \p insn.
 *
 * Message and response lengths are inferred from \p function: POW and the
 * integer-divide variants take a two-register payload; SINCOS and
 * quotient-and-remainder produce a two-register response.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  unsigned function,
				  unsigned integer_type,
				  bool low_precision,
				  unsigned dataType )
{
   struct brw_context *brw = p->brw;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   if (brw->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   /* Saturation was folded into the message descriptor above, so clear the
    * instruction-level bit to avoid applying it twice.
    */
   insn->header.saturate = 0;
}
608
609
/**
 * Fill out an URB FF_SYNC message descriptor (fixed-function thread
 * synchronization).  The payload is always a single header register.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
625
/**
 * Fill out an URB write message descriptor for \p insn.
 *
 * Selects the generation-specific descriptor layout (Gen7, Gen5/6, or
 * pre-Gen5) and translates the BRW_URB_WRITE_* \p flags into the
 * corresponding descriptor bits.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
	 assert(msg_length == 2); /* header + one OWORD of data */
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes do not support transpose swizzling. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
668
/**
 * Fill out a data-port write message descriptor for \p insn.
 *
 * Chooses the shared-function ID appropriate to the hardware generation
 * (render cache vs. data cache) and then encodes the binding-table index,
 * message control/type, and related bits in the generation-specific layout.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 unsigned last_render_target,
			 unsigned response_length,
			 unsigned end_of_thread,
			 unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
726
/**
 * Fill out a data-port read message descriptor for \p insn.
 *
 * Chooses the shared-function ID appropriate to the hardware generation
 * (and, on Gen6, to the requested \p target_cache) and then encodes the
 * binding-table index and message control/type in the generation-specific
 * layout.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
			bool header_present,
			unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
783
/**
 * Fill out a sampler message descriptor for \p insn in the layout
 * appropriate to the hardware generation (Gen7+, Gen5/6, G4x, or original).
 *
 * \p return_format is only encoded on the oldest layout; later generations
 * dropped the field.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
822
823
824 #define next_insn brw_next_insn
825 struct brw_instruction *
826 brw_next_insn(struct brw_compile *p, unsigned opcode)
827 {
828 struct brw_instruction *insn;
829
830 if (p->nr_insn + 1 > p->store_size) {
831 p->store_size <<= 1;
832 p->store = reralloc(p->mem_ctx, p->store,
833 struct brw_instruction, p->store_size);
834 }
835
836 p->next_insn_offset += 16;
837 insn = &p->store[p->nr_insn++];
838 memcpy(insn, p->current, sizeof(*insn));
839
840 insn->header.opcode = opcode;
841 return insn;
842 }
843
844 static struct brw_instruction *brw_alu1( struct brw_compile *p,
845 unsigned opcode,
846 struct brw_reg dest,
847 struct brw_reg src )
848 {
849 struct brw_instruction *insn = next_insn(p, opcode);
850 brw_set_dest(p, insn, dest);
851 brw_set_src0(p, insn, src);
852 return insn;
853 }
854
855 static struct brw_instruction *brw_alu2(struct brw_compile *p,
856 unsigned opcode,
857 struct brw_reg dest,
858 struct brw_reg src0,
859 struct brw_reg src1 )
860 {
861 struct brw_instruction *insn = next_insn(p, opcode);
862 brw_set_dest(p, insn, dest);
863 brw_set_src0(p, insn, src0);
864 brw_set_src1(p, insn, src1);
865 return insn;
866 }
867
868 static int
869 get_3src_subreg_nr(struct brw_reg reg)
870 {
871 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
872 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
873 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
874 } else {
875 return reg.subnr / 4;
876 }
877 }
878
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2).
 *
 * 3-source instructions use a different, more restrictive encoding than
 * 1/2-source ones: Align16 only, GRF (or MRF destination) only, direct
 * addressing only, and a shared source type for all three sources.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   /* Gen7 has no MRFs; rewrite an MRF destination as its GRF alias. */
   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Destination subregister is encoded in 16-byte units here. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar (vstride 0) source across channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
962
963
964 /***********************************************************************
965 * Convenience routines.
966 */
/* Generate brw_<OP>(): emit a one-source instruction with
 * opcode BRW_OPCODE_<OP>.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Generate brw_<OP>(): emit a two-source instruction with
 * opcode BRW_OPCODE_<OP>.
 */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Generate brw_<OP>(): emit a three-source instruction with
 * opcode BRW_OPCODE_<OP>.
 */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Same as ALU3, but assert that all four operands are float-typed,
 * since brw_alu3() derives a single shared type from the destination.
 */
#define ALU3F(OP)                                               \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_context *brw = p->brw;					      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (brw->gen < 6) {							      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
1033
1034
/* Instantiate one public emit helper per simple opcode; the generated
 * signatures are defined by the ALU1/ALU2/ALU3/ALU3F/ROUND macros above.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1071
1072
1073 struct brw_instruction *brw_ADD(struct brw_compile *p,
1074 struct brw_reg dest,
1075 struct brw_reg src0,
1076 struct brw_reg src1)
1077 {
1078 /* 6.2.2: add */
1079 if (src0.type == BRW_REGISTER_TYPE_F ||
1080 (src0.file == BRW_IMMEDIATE_VALUE &&
1081 src0.type == BRW_REGISTER_TYPE_VF)) {
1082 assert(src1.type != BRW_REGISTER_TYPE_UD);
1083 assert(src1.type != BRW_REGISTER_TYPE_D);
1084 }
1085
1086 if (src1.type == BRW_REGISTER_TYPE_F ||
1087 (src1.file == BRW_IMMEDIATE_VALUE &&
1088 src1.type == BRW_REGISTER_TYPE_VF)) {
1089 assert(src0.type != BRW_REGISTER_TYPE_UD);
1090 assert(src0.type != BRW_REGISTER_TYPE_D);
1091 }
1092
1093 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1094 }
1095
1096 struct brw_instruction *brw_AVG(struct brw_compile *p,
1097 struct brw_reg dest,
1098 struct brw_reg src0,
1099 struct brw_reg src1)
1100 {
1101 assert(dest.type == src0.type);
1102 assert(src0.type == src1.type);
1103 switch (src0.type) {
1104 case BRW_REGISTER_TYPE_B:
1105 case BRW_REGISTER_TYPE_UB:
1106 case BRW_REGISTER_TYPE_W:
1107 case BRW_REGISTER_TYPE_UW:
1108 case BRW_REGISTER_TYPE_D:
1109 case BRW_REGISTER_TYPE_UD:
1110 break;
1111 default:
1112 assert(!"Bad type for brw_AVG");
1113 }
1114
1115 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1116 }
1117
1118 struct brw_instruction *brw_MUL(struct brw_compile *p,
1119 struct brw_reg dest,
1120 struct brw_reg src0,
1121 struct brw_reg src1)
1122 {
1123 /* 6.32.38: mul */
1124 if (src0.type == BRW_REGISTER_TYPE_D ||
1125 src0.type == BRW_REGISTER_TYPE_UD ||
1126 src1.type == BRW_REGISTER_TYPE_D ||
1127 src1.type == BRW_REGISTER_TYPE_UD) {
1128 assert(dest.type != BRW_REGISTER_TYPE_F);
1129 }
1130
1131 if (src0.type == BRW_REGISTER_TYPE_F ||
1132 (src0.file == BRW_IMMEDIATE_VALUE &&
1133 src0.type == BRW_REGISTER_TYPE_VF)) {
1134 assert(src1.type != BRW_REGISTER_TYPE_UD);
1135 assert(src1.type != BRW_REGISTER_TYPE_D);
1136 }
1137
1138 if (src1.type == BRW_REGISTER_TYPE_F ||
1139 (src1.file == BRW_IMMEDIATE_VALUE &&
1140 src1.type == BRW_REGISTER_TYPE_VF)) {
1141 assert(src0.type != BRW_REGISTER_TYPE_UD);
1142 assert(src0.type != BRW_REGISTER_TYPE_D);
1143 }
1144
1145 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1146 src0.nr != BRW_ARF_ACCUMULATOR);
1147 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1148 src1.nr != BRW_ARF_ACCUMULATOR);
1149
1150 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1151 }
1152
1153
1154 void brw_NOP(struct brw_compile *p)
1155 {
1156 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1157 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1158 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1159 brw_set_src1(p, insn, brw_imm_ud(0x0));
1160 }
1161
1162
1163
1164
1165
1166 /***********************************************************************
1167 * Comparisons, if/else/endif
1168 */
1169
1170 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1171 struct brw_reg index,
1172 unsigned predicate_control)
1173 {
1174 struct brw_reg ip = brw_ip_reg();
1175 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1176
1177 insn->header.execution_size = 1;
1178 insn->header.compression_control = BRW_COMPRESSION_NONE;
1179 insn->header.mask_control = BRW_MASK_DISABLE;
1180 insn->header.predicate_control = predicate_control;
1181
1182 return insn;
1183 }
1184
/* Record an IF or ELSE instruction on the if-stack so that the matching
 * brw_ENDIF() can find and patch it.  The instruction is stored as an
 * index into p->store, because next_insn() may reallocate the store and
 * invalidate raw pointers.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow the stack whenever the *next* push would run off the end. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1197
1198 static struct brw_instruction *
1199 pop_if_stack(struct brw_compile *p)
1200 {
1201 p->if_stack_depth--;
1202 return &p->store[p->if_stack[p->if_stack_depth]];
1203 }
1204
1205 static void
1206 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1207 {
1208 if (p->loop_stack_array_size < p->loop_stack_depth) {
1209 p->loop_stack_array_size *= 2;
1210 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1211 p->loop_stack_array_size);
1212 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1213 p->loop_stack_array_size);
1214 }
1215
1216 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1217 p->loop_stack_depth++;
1218 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1219 }
1220
1221 static struct brw_instruction *
1222 get_inner_do_insn(struct brw_compile *p)
1223 {
1224 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1225 }
1226
1227 /* EU takes the value from the flag register and pushes it onto some
1228 * sort of a stack (presumably merging with any flag value already on
1229 * the stack). Within an if block, the flags at the top of the stack
1230 * control execution on each channel of the unit, eg. on each of the
1231 * 16 pixel values in our wm programs.
1232 *
1233 * When the matching 'else' instruction is reached (presumably by
1234 * countdown of the instruction count patched in by our ELSE/ENDIF
1235 * functions), the relevent flags are inverted.
1236 *
1237 * When the matching 'endif' instruction is reached, the flags are
1238 * popped off. If the stack is now empty, normal execution resumes.
1239 */
/* Emit an IF instruction with a pre-evaluated predicate and push it on
 * the if-stack for later patching by brw_ELSE()/brw_ENDIF().
 *
 * The operand encoding differs per generation: pre-gen6 IF operates on
 * IP with an if/else jump count; gen6 encodes the jump count in the
 * destination word; gen7+ uses null operands with JIP/UIP fields, all
 * filled in later by patch_IF_ELSE().
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Pre-gen6 flow control causes an implied thread switch. */
   if (!p->single_program_flow && brw->gen < 6)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1278
1279 /* This function is only used for gen6-style IF instructions with an
1280 * embedded comparison (conditional modifier). It is not used on gen7.
1281 */
1282 struct brw_instruction *
1283 gen6_IF(struct brw_compile *p, uint32_t conditional,
1284 struct brw_reg src0, struct brw_reg src1)
1285 {
1286 struct brw_instruction *insn;
1287
1288 insn = next_insn(p, BRW_OPCODE_IF);
1289
1290 brw_set_dest(p, insn, brw_imm_w(0));
1291 if (p->compressed) {
1292 insn->header.execution_size = BRW_EXECUTE_16;
1293 } else {
1294 insn->header.execution_size = BRW_EXECUTE_8;
1295 }
1296 insn->bits1.branch_gen6.jump_count = 0;
1297 brw_set_src0(p, insn, src0);
1298 brw_set_src1(p, insn, src1);
1299
1300 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1301 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1302 insn->header.destreg__conditionalmod = conditional;
1303
1304 push_if_stack(p, insn);
1305 return insn;
1306 }
1307
1308 /**
1309 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1310 */
/* Rewrite an IF (and optional ELSE) pair as predicated ADDs to IP, for
 * single-program-flow mode on gen4/5 where real flow control would cost
 * a thread switch.  Jump distances are in bytes (16 per instruction).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* "+ 1" skips the ELSE itself; offsets are 16 bytes/instruction. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1347
1348 /**
1349 * Patch IF and ELSE instructions with appropriate jump targets.
1350 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF() once the location of the ENDIF is known.  The
 * encoding of the jump distances is generation-specific: pre-gen6 uses
 * if/else jump counts (IFF when there is no ELSE), gen6 uses a jump
 * count in the destination word, and gen7+ uses JIP/UIP fields.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1437
/* Emit an ELSE instruction and push it on the if-stack.  Its jump
 * target(s) are left zero and filled in by patch_IF_ELSE() when the
 * matching brw_ENDIF() is emitted.  Operand encoding per generation
 * mirrors brw_IF().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Pre-gen6 flow control causes an implied thread switch. */
   if (!p->single_program_flow && brw->gen < 6)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1470
/* Close the innermost IF/ELSE block: pop the stack, emit the ENDIF
 * (unless running in SPF mode on gen4/5, where the IF/ELSE are instead
 * converted to predicated ADDs), and patch all jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Generation-specific operand encoding, as in brw_IF()/brw_ELSE(). */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (brw->gen < 6)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1550
/* Emit a BREAK out of the innermost loop.  On pre-gen6 the instruction
 * carries a pop count for the nested IF mask-stack entries inside the
 * loop; on gen6+ the JIP/UIP offsets are filled in later (see
 * brw_set_uip_jip / brw_patch_break_cont).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop the mask-stack entries of all open IFs inside this loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1573
1574 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1575 {
1576 struct brw_instruction *insn;
1577
1578 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1579 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1580 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1581 brw_set_dest(p, insn, brw_ip_reg());
1582 brw_set_src0(p, insn, brw_ip_reg());
1583 brw_set_src1(p, insn, brw_imm_d(0x0));
1584
1585 insn->header.compression_control = BRW_COMPRESSION_NONE;
1586 insn->header.execution_size = BRW_EXECUTE_8;
1587 return insn;
1588 }
1589
/* Emit a pre-gen6 CONTINUE.  The jump count is patched later by
 * brw_patch_break_cont(); pop_count unwinds the mask-stack entries of
 * the IFs currently open inside this loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1604
1605 struct brw_instruction *gen6_HALT(struct brw_compile *p)
1606 {
1607 struct brw_instruction *insn;
1608
1609 insn = next_insn(p, BRW_OPCODE_HALT);
1610 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1611 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1612 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1613
1614 if (p->compressed) {
1615 insn->header.execution_size = BRW_EXECUTE_16;
1616 } else {
1617 insn->header.compression_control = BRW_COMPRESSION_NONE;
1618 insn->header.execution_size = BRW_EXECUTE_8;
1619 }
1620 return insn;
1621 }
1622
1623 /* DO/WHILE loop:
1624 *
1625 * The DO/WHILE is just an unterminated loop -- break or continue are
1626 * used for control within the loop. We have a few ways they can be
1627 * done.
1628 *
1629 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1630 * jip and no DO instruction.
1631 *
1632 * For non-uniform control flow pre-gen6, there's a DO instruction to
1633 * push the mask, and a WHILE to jump back, and BREAK to get out and
1634 * pop the mask.
1635 *
1636 * For gen6, there's no more mask stack, so no need for DO. WHILE
1637 * just points back to the first instruction of the loop.
1638 */
/* Open a loop.  On gen6+ (and in SPF mode) no DO instruction exists, so
 * the next instruction slot simply marks the loop head; pre-gen6 emits a
 * real DO that pushes the execution mask.  Either way the loop head is
 * recorded on the loop stack for brw_WHILE().
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No instruction emitted: the loop head is the next slot. */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1666
1667 /**
1668 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1669 * instruction here.
1670 *
1671 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1672 * nesting, since it can always just point to the end of the block/current loop.
1673 */
/* Walk the just-closed pre-gen6 loop body and patch every not-yet-patched
 * BREAK/CONTINUE to jump relative to the WHILE: BREAK lands just past the
 * WHILE, CONTINUE lands on it.  Jump counts are in 64-bit chunks, so gen5's
 * 128-bit instructions use a scale of 2.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (brw->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1696
/* Close the innermost loop with a WHILE that jumps back to the loop head.
 *
 * Encoding per generation: gen7+ puts the backward offset in JIP; gen6
 * puts it in the destination-word jump count; pre-gen6 uses an if/else
 * jump count (or a plain ADD to IP in single-program-flow mode) and also
 * patches any BREAK/CONT inside the loop.  Jump counts are in 64-bit
 * chunks, hence the scale of 2 from gen5 on.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   unsigned br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Negative offset: jump back to the instruction after DO. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: the loop is uniform, so a scalar ADD to IP suffices. */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   p->loop_stack_depth--;

   return insn;
}
1759
1760 /* FORWARD JUMPS:
1761 */
1762 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1763 {
1764 struct brw_context *brw = p->brw;
1765 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1766 unsigned jmpi = 1;
1767
1768 if (brw->gen >= 5)
1769 jmpi = 2;
1770
1771 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1772 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1773
1774 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1775 }
1776
1777 /* To integrate with the above, it makes sense that the comparison
1778 * instruction should populate the flag register. It might be simpler
1779 * just to use the flag reg for most WM tasks?
1780 */
/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1809
1810 /***********************************************************************
1811 * Helpers for the various SEND message types:
1812 */
1813
1814 /** Extended math function, float[8].
1815 */
/** Extended math function, float[8].
 *
 * Pre-gen6 math is a SEND to the shared math function unit; the message
 * register number goes in the destreg/conditional-mod field and the
 * function, precision, and data type are encoded in the message
 * descriptor by brw_set_math_message().
 */
void gen4_math(struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned data_type,
	       unsigned precision )
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(brw->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   insn->header.predicate_control = 0;
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			src.type == BRW_REGISTER_TYPE_D,
			precision,
			data_type);
}
1844
/**
 * Extended math function on Gen6+.
 *
 * Unlike gen4_math(), this is encoded as a regular two-source MATH
 * instruction rather than a SEND to the shared math unit.  The asserts
 * below encode the hardware's operand restrictions for each function.
 */
void gen6_math(struct brw_compile *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(brw->gen >= 6);

   /* MATH may only write the GRF (or, from Gen7 on, the MRF). */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   /* Gen6 additionally requires packed (stride-1) sources. */
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division: integer-typed sources, and src1 must live in
       * the GRF.
       */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   } else {
      /* All other math functions are float-typed; only POW actually reads
       * src1 -- the single-source functions must pass the null register.
       */
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      } else {
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1900
1901
1902 /**
1903 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1904 * using a constant offset per channel.
1905 *
1906 * The offset must be aligned to oword size (16 bytes). Used for
1907 * register spilling.
1908 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On gen6+ the message header takes the offset in owords (16 bytes)
    * rather than bytes.
    */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One GRF is two owords; mlen counts the header register plus the data
    * payload.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* SEND may not be compressed; widen the header source instead. */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2010
2011
2012 /**
2013 * Read a block of owords (half a GRF each) from the scratch buffer
2014 * using a constant index per channel.
2015 *
2016 * Offset must be aligned to oword size (16 bytes). Used for register
2017 * spilling.
2018 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control;
   int rlen;

   /* On gen6+ the message header takes the offset in owords (16 bytes)
    * rather than bytes.
    */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* One GRF is two owords; rlen is the number of GRFs returned. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: a copy of g0 with the global
    * offset written into element 2 (same scheme as the scratch write above).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
			      true, /* header_present */
			      rlen);
   }
}
2086
/**
 * Gen7 scratch block read.  Unlike the oword-block path above, the scratch
 * offset and register count are encoded directly in the message descriptor
 * (insn->bits3), so no per-message header needs to be assembled -- g0 is
 * passed through unmodified as the required header.
 */
void
gen7_block_read_scratch(struct brw_compile *p,
			struct brw_reg dest,
			int num_regs,
			unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   /* The descriptor's register-count field encodes 1, 2, or 4 GRFs. */
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}
2128
2129 /**
2130 * Read a float[4] vector from the data port Data Cache (const buffer).
2131 * Location (in buffer) should be a multiple of 16.
2132 * Used for fetching shader constants.
2133 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* The header setup and the SEND itself must be unpredicated,
    * uncompressed, and unmasked; restore caller state afterwards.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Message header is a copy of g0 with the buffer offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
			   true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2187
2188
2189 void brw_fb_WRITE(struct brw_compile *p,
2190 int dispatch_width,
2191 unsigned msg_reg_nr,
2192 struct brw_reg src0,
2193 unsigned msg_control,
2194 unsigned binding_table_index,
2195 unsigned msg_length,
2196 unsigned response_length,
2197 bool eot,
2198 bool header_present)
2199 {
2200 struct brw_context *brw = p->brw;
2201 struct brw_instruction *insn;
2202 unsigned msg_type;
2203 struct brw_reg dest;
2204
2205 if (dispatch_width == 16)
2206 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2207 else
2208 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2209
2210 if (brw->gen >= 6) {
2211 insn = next_insn(p, BRW_OPCODE_SENDC);
2212 } else {
2213 insn = next_insn(p, BRW_OPCODE_SEND);
2214 }
2215 insn->header.compression_control = BRW_COMPRESSION_NONE;
2216
2217 if (brw->gen >= 6) {
2218 /* headerless version, just submit color payload */
2219 src0 = brw_message_reg(msg_reg_nr);
2220
2221 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2222 } else {
2223 insn->header.destreg__conditionalmod = msg_reg_nr;
2224
2225 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2226 }
2227
2228 brw_set_dest(p, insn, dest);
2229 brw_set_src0(p, insn, src0);
2230 brw_set_dp_write_message(p,
2231 insn,
2232 binding_table_index,
2233 msg_control,
2234 msg_type,
2235 msg_length,
2236 header_present,
2237 eot, /* last render target write */
2238 response_length,
2239 eot,
2240 0 /* send_commit_msg */);
2241 }
2242
2243
2244 /**
2245 * Texture sample instruction.
2246 * Note: the msg_type plus msg_length values determine exactly what kind
2247 * of sampling operation is performed. See volume 4, page 161 of docs.
2248 */
void brw_SAMPLE(struct brw_compile *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* msg_reg_nr == -1 means the caller manages the message registers
    * itself; skip the implied move of src0 into the MRF.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
      insn->header.compression_control = BRW_COMPRESSION_NONE;

   /* Pre-gen6, the message register number lives in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2301
2302 /* All these variables are pretty confusing - we might be better off
2303 * using bitmasks and macros for this, in the old style. Or perhaps
2304 * just having the caller instantiate the fields in dword3 itself.
2305 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      /* OR 0xff00 into header dword 5 (m0.5), copied from g0.5. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
	        brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number lives in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2352
2353 static int
2354 brw_find_next_block_end(struct brw_compile *p, int start_offset)
2355 {
2356 int offset;
2357 void *store = p->store;
2358
2359 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2360 offset = next_offset(store, offset)) {
2361 struct brw_instruction *insn = store + offset;
2362
2363 switch (insn->header.opcode) {
2364 case BRW_OPCODE_ENDIF:
2365 case BRW_OPCODE_ELSE:
2366 case BRW_OPCODE_WHILE:
2367 case BRW_OPCODE_HALT:
2368 return offset;
2369 }
2370 }
2371
2372 return 0;
2373 }
2374
2375 /* There is no DO instruction on gen6, so to find the end of the loop
2376 * we have to see if the loop is jumping back before our start
2377 * instruction.
2378 */
static int
brw_find_loop_end(struct brw_compile *p, int start_offset)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;   /* branch offsets are stored in units of 8 bytes */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         /* Gen6 stores the backward jump in jump_count; gen7+ in JIP. */
	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
				 : insn->bits3.break_cont.jip;
         /* A WHILE whose backward branch lands at or before start_offset
          * closes the loop containing start_offset.
          */
	 if (offset + jip * scale <= start_offset)
	    return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2404
2405 /* After program generation, go back and update the UIP and JIP of
2406 * BREAK, CONT, and HALT instructions to their correct locations.
2407 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;   /* JIP/UIP are stored in units of 8 bytes */
   void *store = p->store;

   /* Pre-gen6 flow control uses jump counts patched elsewhere; nothing to
    * fix up here.
    */
   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* With no later block end, jump to the next instruction (2 units
          * of 8 bytes = one full instruction).
          */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2481
2482 void brw_ff_sync(struct brw_compile *p,
2483 struct brw_reg dest,
2484 unsigned msg_reg_nr,
2485 struct brw_reg src0,
2486 bool allocate,
2487 unsigned response_length,
2488 bool eot)
2489 {
2490 struct brw_context *brw = p->brw;
2491 struct brw_instruction *insn;
2492
2493 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2494
2495 insn = next_insn(p, BRW_OPCODE_SEND);
2496 brw_set_dest(p, insn, dest);
2497 brw_set_src0(p, insn, src0);
2498 brw_set_src1(p, insn, brw_imm_d(0));
2499
2500 if (brw->gen < 6)
2501 insn->header.destreg__conditionalmod = msg_reg_nr;
2502
2503 brw_set_ff_sync_message(p,
2504 insn,
2505 allocate,
2506 response_length,
2507 eot);
2508 }
2509
2510 /**
2511 * Emit the SEND instruction necessary to generate stream output data on Gen6
2512 * (for transform feedback).
2513 *
2514 * If send_commit_msg is true, this is the last piece of stream output data
2515 * from this thread, so send the data as a committed write. According to the
2516 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2517 *
2518 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2519 * writes are complete by sending the final write as a committed write."
2520 */
2521 void
2522 brw_svb_write(struct brw_compile *p,
2523 struct brw_reg dest,
2524 unsigned msg_reg_nr,
2525 struct brw_reg src0,
2526 unsigned binding_table_index,
2527 bool send_commit_msg)
2528 {
2529 struct brw_instruction *insn;
2530
2531 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2532
2533 insn = next_insn(p, BRW_OPCODE_SEND);
2534 brw_set_dest(p, insn, dest);
2535 brw_set_src0(p, insn, src0);
2536 brw_set_src1(p, insn, brw_imm_d(0));
2537 brw_set_dp_write_message(p, insn,
2538 binding_table_index,
2539 0, /* msg_control: ignored */
2540 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2541 1, /* msg_length */
2542 true, /* header_present */
2543 0, /* last_render_target: ignored */
2544 send_commit_msg, /* response_length */
2545 0, /* end_of_thread */
2546 send_commit_msg); /* send_commit_msg */
2547 }
2548
/**
 * Fill in the message descriptor for an untyped atomic operation.
 *
 * Haswell routes these through data cache data port 1 and distinguishes
 * Align1 (SIMD8/16) from Align16 (SIMD4x2) message types; Ivybridge/Baytrail
 * use the legacy data cache SFID with a single message type.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (insn->header.execution_size != BRW_EXECUTE_16)
            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   /* Atomic opcode goes in descriptor bits 11:8. */
   insn->bits3.ud |= atomic_op << 8;
}
2591
2592 void
2593 brw_untyped_atomic(struct brw_compile *p,
2594 struct brw_reg dest,
2595 struct brw_reg mrf,
2596 unsigned atomic_op,
2597 unsigned bind_table_index,
2598 unsigned msg_length,
2599 unsigned response_length) {
2600 struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2601
2602 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2603 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2604 brw_set_src1(p, insn, brw_imm_d(0));
2605 brw_set_dp_untyped_atomic_message(
2606 p, insn, atomic_op, bind_table_index, msg_length, response_length,
2607 insn->header.access_mode == BRW_ALIGN_1);
2608 }
2609
/**
 * Fill in the message descriptor for an untyped surface read.
 *
 * num_channels is derived from the response length and dispatch width;
 * the descriptor carries a mask of which 32-bit channels to drop.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
                                        struct brw_instruction *insn,
                                        unsigned bind_table_index,
                                        unsigned msg_length,
                                        unsigned response_length,
                                        bool header_present)
{
   const unsigned dispatch_width =
      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
   /* Each enabled channel returns one GRF per 8 channels of dispatch. */
   const unsigned num_channels = response_length / (dispatch_width / 8);

   /* Haswell moved this message to data cache data port 1. */
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
   }

   if (insn->header.access_mode == BRW_ALIGN_1) {
      if (dispatch_width == 16)
         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
      else
         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
   }

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;

   /* Set mask of 32-bit channels to drop. */
   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
}
2648
2649 void
2650 brw_untyped_surface_read(struct brw_compile *p,
2651 struct brw_reg dest,
2652 struct brw_reg mrf,
2653 unsigned bind_table_index,
2654 unsigned msg_length,
2655 unsigned response_length)
2656 {
2657 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2658
2659 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2660 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2661 brw_set_dp_untyped_surface_read_message(
2662 p, insn, bind_table_index, msg_length, response_length,
2663 insn->header.access_mode == BRW_ALIGN_1);
2664 }
2665
2666 /**
2667 * This instruction is generated as a single-channel align1 instruction by
2668 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2669 *
2670 * We can't use the typed atomic op in the FS because that has the execution
2671 * mask ANDed with the pixel mask, but we just want to write the one dword for
2672 * all the pixels.
2673 *
2674 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2675 * one u32. So we use the same untyped atomic write message as the pixel
2676 * shader.
2677 *
2678 * The untyped atomic operation requires a BUFFER surface type with RAW
2679 * format, and is only accessible through the legacy DATA_CACHE dataport
2680 * messages.
2681 */
void brw_shader_time_add(struct brw_compile *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   /* brw_set_dp_untyped_atomic_message() only encodes gen7+ descriptors. */
   assert(p->brw->gen >= 7);

   /* Emit the SEND under Align1/unmasked state, then restore the caller's
    * default instruction state.
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
                                     2 /* message length */,
                                     0 /* response length */,
                                     false /* header present */);
}