2fa65e98f9f153d7cffdc7db63756ea75838673d
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 unsigned msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102 /**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107 unsigned
108 brw_reg_type_to_hw_type(const struct brw_context *brw,
109 enum brw_reg_type type, unsigned file)
110 {
111 if (file == BRW_IMMEDIATE_VALUE) {
112 const static int imm_hw_types[] = {
113 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
115 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
117 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
118 [BRW_REGISTER_TYPE_UB] = -1,
119 [BRW_REGISTER_TYPE_B] = -1,
120 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
123 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
127 };
128 assert(type < ARRAY_SIZE(imm_hw_types));
129 assert(imm_hw_types[type] != -1);
130 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131 return imm_hw_types[type];
132 } else {
133 /* Non-immediate registers */
134 const static int hw_types[] = {
135 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
137 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
139 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
141 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
142 [BRW_REGISTER_TYPE_UV] = -1,
143 [BRW_REGISTER_TYPE_VF] = -1,
144 [BRW_REGISTER_TYPE_V] = -1,
145 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
149 };
150 assert(type < ARRAY_SIZE(hw_types));
151 assert(hw_types[type] != -1);
152 assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154 return hw_types[type];
155 }
156 }
157
/**
 * Encode \p dest as the destination operand of \p insn.
 *
 * Handles both direct and register-indirect addressing and both Align1 and
 * Align16 access modes (their bit layouts differ), then derives the
 * instruction's execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* Gen7 has no real MRFs; rewrite to the high-GRF alias. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Silently promote an illegal zero destination stride to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subregister number is encoded in 16-byte units. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
	     dest.file == BRW_MESSAGE_REGISTER_FILE) {
	    assert(dest.dw1.bits.writemask != 0);
	 }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  * Although Dst.HorzStride is a don't care for Align16, HW needs
	  * this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      /* Register-indirect destination (offset relative to address reg). */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same zero-stride promotion as the direct-addressing case. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
219
220 extern int reg_type_size[];
221
222 static void
223 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
224 {
225 int hstride_for_reg[] = {0, 1, 2, 4};
226 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
227 int width_for_reg[] = {1, 2, 4, 8, 16};
228 int execsize_for_reg[] = {1, 2, 4, 8, 16};
229 int width, hstride, vstride, execsize;
230
231 if (reg.file == BRW_IMMEDIATE_VALUE) {
232 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
233 * mean the destination has to be 128-bit aligned and the
234 * destination horiz stride has to be a word.
235 */
236 if (reg.type == BRW_REGISTER_TYPE_V) {
237 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
238 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
239 }
240
241 return;
242 }
243
244 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245 reg.file == BRW_ARF_NULL)
246 return;
247
248 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
249 hstride = hstride_for_reg[reg.hstride];
250
251 if (reg.vstride == 0xf) {
252 vstride = -1;
253 } else {
254 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
255 vstride = vstride_for_reg[reg.vstride];
256 }
257
258 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
259 width = width_for_reg[reg.width];
260
261 assert(insn->header.execution_size >= 0 &&
262 insn->header.execution_size < Elements(execsize_for_reg));
263 execsize = execsize_for_reg[insn->header.execution_size];
264
265 /* Restrictions from 3.3.10: Register Region Restrictions. */
266 /* 3. */
267 assert(execsize >= width);
268
269 /* 4. */
270 if (execsize == width && hstride != 0) {
271 assert(vstride == -1 || vstride == width * hstride);
272 }
273
274 /* 5. */
275 if (execsize == width && hstride == 0) {
276 /* no restriction on vstride. */
277 }
278
279 /* 6. */
280 if (width == 1) {
281 assert(hstride == 0);
282 }
283
284 /* 7. */
285 if (execsize == 1 && width == 1) {
286 assert(hstride == 0);
287 assert(vstride == 0);
288 }
289
290 /* 8. */
291 if (vstride == 0 && hstride == 0) {
292 assert(width == 1);
293 }
294
295 /* 10. Check destination issues. */
296 }
297
/* A compacted instruction stores only bits 11:0 of an immediate verbatim;
 * bits 31:12 are reconstructed by replicating a single bit.  The immediate
 * is therefore compactable exactly when its top 20 bits are all zeros or
 * all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm & 0xfffff000u;

   return high_bits == 0u || high_bits == 0xfffff000u;
}
307
308 void
309 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
310 struct brw_reg reg)
311 {
312 struct brw_context *brw = p->brw;
313
314 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
315 assert(reg.nr < 128);
316
317 gen7_convert_mrf_to_grf(p, &reg);
318
319 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
320 insn->header.opcode == BRW_OPCODE_SENDC)) {
321 /* Any source modifiers or regions will be ignored, since this just
322 * identifies the MRF/GRF to start reading the message contents from.
323 * Check for some likely failures.
324 */
325 assert(!reg.negate);
326 assert(!reg.abs);
327 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
328 }
329
330 validate_reg(insn, reg);
331
332 insn->bits1.da1.src0_reg_file = reg.file;
333 insn->bits1.da1.src0_reg_type =
334 brw_reg_type_to_hw_type(brw, reg.type, reg.file);
335 insn->bits2.da1.src0_abs = reg.abs;
336 insn->bits2.da1.src0_negate = reg.negate;
337 insn->bits2.da1.src0_address_mode = reg.address_mode;
338
339 if (reg.file == BRW_IMMEDIATE_VALUE) {
340 insn->bits3.ud = reg.dw1.ud;
341
342 /* The Bspec's section titled "Non-present Operands" claims that if src0
343 * is an immediate that src1's type must be the same as that of src0.
344 *
345 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
346 * that do not follow this rule. E.g., from the IVB/HSW table:
347 *
348 * DataTypeIndex 18-Bit Mapping Mapped Meaning
349 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
350 *
351 * And from the SNB table:
352 *
353 * DataTypeIndex 18-Bit Mapping Mapped Meaning
354 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
355 *
356 * Neither of these cause warnings from the simulator when used,
357 * compacted or otherwise. In fact, all compaction mappings that have an
358 * immediate in src0 use a:ud for src1.
359 *
360 * The GM45 instruction compaction tables do not contain mapped meanings
361 * so it's not clear whether it has the restriction. We'll assume it was
362 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
363 */
364 insn->bits1.da1.src1_reg_file = 0; /* arf */
365 if (brw->gen < 6) {
366 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
367 } else {
368 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
369 }
370
371 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
372 * for immediate values. Presumably the hardware engineers realized
373 * that the only useful floating-point value that could be represented
374 * in this format is 0.0, which can also be represented as a VF-typed
375 * immediate, so they gave us the previously mentioned mapping on IVB+.
376 *
377 * Strangely, we do have a mapping for imm:f in src1, so we don't need
378 * to do this there.
379 *
380 * If we see a 0.0:F, change the type to VF so that it can be compacted.
381 */
382 if (insn->bits3.ud == 0x0 &&
383 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
384 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
385 }
386
387 /* There are no mappings for dst:d | i:d, so if the immediate is suitable
388 * set the types to :UD so the instruction can be compacted.
389 */
390 if (is_compactable_immediate(insn->bits3.ud) &&
391 insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE &&
392 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D &&
393 insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) {
394 insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD;
395 insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD;
396 }
397 }
398 else
399 {
400 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
401 if (insn->header.access_mode == BRW_ALIGN_1) {
402 insn->bits2.da1.src0_subreg_nr = reg.subnr;
403 insn->bits2.da1.src0_reg_nr = reg.nr;
404 }
405 else {
406 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
407 insn->bits2.da16.src0_reg_nr = reg.nr;
408 }
409 }
410 else {
411 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
412
413 if (insn->header.access_mode == BRW_ALIGN_1) {
414 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
415 }
416 else {
417 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
418 }
419 }
420
421 if (insn->header.access_mode == BRW_ALIGN_1) {
422 if (reg.width == BRW_WIDTH_1 &&
423 insn->header.execution_size == BRW_EXECUTE_1) {
424 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
425 insn->bits2.da1.src0_width = BRW_WIDTH_1;
426 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
427 }
428 else {
429 insn->bits2.da1.src0_horiz_stride = reg.hstride;
430 insn->bits2.da1.src0_width = reg.width;
431 insn->bits2.da1.src0_vert_stride = reg.vstride;
432 }
433 }
434 else {
435 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
436 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
437 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
438 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
439
440 /* This is an oddity of the fact we're using the same
441 * descriptions for registers in align_16 as align_1:
442 */
443 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
444 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
445 else
446 insn->bits2.da16.src0_vert_stride = reg.vstride;
447 }
448 }
449 }
450
451
452 void
453 brw_set_src1(struct brw_compile *p,
454 struct brw_instruction *insn,
455 struct brw_reg reg)
456 {
457 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
458
459 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
460 assert(reg.nr < 128);
461
462 gen7_convert_mrf_to_grf(p, &reg);
463
464 validate_reg(insn, reg);
465
466 insn->bits1.da1.src1_reg_file = reg.file;
467 insn->bits1.da1.src1_reg_type =
468 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
469 insn->bits3.da1.src1_abs = reg.abs;
470 insn->bits3.da1.src1_negate = reg.negate;
471
472 /* Only src1 can be immediate in two-argument instructions.
473 */
474 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
475
476 if (reg.file == BRW_IMMEDIATE_VALUE) {
477 insn->bits3.ud = reg.dw1.ud;
478 }
479 else {
480 /* This is a hardware restriction, which may or may not be lifted
481 * in the future:
482 */
483 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
484 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
485
486 if (insn->header.access_mode == BRW_ALIGN_1) {
487 insn->bits3.da1.src1_subreg_nr = reg.subnr;
488 insn->bits3.da1.src1_reg_nr = reg.nr;
489 }
490 else {
491 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
492 insn->bits3.da16.src1_reg_nr = reg.nr;
493 }
494
495 if (insn->header.access_mode == BRW_ALIGN_1) {
496 if (reg.width == BRW_WIDTH_1 &&
497 insn->header.execution_size == BRW_EXECUTE_1) {
498 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
499 insn->bits3.da1.src1_width = BRW_WIDTH_1;
500 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
501 }
502 else {
503 insn->bits3.da1.src1_horiz_stride = reg.hstride;
504 insn->bits3.da1.src1_width = reg.width;
505 insn->bits3.da1.src1_vert_stride = reg.vstride;
506 }
507 }
508 else {
509 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
510 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
511 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
512 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
513
514 /* This is an oddity of the fact we're using the same
515 * descriptions for registers in align_16 as align_1:
516 */
517 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
518 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
519 else
520 insn->bits3.da16.src1_vert_stride = reg.vstride;
521 }
522 }
523 }
524
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Clear the descriptor dword (src1) before setting individual fields. */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4: everything lives in the message descriptor itself. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
567
/**
 * Fill in the message descriptor for a message to the (pre-Gen6) MATH
 * shared function.  Message and response lengths are inferred from which
 * math function is requested.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  unsigned function,
				  unsigned integer_type,
				  bool low_precision,
				  unsigned dataType )
{
   struct brw_context *brw = p->brw;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      /* Two-operand functions send two registers of payload. */
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      /* Functions with two results return two registers. */
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   if (brw->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      /* Saturation is carried in the message, not the instruction. */
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   /* Clear the instruction-level saturate bit now that it has been moved
    * into the message descriptor.
    */
   insn->header.saturate = 0;
}
622
623
/**
 * Fill in the URB message descriptor for an FF_SYNC message (Gen5 URB
 * opcode 1).  Most URB fields are unused by FF_SYNC and set to zero.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   /* FF_SYNC always sends a single (header) register. */
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
639
/**
 * Fill in the message descriptor for a URB write, dispatching to the
 * Gen7, Gen5/6, and Gen4 descriptor layouts.  \p flags selects EOT,
 * allocate/used/complete bits and (Gen7) OWORD vs HWORD writes.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
	 assert(msg_length == 2); /* header + one OWORD of data */
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      /* "used" is the inverse of the UNUSED flag. */
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
682
/**
 * Fill in the message descriptor for a data-port write, choosing the
 * shared function (SFID) appropriate for the hardware generation and then
 * the generation-specific descriptor layout.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 unsigned last_render_target,
			 unsigned response_length,
			 unsigned end_of_thread,
			 unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Generation-specific descriptor field layouts below. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
740
/**
 * Fill in the message descriptor for a data-port read, choosing the SFID
 * per generation (on Gen6 based on \p target_cache) and then the
 * generation-specific descriptor layout.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
			bool header_present,
			unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   /* Reads never terminate the thread (end_of_thread = false). */
   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Generation-specific descriptor field layouts below. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
797
/**
 * Fill in the message descriptor for a sampler message, using the
 * generation-specific descriptor layout (return_format only exists on the
 * original Gen4 layout; simd_mode only on Gen5+).
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   /* Sampler messages never terminate the thread (end_of_thread = false). */
   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
836
837
838 #define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in the program store, growing the
 * store if needed, and initialize it from the current default instruction
 * state with the given opcode.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, unsigned opcode)
{
   struct brw_instruction *insn;

   /* Double the store when full.
    * NOTE(review): the reralloc result is unchecked; presumably OOM is
    * treated as fatal by the ralloc context owner -- confirm.
    */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   /* Seed the new instruction from the current default state. */
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
865
866 static struct brw_instruction *brw_alu1( struct brw_compile *p,
867 unsigned opcode,
868 struct brw_reg dest,
869 struct brw_reg src )
870 {
871 struct brw_instruction *insn = next_insn(p, opcode);
872 brw_set_dest(p, insn, dest);
873 brw_set_src0(p, insn, src);
874 return insn;
875 }
876
877 static struct brw_instruction *brw_alu2(struct brw_compile *p,
878 unsigned opcode,
879 struct brw_reg dest,
880 struct brw_reg src0,
881 struct brw_reg src1 )
882 {
883 struct brw_instruction *insn = next_insn(p, opcode);
884 brw_set_dest(p, insn, dest);
885 brw_set_src0(p, insn, src0);
886 brw_set_src1(p, insn, src1);
887 return insn;
888 }
889
890 static int
891 get_3src_subreg_nr(struct brw_reg reg)
892 {
893 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
894 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
895 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
896 } else {
897 return reg.subnr / 4;
898 }
899 }
900
/* Emit a three-source instruction (MAD, LRP, BFE, BFI2).  3-src
 * instructions exist only in align16 mode and use a more compact operand
 * encoding than 1/2-src: GRF-only direct sources, dword-granular
 * subregister numbers, and a single type field shared by all sources.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Destination subregister is encoded in 16-byte units for 3-src. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   /* src1's subregister number straddles the bits2/bits3 dword boundary. */
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
984
985
986 /***********************************************************************
987 * Convenience routines.
988 */
/* ALU1(OP) expands to the public brw_<OP>() emitter for a one-source
 * instruction, forwarding to brw_alu1().
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* ALU2(OP): public emitter for a two-source instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* ALU3(OP): public emitter for a three-source instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* ALU3F(OP): like ALU3, but asserts that all operands are float --
 * required by MAD/LRP, whose 3-src encoding shares one type field.
 */
#define ALU3F(OP)				\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   assert(dest.type == BRW_REGISTER_TYPE_F);		\
   assert(src0.type == BRW_REGISTER_TYPE_F);		\
   assert(src1.type == BRW_REGISTER_TYPE_F);		\
   assert(src2.type == BRW_REGISTER_TYPE_F);		\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
1029
1030 /* Rounding operations (other than RNDD) require two instructions - the first
1031 * stores a rounded value (possibly the wrong way) in the dest register, but
1032 * also sets a per-channel "increment bit" in the flag register. A predicated
1033 * add of 1.0 fixes dest to contain the desired result.
1034 *
1035 * Sandybridge and later appear to round correctly without an ADD.
1036 */
/* ROUND(OP): emitter for RNDZ/RNDE; see the rounding note above for why a
 * predicated ADD of 1.0 is appended on pre-Sandybridge hardware.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
1054
1055
/* Instantiate the public emitters for each opcode. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* Two-instruction rounding sequences (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
1092
1093
/* Emit ADD, enforcing the ISA restriction that float and dword-integer
 * operands may not be mixed (see the PRM section cited below).
 */
struct brw_instruction *brw_ADD(struct brw_compile *p,
				struct brw_reg dest,
				struct brw_reg src0,
				struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      /* A float source may not be paired with a D/UD source. */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}
1116
1117 struct brw_instruction *brw_AVG(struct brw_compile *p,
1118 struct brw_reg dest,
1119 struct brw_reg src0,
1120 struct brw_reg src1)
1121 {
1122 assert(dest.type == src0.type);
1123 assert(src0.type == src1.type);
1124 switch (src0.type) {
1125 case BRW_REGISTER_TYPE_B:
1126 case BRW_REGISTER_TYPE_UB:
1127 case BRW_REGISTER_TYPE_W:
1128 case BRW_REGISTER_TYPE_UW:
1129 case BRW_REGISTER_TYPE_D:
1130 case BRW_REGISTER_TYPE_UD:
1131 break;
1132 default:
1133 assert(!"Bad type for brw_AVG");
1134 }
1135
1136 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1137 }
1138
/* Emit MUL, enforcing the ISA's operand-type restrictions and the rule
 * that MUL sources may not read the accumulator.
 */
struct brw_instruction *brw_MUL(struct brw_compile *p,
				struct brw_reg dest,
				struct brw_reg src0,
				struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      /* dword-integer sources may not produce a float result */
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      /* float and D/UD sources may not be mixed */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL may not source the accumulator register. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1173
1174
1175 void brw_NOP(struct brw_compile *p)
1176 {
1177 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1178 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1179 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1180 brw_set_src1(p, insn, brw_imm_ud(0x0));
1181 }
1182
1183
1184
1185
1186
1187 /***********************************************************************
1188 * Comparisons, if/else/endif
1189 */
1190
/* Emit JMPI (jump indexed).  Forces SIMD1 execution with masking
 * disabled, and resets the default predication state so subsequent
 * instructions are not accidentally predicated.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Clear default predication for the instructions that follow. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1206
/* Push an IF/ELSE instruction onto the if-stack.  Entries are stored as
 * indices into p->store because the store may be reallocated while they
 * are on the stack.  The capacity check after the write maintains the
 * invariant if_stack_array_size > if_stack_depth, so the next push's
 * write is always in bounds.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1219
/* Pop the most recent IF/ELSE entry and return a pointer into the
 * (current) instruction store.
 */
static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}
1226
1227 static void
1228 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1229 {
1230 if (p->loop_stack_array_size < p->loop_stack_depth) {
1231 p->loop_stack_array_size *= 2;
1232 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1233 p->loop_stack_array_size);
1234 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1235 p->loop_stack_array_size);
1236 }
1237
1238 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1239 p->loop_stack_depth++;
1240 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1241 }
1242
/* Return a pointer to the DO instruction of the innermost active loop. */
static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}
1248
1249 /* EU takes the value from the flag register and pushes it onto some
1250 * sort of a stack (presumably merging with any flag value already on
1251 * the stack). Within an if block, the flags at the top of the stack
1252 * control execution on each channel of the unit, eg. on each of the
1253 * 16 pixel values in our wm programs.
1254 *
1255 * When the matching 'else' instruction is reached (presumably by
1256 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1258 *
1259 * When the matching 'endif' instruction is reached, the flags are
1260 * popped off. If the stack is now empty, normal execution resumes.
1261 */
/* Emit an IF instruction with per-generation operand encoding; jump
 * targets are left zero and patched later by patch_IF_ELSE().
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6: IF operates on the IP register; jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: jump count lives in the destination immediate field. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: JIP/UIP fields, patched later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Clear default predication for subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1302
1303 /* This function is only used for gen6-style IF instructions with an
1304 * embedded comparison (conditional modifier). It is not used on gen7.
1305 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Gen6 encodes the jump count in the destination immediate field;
    * it is patched later by patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The embedded comparison reuses the destreg/conditional-mod field. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1334
1335 /**
1336 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1337 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP offsets are in bytes; each instruction is 16 bytes. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1374
1375 /**
1376 * Patch IF and ELSE instructions with appropriate jump targets.
1377 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7+: both jump targets point to the ENDIF. */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1464
/* Emit an ELSE instruction; operand encoding varies by generation and
 * jump targets are patched later by patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Pushed so brw_ENDIF can find it and patch its jump target. */
   push_if_stack(p, insn);
}
1497
/* Close the innermost IF/ELSE construct: pop the IF (and optional ELSE)
 * off the if-stack, emit an ENDIF (unless SPF on gen4/5 converts the
 * construct to ADDs instead), and patch all jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory (p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation ENDIF operand encoding. */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1576
/* Emit a BREAK.  Pre-gen6 the instruction carries a pop count for the
 * enclosing IF nesting; jump targets are patched by brw_patch_break_cont
 * (pre-gen6) or brw_set_uip_jip (gen6+).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop the mask stack entries of all IFs inside the current loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1599
1600 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1601 {
1602 struct brw_instruction *insn;
1603
1604 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1605 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1606 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1607 brw_set_dest(p, insn, brw_ip_reg());
1608 brw_set_src0(p, insn, brw_ip_reg());
1609 brw_set_src1(p, insn, brw_imm_d(0x0));
1610
1611 insn->header.compression_control = BRW_COMPRESSION_NONE;
1612 insn->header.execution_size = BRW_EXECUTE_8;
1613 return insn;
1614 }
1615
/* Emit a pre-gen6 CONTINUE, which operates on IP and carries a pop count
 * for the IFs nested inside the current loop.  The jump count is patched
 * later by brw_patch_break_cont().
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   /* Pop the mask stack entries of all IFs inside the current loop. */
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1630
/* Emit a gen6+ HALT.  src1 holds UIP and JIP, which are filled in later
 * by the caller once the jump targets are known.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1648
1649 /* DO/WHILE loop:
1650 *
1651 * The DO/WHILE is just an unterminated loop -- break or continue are
1652 * used for control within the loop. We have a few ways they can be
1653 * done.
1654 *
1655 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1656 * jip and no DO instruction.
1657 *
1658 * For non-uniform control flow pre-gen6, there's a DO instruction to
1659 * push the mask, and a WHILE to jump back, and BREAK to get out and
1660 * pop the mask.
1661 *
1662 * For gen6, there's no more mask stack, so no need for DO. WHILE
1663 * just points back to the first instruction of the loop.
1664 */
/* Open a loop.  On gen6+ (or in SPF mode) no DO instruction exists; the
 * loop head is simply the next instruction slot, which is recorded on
 * the loop stack for WHILE/BREAK/CONT patching.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1692
1693 /**
1694 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1695 * instruction here.
1696 *
1697 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1698 * nesting, since it can always just point to the end of the block/current loop.
1699 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Gen5 jump counts are in 64-bit chunks (2 per 128-bit instruction). */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk the loop body backwards from the WHILE to the DO. */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 /* BREAK jumps just past the WHILE (out of the loop). */
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 /* CONTINUE jumps to the WHILE itself (next iteration test). */
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1722
/* Close the innermost loop with a WHILE (or an IP-adjusting ADD in SPF
 * mode on pre-gen6), patching any BREAK/CONT inside it on pre-gen6.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   /* Jump counts are in 64-bit chunks on gen5+ (2 per instruction). */
   unsigned br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* JIP is a backward jump to the loop head (negative offset). */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: a plain backward ADD on IP replaces the WHILE. */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Fill in the jump targets of BREAK/CONT inside this loop. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1786
1787 /* To integrate with the above, it makes sense that the comparison
1788 * instruction should populate the flag register. It might be simpler
1789 * just to use the flag reg for most WM tasks?
1790 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1819
1820 /* Issue 'wait' instruction for n1, host could program MMIO
1821 to wake up thread. */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   /* WAIT reads and writes the notification register n1. */
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1834
1835
1836 /***********************************************************************
1837 * Helpers for the various SEND message types:
1838 */
1839
1840 /** Extended math function, float[8].
1841 */
/* Emit an extended-math operation.  On gen6+ this is a native MATH
 * instruction; before that it is a SEND to the math shared function
 * (msg_reg_nr, data_type and precision are only used in that case).
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned data_type,
	       unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide variants take integer sources; everything else float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1904
/** Extended math function, float[8].
 *
 * Two-source variant of brw_math().  Emits the MATH opcode directly; no
 * pre-Gen6 SEND fallback is provided here (presumably Gen6+ only — the
 * gen checks below only distinguish 6 vs 7+; confirm at call sites).
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   /* Destination must be a GRF (or, on Gen7+, an MRF); sources are GRFs. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer divide variants take integer sources; every other math
    * function operates on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1954
1955
1956 /**
1957 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1958 * using a constant offset per channel.
1959 *
1960 * The offset must be aligned to oword size (16 bytes). Used for
1961 * register spilling.
1962 */
1963 void brw_oword_block_write_scratch(struct brw_compile *p,
1964 struct brw_reg mrf,
1965 int num_regs,
1966 unsigned offset)
1967 {
1968 struct brw_context *brw = p->brw;
1969 uint32_t msg_control, msg_type;
1970 int mlen;
1971
1972 if (brw->gen >= 6)
1973 offset /= 16;
1974
1975 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1976
1977 if (num_regs == 1) {
1978 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1979 mlen = 2;
1980 } else {
1981 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1982 mlen = 3;
1983 }
1984
1985 /* Set up the message header. This is g0, with g0.2 filled with
1986 * the offset. We don't want to leave our offset around in g0 or
1987 * it'll screw up texture samples, so set it up inside the message
1988 * reg.
1989 */
1990 {
1991 brw_push_insn_state(p);
1992 brw_set_mask_control(p, BRW_MASK_DISABLE);
1993 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1994
1995 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1996
1997 /* set message header global offset field (reg 0, element 2) */
1998 brw_MOV(p,
1999 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2000 mrf.nr,
2001 2), BRW_REGISTER_TYPE_UD),
2002 brw_imm_ud(offset));
2003
2004 brw_pop_insn_state(p);
2005 }
2006
2007 {
2008 struct brw_reg dest;
2009 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2010 int send_commit_msg;
2011 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2012 BRW_REGISTER_TYPE_UW);
2013
2014 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
2015 insn->header.compression_control = BRW_COMPRESSION_NONE;
2016 src_header = vec16(src_header);
2017 }
2018 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2019 insn->header.destreg__conditionalmod = mrf.nr;
2020
2021 /* Until gen6, writes followed by reads from the same location
2022 * are not guaranteed to be ordered unless write_commit is set.
2023 * If set, then a no-op write is issued to the destination
2024 * register to set a dependency, and a read from the destination
2025 * can be used to ensure the ordering.
2026 *
2027 * For gen6, only writes between different threads need ordering
2028 * protection. Our use of DP writes is all about register
2029 * spilling within a thread.
2030 */
2031 if (brw->gen >= 6) {
2032 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2033 send_commit_msg = 0;
2034 } else {
2035 dest = src_header;
2036 send_commit_msg = 1;
2037 }
2038
2039 brw_set_dest(p, insn, dest);
2040 if (brw->gen >= 6) {
2041 brw_set_src0(p, insn, mrf);
2042 } else {
2043 brw_set_src0(p, insn, brw_null_reg());
2044 }
2045
2046 if (brw->gen >= 6)
2047 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2048 else
2049 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2050
2051 brw_set_dp_write_message(p,
2052 insn,
2053 255, /* binding table index (255=stateless) */
2054 msg_control,
2055 msg_type,
2056 mlen,
2057 true, /* header_present */
2058 0, /* not a render target */
2059 send_commit_msg, /* response_length */
2060 0, /* eot */
2061 send_commit_msg);
2062 }
2063 }
2064
2065
2066 /**
2067 * Read a block of owords (half a GRF each) from the scratch buffer
2068 * using a constant index per channel.
2069 *
2070 * Offset must be aligned to oword size (16 bytes). Used for register
2071 * spilling.
2072 */
2073 void
2074 brw_oword_block_read_scratch(struct brw_compile *p,
2075 struct brw_reg dest,
2076 struct brw_reg mrf,
2077 int num_regs,
2078 unsigned offset)
2079 {
2080 struct brw_context *brw = p->brw;
2081 uint32_t msg_control;
2082 int rlen;
2083
2084 if (brw->gen >= 6)
2085 offset /= 16;
2086
2087 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2088 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2089
2090 if (num_regs == 1) {
2091 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2092 rlen = 1;
2093 } else {
2094 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2095 rlen = 2;
2096 }
2097
2098 {
2099 brw_push_insn_state(p);
2100 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2101 brw_set_mask_control(p, BRW_MASK_DISABLE);
2102
2103 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2104
2105 /* set message header global offset field (reg 0, element 2) */
2106 brw_MOV(p,
2107 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2108 mrf.nr,
2109 2), BRW_REGISTER_TYPE_UD),
2110 brw_imm_ud(offset));
2111
2112 brw_pop_insn_state(p);
2113 }
2114
2115 {
2116 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2117
2118 assert(insn->header.predicate_control == 0);
2119 insn->header.compression_control = BRW_COMPRESSION_NONE;
2120 insn->header.destreg__conditionalmod = mrf.nr;
2121
2122 brw_set_dest(p, insn, dest); /* UW? */
2123 if (brw->gen >= 6) {
2124 brw_set_src0(p, insn, mrf);
2125 } else {
2126 brw_set_src0(p, insn, brw_null_reg());
2127 }
2128
2129 brw_set_dp_read_message(p,
2130 insn,
2131 255, /* binding table index (255=stateless) */
2132 msg_control,
2133 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2134 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2135 1, /* msg_length */
2136 true, /* header_present */
2137 rlen);
2138 }
2139 }
2140
/**
 * Read \p num_regs registers (1, 2 or 4) from Gen7 scratch space into
 * \p dest using the data cache's dedicated scratch-read message.
 *
 * \p offset is in bytes and must be register (HWord) aligned; it is
 * converted to an HWord index before being packed into the descriptor.
 */
void
gen7_block_read_scratch(struct brw_compile *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   /* Mark this as a scratch read and encode the register count. */
   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}
2182
2183 /**
2184 * Read a float[4] vector from the data port Data Cache (const buffer).
2185 * Location (in buffer) should be a multiple of 16.
2186 * Used for fetching shader constants.
2187 */
2188 void brw_oword_block_read(struct brw_compile *p,
2189 struct brw_reg dest,
2190 struct brw_reg mrf,
2191 uint32_t offset,
2192 uint32_t bind_table_index)
2193 {
2194 struct brw_context *brw = p->brw;
2195
2196 /* On newer hardware, offset is in units of owords. */
2197 if (brw->gen >= 6)
2198 offset /= 16;
2199
2200 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2201
2202 brw_push_insn_state(p);
2203 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2204 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2205 brw_set_mask_control(p, BRW_MASK_DISABLE);
2206
2207 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2208
2209 /* set message header global offset field (reg 0, element 2) */
2210 brw_MOV(p,
2211 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2212 mrf.nr,
2213 2), BRW_REGISTER_TYPE_UD),
2214 brw_imm_ud(offset));
2215
2216 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2217 insn->header.destreg__conditionalmod = mrf.nr;
2218
2219 /* cast dest to a uword[8] vector */
2220 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2221
2222 brw_set_dest(p, insn, dest);
2223 if (brw->gen >= 6) {
2224 brw_set_src0(p, insn, mrf);
2225 } else {
2226 brw_set_src0(p, insn, brw_null_reg());
2227 }
2228
2229 brw_set_dp_read_message(p,
2230 insn,
2231 bind_table_index,
2232 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2233 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2234 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2235 1, /* msg_length */
2236 true, /* header_present */
2237 1); /* response_length (1 reg, 2 owords!) */
2238
2239 brw_pop_insn_state(p);
2240 }
2241
2242
/**
 * Render target (framebuffer) write.
 *
 * On Gen6+ this is emitted as SENDC with a headerless payload taken
 * directly from \p msg_reg_nr; on older hardware it is a plain SEND
 * with the message register number in the instruction header.
 * \p eot marks the last render target write of the thread.
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
		  unsigned msg_reg_nr,
		  struct brw_reg src0,
		  unsigned msg_control,
		  unsigned binding_table_index,
		  unsigned msg_length,
		  unsigned response_length,
		  bool eot,
		  bool header_present)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;
   unsigned msg_type;
   struct brw_reg dest;

   /* Null destination, sized to match the dispatch width. */
   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (brw->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (brw->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    eot, /* last render target write */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2296
2297
2298 /**
2299 * Texture sample instruction.
2300 * Note: the msg_type plus msg_length values determine exactly what kind
2301 * of sampling operation is performed. See volume 4, page 161 of docs.
2302 */
2303 void brw_SAMPLE(struct brw_compile *p,
2304 struct brw_reg dest,
2305 unsigned msg_reg_nr,
2306 struct brw_reg src0,
2307 unsigned binding_table_index,
2308 unsigned sampler,
2309 unsigned msg_type,
2310 unsigned response_length,
2311 unsigned msg_length,
2312 unsigned header_present,
2313 unsigned simd_mode,
2314 unsigned return_format)
2315 {
2316 struct brw_context *brw = p->brw;
2317 struct brw_instruction *insn;
2318
2319 if (msg_reg_nr != -1)
2320 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2321
2322 insn = next_insn(p, BRW_OPCODE_SEND);
2323 insn->header.predicate_control = 0; /* XXX */
2324
2325 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2326 *
2327 * "Instruction compression is not allowed for this instruction (that
2328 * is, send). The hardware behavior is undefined if this instruction is
2329 * set as compressed. However, compress control can be set to "SecHalf"
2330 * to affect the EMask generation."
2331 *
2332 * No similar wording is found in later PRMs, but there are examples
2333 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2334 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2335 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2336 */
2337 if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
2338 insn->header.compression_control = BRW_COMPRESSION_NONE;
2339
2340 if (brw->gen < 6)
2341 insn->header.destreg__conditionalmod = msg_reg_nr;
2342
2343 brw_set_dest(p, insn, dest);
2344 brw_set_src0(p, insn, src0);
2345 brw_set_sampler_message(p, insn,
2346 binding_table_index,
2347 sampler,
2348 msg_type,
2349 response_length,
2350 msg_length,
2351 header_present,
2352 simd_mode,
2353 return_format);
2354 }
2355
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into header element 5 of the message payload
       * (starting from a copy of g0.5), unmasked and in Align1.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 carries the message register number in the header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2406
2407 static int
2408 brw_find_next_block_end(struct brw_compile *p, int start_offset)
2409 {
2410 int offset;
2411 void *store = p->store;
2412
2413 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2414 offset = next_offset(store, offset)) {
2415 struct brw_instruction *insn = store + offset;
2416
2417 switch (insn->header.opcode) {
2418 case BRW_OPCODE_ENDIF:
2419 case BRW_OPCODE_ELSE:
2420 case BRW_OPCODE_WHILE:
2421 case BRW_OPCODE_HALT:
2422 return offset;
2423 }
2424 }
2425
2426 return 0;
2427 }
2428
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the offset of the matching WHILE; asserts (and returns
 * start_offset) if none is found.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start_offset)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;   /* branch offsets below are in units of 8 bytes */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 keeps the backward branch in jump_count; later gens use JIP. */
	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
				 : insn->bits3.break_cont.jip;
	 /* A WHILE whose target is at or before start_offset closes our loop. */
	 if (offset + jip * scale <= start_offset)
	    return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2458
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Only runs on Gen6+ (earlier gens patch jumps elsewhere); all offsets
 * written into the branch fields are in units of 8 bytes (scale).
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;
   void *store = p->store;

   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: end of the enclosing block; UIP: the loop's WHILE. */
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* With no following block end, jump to the next instruction. */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2535
2536 void brw_ff_sync(struct brw_compile *p,
2537 struct brw_reg dest,
2538 unsigned msg_reg_nr,
2539 struct brw_reg src0,
2540 bool allocate,
2541 unsigned response_length,
2542 bool eot)
2543 {
2544 struct brw_context *brw = p->brw;
2545 struct brw_instruction *insn;
2546
2547 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2548
2549 insn = next_insn(p, BRW_OPCODE_SEND);
2550 brw_set_dest(p, insn, dest);
2551 brw_set_src0(p, insn, src0);
2552 brw_set_src1(p, insn, brw_imm_d(0));
2553
2554 if (brw->gen < 6)
2555 insn->header.destreg__conditionalmod = msg_reg_nr;
2556
2557 brw_set_ff_sync_message(p,
2558 insn,
2559 allocate,
2560 response_length,
2561 eot);
2562 }
2563
2564 /**
2565 * Emit the SEND instruction necessary to generate stream output data on Gen6
2566 * (for transform feedback).
2567 *
2568 * If send_commit_msg is true, this is the last piece of stream output data
2569 * from this thread, so send the data as a committed write. According to the
2570 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2571 *
2572 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2573 * writes are complete by sending the final write as a committed write."
2574 */
2575 void
2576 brw_svb_write(struct brw_compile *p,
2577 struct brw_reg dest,
2578 unsigned msg_reg_nr,
2579 struct brw_reg src0,
2580 unsigned binding_table_index,
2581 bool send_commit_msg)
2582 {
2583 struct brw_instruction *insn;
2584
2585 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2586
2587 insn = next_insn(p, BRW_OPCODE_SEND);
2588 brw_set_dest(p, insn, dest);
2589 brw_set_src0(p, insn, src0);
2590 brw_set_src1(p, insn, brw_imm_d(0));
2591 brw_set_dp_write_message(p, insn,
2592 binding_table_index,
2593 0, /* msg_control: ignored */
2594 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2595 1, /* msg_length */
2596 true, /* header_present */
2597 0, /* last_render_target: ignored */
2598 send_commit_msg, /* response_length */
2599 0, /* end_of_thread */
2600 send_commit_msg); /* send_commit_msg */
2601 }
2602
/* Fill out the SEND message descriptor for an untyped atomic operation.
 *
 * On Haswell these messages go through data cache data port 1, with
 * separate Align1 (SIMD8/SIMD16) and Align16 (SIMD4x2) message types;
 * other Gen7 parts use the legacy data cache SFID.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (insn->header.execution_size != BRW_EXECUTE_16)
            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }

   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   /* Surface and operation select, common to all variants. */
   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   insn->bits3.ud |= atomic_op << 8;
}
2646
2647 void
2648 brw_untyped_atomic(struct brw_compile *p,
2649 struct brw_reg dest,
2650 struct brw_reg mrf,
2651 unsigned atomic_op,
2652 unsigned bind_table_index,
2653 unsigned msg_length,
2654 unsigned response_length) {
2655 struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2656
2657 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2658 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2659 brw_set_src1(p, insn, brw_imm_d(0));
2660 brw_set_dp_untyped_atomic_message(
2661 p, insn, atomic_op, bind_table_index, msg_length, response_length,
2662 insn->header.access_mode == BRW_ALIGN_1);
2663 }
2664
/* Fill out the SEND message descriptor for an untyped surface read.
 *
 * num_channels is derived from response_length and the dispatch width;
 * the descriptor's channel mask flags the remaining 32-bit channels as
 * dropped.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
                                        struct brw_instruction *insn,
                                        unsigned bind_table_index,
                                        unsigned msg_length,
                                        unsigned response_length,
                                        bool header_present)
{
   const unsigned dispatch_width =
      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
   const unsigned num_channels = response_length / (dispatch_width / 8);

   /* Haswell routes these through data cache data port 1. */
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
   }

   /* Only Align1 encodes an explicit SIMD mode. */
   if (insn->header.access_mode == BRW_ALIGN_1) {
      if (dispatch_width == 16)
         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
      else
         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
   }

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;

   /* Set mask of 32-bit channels to drop. */
   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
}
2703
2704 void
2705 brw_untyped_surface_read(struct brw_compile *p,
2706 struct brw_reg dest,
2707 struct brw_reg mrf,
2708 unsigned bind_table_index,
2709 unsigned msg_length,
2710 unsigned response_length)
2711 {
2712 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2713
2714 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2715 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2716 brw_set_dp_untyped_surface_read_message(
2717 p, insn, bind_table_index, msg_length, response_length,
2718 insn->header.access_mode == BRW_ALIGN_1);
2719 }
2720
2721 /**
2722 * This instruction is generated as a single-channel align1 instruction by
2723 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2724 *
2725 * We can't use the typed atomic op in the FS because that has the execution
2726 * mask ANDed with the pixel mask, but we just want to write the one dword for
2727 * all the pixels.
2728 *
2729 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2730 * one u32. So we use the same untyped atomic write message as the pixel
2731 * shader.
2732 *
2733 * The untyped atomic operation requires a BUFFER surface type with RAW
2734 * format, and is only accessible through the legacy DATA_CACHE dataport
2735 * messages.
2736 */
2737 void brw_shader_time_add(struct brw_compile *p,
2738 struct brw_reg payload,
2739 uint32_t surf_index)
2740 {
2741 struct brw_context *brw = p->brw;
2742 assert(brw->gen >= 7);
2743
2744 brw_push_insn_state(p);
2745 brw_set_access_mode(p, BRW_ALIGN_1);
2746 brw_set_mask_control(p, BRW_MASK_DISABLE);
2747 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2748 brw_pop_insn_state(p);
2749
2750 /* We use brw_vec1_reg and unmasked because we want to increment the given
2751 * offset only once.
2752 */
2753 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2754 BRW_ARF_NULL, 0));
2755 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2756 payload.nr, 0));
2757 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2758 2 /* message length */,
2759 0 /* response length */,
2760 false /* header present */);
2761 }