7448512a7b57eb41cc1f52293ff9e751e7d37f92
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 unsigned msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate:
 * the packed-vector types (UV/VF/V) exist only as immediates, while the
 * byte types (UB/B) exist only as register regions.  The -1 entries mark
 * combinations with no hardware encoding; they are caught by assert.
 */
unsigned
brw_reg_type_to_hw_type(const struct brw_context *brw,
                        enum brw_reg_type type, unsigned file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      const static int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,   /* no byte immediates */
         [BRW_REGISTER_TYPE_B] = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF/HF/Q/UQ immediate encodings are Gen8+ only. */
      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      const static int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,   /* packed vectors are imm-only */
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V] = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF regions require Gen7+; HF regions require Gen8+. */
      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
157
/**
 * Encode \p dest as the destination operand of \p insn, and derive the
 * instruction's execution size from the destination width.
 *
 * Handles both direct and register-indirect addressing, in both Align1
 * and Align16 access modes, which use different bitfield layouts
 * (da1/da16/ia1/ia16).
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   /* ARF numbers encode register subtypes and MRFs are remapped below,
    * so only check the GRF-range limit for the other files.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         /* A zero horizontal stride makes no sense for a destination;
          * encode it as 1.
          */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         /* Align16 subregister numbers are in 16-byte units. */
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
219
220 extern int reg_type_size[];
221
222 static void
223 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
224 {
225 int hstride_for_reg[] = {0, 1, 2, 4};
226 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
227 int width_for_reg[] = {1, 2, 4, 8, 16};
228 int execsize_for_reg[] = {1, 2, 4, 8, 16};
229 int width, hstride, vstride, execsize;
230
231 if (reg.file == BRW_IMMEDIATE_VALUE) {
232 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
233 * mean the destination has to be 128-bit aligned and the
234 * destination horiz stride has to be a word.
235 */
236 if (reg.type == BRW_REGISTER_TYPE_V) {
237 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
238 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
239 }
240
241 return;
242 }
243
244 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245 reg.file == BRW_ARF_NULL)
246 return;
247
248 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
249 hstride = hstride_for_reg[reg.hstride];
250
251 if (reg.vstride == 0xf) {
252 vstride = -1;
253 } else {
254 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
255 vstride = vstride_for_reg[reg.vstride];
256 }
257
258 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
259 width = width_for_reg[reg.width];
260
261 assert(insn->header.execution_size >= 0 &&
262 insn->header.execution_size < Elements(execsize_for_reg));
263 execsize = execsize_for_reg[insn->header.execution_size];
264
265 /* Restrictions from 3.3.10: Register Region Restrictions. */
266 /* 3. */
267 assert(execsize >= width);
268
269 /* 4. */
270 if (execsize == width && hstride != 0) {
271 assert(vstride == -1 || vstride == width * hstride);
272 }
273
274 /* 5. */
275 if (execsize == width && hstride == 0) {
276 /* no restriction on vstride. */
277 }
278
279 /* 6. */
280 if (width == 1) {
281 assert(hstride == 0);
282 }
283
284 /* 7. */
285 if (execsize == 1 && width == 1) {
286 assert(hstride == 0);
287 assert(vstride == 0);
288 }
289
290 /* 8. */
291 if (vstride == 0 && hstride == 0) {
292 assert(width == 1);
293 }
294
295 /* 10. Check destination issues. */
296 }
297
/**
 * Return true if \p imm can survive instruction compaction.
 *
 * Compacted instructions keep the low 12 bits of an immediate verbatim
 * and replicate bit 12 through the top 20 bits, so a value is
 * representable iff those top 20 bits are all zeros or all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned upper = imm & ~0xfffu;

   return upper == 0 || upper == 0xfffff000;
}
307
308 void
309 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
310 struct brw_reg reg)
311 {
312 struct brw_context *brw = p->brw;
313
314 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
315 assert(reg.nr < 128);
316
317 gen7_convert_mrf_to_grf(p, &reg);
318
319 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
320 insn->header.opcode == BRW_OPCODE_SENDC)) {
321 /* Any source modifiers or regions will be ignored, since this just
322 * identifies the MRF/GRF to start reading the message contents from.
323 * Check for some likely failures.
324 */
325 assert(!reg.negate);
326 assert(!reg.abs);
327 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
328 }
329
330 validate_reg(insn, reg);
331
332 insn->bits1.da1.src0_reg_file = reg.file;
333 insn->bits1.da1.src0_reg_type =
334 brw_reg_type_to_hw_type(brw, reg.type, reg.file);
335 insn->bits2.da1.src0_abs = reg.abs;
336 insn->bits2.da1.src0_negate = reg.negate;
337 insn->bits2.da1.src0_address_mode = reg.address_mode;
338
339 if (reg.file == BRW_IMMEDIATE_VALUE) {
340 insn->bits3.ud = reg.dw1.ud;
341
342 /* The Bspec's section titled "Non-present Operands" claims that if src0
343 * is an immediate that src1's type must be the same as that of src0.
344 *
345 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
346 * that do not follow this rule. E.g., from the IVB/HSW table:
347 *
348 * DataTypeIndex 18-Bit Mapping Mapped Meaning
349 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
350 *
351 * And from the SNB table:
352 *
353 * DataTypeIndex 18-Bit Mapping Mapped Meaning
354 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
355 *
356 * Neither of these cause warnings from the simulator when used,
357 * compacted or otherwise. In fact, all compaction mappings that have an
358 * immediate in src0 use a:ud for src1.
359 *
360 * The GM45 instruction compaction tables do not contain mapped meanings
361 * so it's not clear whether it has the restriction. We'll assume it was
362 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
363 */
364 insn->bits1.da1.src1_reg_file = 0; /* arf */
365 if (brw->gen < 6) {
366 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
367 } else {
368 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
369 }
370
371 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
372 * for immediate values. Presumably the hardware engineers realized
373 * that the only useful floating-point value that could be represented
374 * in this format is 0.0, which can also be represented as a VF-typed
375 * immediate, so they gave us the previously mentioned mapping on IVB+.
376 *
377 * Strangely, we do have a mapping for imm:f in src1, so we don't need
378 * to do this there.
379 *
380 * If we see a 0.0:F, change the type to VF so that it can be compacted.
381 */
382 if (insn->bits3.ud == 0x0 &&
383 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
384 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
385 }
386
387 /* There are no mappings for dst:d | i:d, so if the immediate is suitable
388 * set the types to :UD so the instruction can be compacted.
389 */
390 if (is_compactable_immediate(insn->bits3.ud) &&
391 insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE &&
392 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D &&
393 insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) {
394 insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD;
395 insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD;
396 }
397 }
398 else
399 {
400 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
401 if (insn->header.access_mode == BRW_ALIGN_1) {
402 insn->bits2.da1.src0_subreg_nr = reg.subnr;
403 insn->bits2.da1.src0_reg_nr = reg.nr;
404 }
405 else {
406 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
407 insn->bits2.da16.src0_reg_nr = reg.nr;
408 }
409 }
410 else {
411 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
412
413 if (insn->header.access_mode == BRW_ALIGN_1) {
414 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
415 }
416 else {
417 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
418 }
419 }
420
421 if (insn->header.access_mode == BRW_ALIGN_1) {
422 if (reg.width == BRW_WIDTH_1 &&
423 insn->header.execution_size == BRW_EXECUTE_1) {
424 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
425 insn->bits2.da1.src0_width = BRW_WIDTH_1;
426 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
427 }
428 else {
429 insn->bits2.da1.src0_horiz_stride = reg.hstride;
430 insn->bits2.da1.src0_width = reg.width;
431 insn->bits2.da1.src0_vert_stride = reg.vstride;
432 }
433 }
434 else {
435 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
436 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
437 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
438 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
439
440 /* This is an oddity of the fact we're using the same
441 * descriptions for registers in align_16 as align_1:
442 */
443 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
444 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
445 else
446 insn->bits2.da16.src0_vert_stride = reg.vstride;
447 }
448 }
449 }
450
451
452 void
453 brw_set_src1(struct brw_compile *p,
454 struct brw_instruction *insn,
455 struct brw_reg reg)
456 {
457 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
458
459 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
460 assert(reg.nr < 128);
461
462 gen7_convert_mrf_to_grf(p, &reg);
463
464 validate_reg(insn, reg);
465
466 insn->bits1.da1.src1_reg_file = reg.file;
467 insn->bits1.da1.src1_reg_type =
468 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
469 insn->bits3.da1.src1_abs = reg.abs;
470 insn->bits3.da1.src1_negate = reg.negate;
471
472 /* Only src1 can be immediate in two-argument instructions.
473 */
474 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
475
476 if (reg.file == BRW_IMMEDIATE_VALUE) {
477 insn->bits3.ud = reg.dw1.ud;
478 }
479 else {
480 /* This is a hardware restriction, which may or may not be lifted
481 * in the future:
482 */
483 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
484 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
485
486 if (insn->header.access_mode == BRW_ALIGN_1) {
487 insn->bits3.da1.src1_subreg_nr = reg.subnr;
488 insn->bits3.da1.src1_reg_nr = reg.nr;
489 }
490 else {
491 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
492 insn->bits3.da16.src1_reg_nr = reg.nr;
493 }
494
495 if (insn->header.access_mode == BRW_ALIGN_1) {
496 if (reg.width == BRW_WIDTH_1 &&
497 insn->header.execution_size == BRW_EXECUTE_1) {
498 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
499 insn->bits3.da1.src1_width = BRW_WIDTH_1;
500 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
501 }
502 else {
503 insn->bits3.da1.src1_horiz_stride = reg.hstride;
504 insn->bits3.da1.src1_width = reg.width;
505 insn->bits3.da1.src1_vert_stride = reg.vstride;
506 }
507 }
508 else {
509 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
510 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
511 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
512 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
513
514 /* This is an oddity of the fact we're using the same
515 * descriptions for registers in align_16 as align_1:
516 */
517 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
518 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
519 else
520 insn->bits3.da16.src1_vert_stride = reg.vstride;
521 }
522 }
523 }
524
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Zero bits3 (the descriptor dword) via an immediate src1 of 0. */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4 descriptor layout: no header_present bit. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
567
568 static void brw_set_math_message( struct brw_compile *p,
569 struct brw_instruction *insn,
570 unsigned function,
571 unsigned integer_type,
572 bool low_precision,
573 unsigned dataType )
574 {
575 struct brw_context *brw = p->brw;
576 unsigned msg_length;
577 unsigned response_length;
578
579 /* Infer message length from the function */
580 switch (function) {
581 case BRW_MATH_FUNCTION_POW:
582 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
583 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
584 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
585 msg_length = 2;
586 break;
587 default:
588 msg_length = 1;
589 break;
590 }
591
592 /* Infer response length from the function */
593 switch (function) {
594 case BRW_MATH_FUNCTION_SINCOS:
595 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
596 response_length = 2;
597 break;
598 default:
599 response_length = 1;
600 break;
601 }
602
603
604 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
605 msg_length, response_length, false, false);
606 if (brw->gen == 5) {
607 insn->bits3.math_gen5.function = function;
608 insn->bits3.math_gen5.int_type = integer_type;
609 insn->bits3.math_gen5.precision = low_precision;
610 insn->bits3.math_gen5.saturate = insn->header.saturate;
611 insn->bits3.math_gen5.data_type = dataType;
612 insn->bits3.math_gen5.snapshot = 0;
613 } else {
614 insn->bits3.math.function = function;
615 insn->bits3.math.int_type = integer_type;
616 insn->bits3.math.precision = low_precision;
617 insn->bits3.math.saturate = insn->header.saturate;
618 insn->bits3.math.data_type = dataType;
619 }
620 insn->header.saturate = 0;
621 }
622
623
624 static void brw_set_ff_sync_message(struct brw_compile *p,
625 struct brw_instruction *insn,
626 bool allocate,
627 unsigned response_length,
628 bool end_of_thread)
629 {
630 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
631 1, response_length, true, end_of_thread);
632 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
633 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
634 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
635 insn->bits3.urb_gen5.allocate = allocate;
636 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
637 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
638 }
639
/**
 * Fill out the descriptor for a URB write message, using the per-gen
 * descriptor layout (urb_gen7 / urb_gen5 / urb).
 */
static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
         assert(msg_length == 2); /* header + one OWORD of data */
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes do not support the transpose swizzle. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;  /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;  /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
682
/**
 * Fill out the descriptor for a data-port write message, choosing the
 * appropriate shared function (SFID) and descriptor layout per gen.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   /* The function-control bit layout differs on every generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
740
/**
 * Fill out the descriptor for a data-port read message, choosing the
 * appropriate shared function (SFID) and descriptor layout per gen.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* On Gen6 the cache choice is expressed via the SFID rather than
       * a target_cache descriptor field.
       */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   /* The function-control bit layout differs on every generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
797
/**
 * Fill out the descriptor for a sampler message, using the per-gen
 * layout.  return_format exists only in the Gen4 layout; simd_mode
 * exists only on Gen5+.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
836
837
#define next_insn brw_next_insn
/**
 * Append a new instruction to the program, initialized from the default
 * state accumulated in p->current, and return a pointer to it for the
 * caller to fill in.  Grows the instruction store as needed.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, unsigned opcode)
{
   struct brw_instruction *insn;

   /* Double the store when it is full. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
   }

   /* Each brw_instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   /* Seed the new instruction with the current default header state. */
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag: the conditional-mod field (and the
    * predicate it implies for the following instruction) applies to a
    * single emitted instruction only.
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
865
866 static struct brw_instruction *brw_alu1( struct brw_compile *p,
867 unsigned opcode,
868 struct brw_reg dest,
869 struct brw_reg src )
870 {
871 struct brw_instruction *insn = next_insn(p, opcode);
872 brw_set_dest(p, insn, dest);
873 brw_set_src0(p, insn, src);
874 return insn;
875 }
876
877 static struct brw_instruction *brw_alu2(struct brw_compile *p,
878 unsigned opcode,
879 struct brw_reg dest,
880 struct brw_reg src0,
881 struct brw_reg src1 )
882 {
883 struct brw_instruction *insn = next_insn(p, opcode);
884 brw_set_dest(p, insn, dest);
885 brw_set_src0(p, insn, src0);
886 brw_set_src1(p, insn, src1);
887 return insn;
888 }
889
890 static int
891 get_3src_subreg_nr(struct brw_reg reg)
892 {
893 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
894 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
895 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
896 } else {
897 return reg.subnr / 4;
898 }
899 }
900
/* Emit a three-source instruction (used by MAD, LRP, BFE, BFI2).
 *
 * 3-src instructions have their own, more restrictive encoding: they are
 * align16-only, the destination must be a GRF or MRF, all sources must be
 * direct-addressed GRFs, and subregister numbers are expressed in 4-byte
 * units (see get_3src_subreg_nr()).
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   /* On gen7 an MRF destination must be expressed as a GRF. */
   gen7_convert_mrf_to_grf(p, &dest);

   /* 3-src instructions only exist in align16 mode. */
   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Destination subregister is encoded in 16-byte granularity. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a single element across the execution size. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
984
985
986 /***********************************************************************
987 * Convenience routines.
988 */
/* Define the public emitter brw_<OP>() as a thin wrapper around brw_alu1(). */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define the public emitter brw_<OP>() as a thin wrapper around brw_alu2(). */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define the public emitter brw_<OP>() as a thin wrapper around brw_alu3(). */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but additionally asserts that all four operands are floats —
 * used for opcodes (MAD, LRP) whose 3-src encoding is float-only here.
 */
#define ALU3F(OP)						\
struct brw_instruction *brw_##OP(struct brw_compile *p,		\
				 struct brw_reg dest,		\
				 struct brw_reg src0,		\
				 struct brw_reg src1,		\
				 struct brw_reg src2)		\
{								\
   assert(dest.type == BRW_REGISTER_TYPE_F);			\
   assert(src0.type == BRW_REGISTER_TYPE_F);			\
   assert(src1.type == BRW_REGISTER_TYPE_F);			\
   assert(src2.type == BRW_REGISTER_TYPE_F);			\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
1029
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 *
 * Defines the public emitter brw_<OP>(); note it returns void, unlike the
 * ALU wrappers, since the result may be two instructions.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
1054
1055
/* Instantiate the public emitters for all the straightforward opcodes
 * using the ALU1/ALU2/ALU3/ALU3F/ROUND wrappers above.  Opcodes needing
 * extra validation (ADD, AVG, MUL) have hand-written emitters below.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1092
1093
1094 struct brw_instruction *brw_ADD(struct brw_compile *p,
1095 struct brw_reg dest,
1096 struct brw_reg src0,
1097 struct brw_reg src1)
1098 {
1099 /* 6.2.2: add */
1100 if (src0.type == BRW_REGISTER_TYPE_F ||
1101 (src0.file == BRW_IMMEDIATE_VALUE &&
1102 src0.type == BRW_REGISTER_TYPE_VF)) {
1103 assert(src1.type != BRW_REGISTER_TYPE_UD);
1104 assert(src1.type != BRW_REGISTER_TYPE_D);
1105 }
1106
1107 if (src1.type == BRW_REGISTER_TYPE_F ||
1108 (src1.file == BRW_IMMEDIATE_VALUE &&
1109 src1.type == BRW_REGISTER_TYPE_VF)) {
1110 assert(src0.type != BRW_REGISTER_TYPE_UD);
1111 assert(src0.type != BRW_REGISTER_TYPE_D);
1112 }
1113
1114 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1115 }
1116
1117 struct brw_instruction *brw_AVG(struct brw_compile *p,
1118 struct brw_reg dest,
1119 struct brw_reg src0,
1120 struct brw_reg src1)
1121 {
1122 assert(dest.type == src0.type);
1123 assert(src0.type == src1.type);
1124 switch (src0.type) {
1125 case BRW_REGISTER_TYPE_B:
1126 case BRW_REGISTER_TYPE_UB:
1127 case BRW_REGISTER_TYPE_W:
1128 case BRW_REGISTER_TYPE_UW:
1129 case BRW_REGISTER_TYPE_D:
1130 case BRW_REGISTER_TYPE_UD:
1131 break;
1132 default:
1133 assert(!"Bad type for brw_AVG");
1134 }
1135
1136 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1137 }
1138
/* Emit a MUL, enforcing the operand-type and accumulator restrictions
 * listed in the spec section cited below.
 */
struct brw_instruction *brw_MUL(struct brw_compile *p,
				struct brw_reg dest,
				struct brw_reg src0,
				struct brw_reg src1)
{
   /* 6.32.38: mul */
   /* A 32-bit integer source cannot produce a float result. */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   /* A float (or immediate vector-float) operand may not be paired with a
    * D/UD operand.
    */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* The accumulator may not be a MUL source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1173
1174
1175 void brw_NOP(struct brw_compile *p)
1176 {
1177 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1178 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1179 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1180 brw_set_src1(p, insn, brw_imm_ud(0x0));
1181 }
1182
1183
1184
1185
1186
1187 /***********************************************************************
1188 * Comparisons, if/else/endif
1189 */
1190
/* Emit a JMPI instruction.  JMPI runs as a single-channel, unmasked
 * instruction regardless of the current compile defaults.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Reset the default so instructions emitted after the jump are not
    * accidentally predicated.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1206
1207 static void
1208 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1209 {
1210 p->if_stack[p->if_stack_depth] = inst - p->store;
1211
1212 p->if_stack_depth++;
1213 if (p->if_stack_array_size <= p->if_stack_depth) {
1214 p->if_stack_array_size *= 2;
1215 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1216 p->if_stack_array_size);
1217 }
1218 }
1219
1220 static struct brw_instruction *
1221 pop_if_stack(struct brw_compile *p)
1222 {
1223 p->if_stack_depth--;
1224 return &p->store[p->if_stack[p->if_stack_depth]];
1225 }
1226
1227 static void
1228 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1229 {
1230 if (p->loop_stack_array_size < p->loop_stack_depth) {
1231 p->loop_stack_array_size *= 2;
1232 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1233 p->loop_stack_array_size);
1234 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1235 p->loop_stack_array_size);
1236 }
1237
1238 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1239 p->loop_stack_depth++;
1240 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1241 }
1242
1243 static struct brw_instruction *
1244 get_inner_do_insn(struct brw_compile *p)
1245 {
1246 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1247 }
1248
1249 /* EU takes the value from the flag register and pushes it onto some
1250 * sort of a stack (presumably merging with any flag value already on
1251 * the stack). Within an if block, the flags at the top of the stack
1252 * control execution on each channel of the unit, eg. on each of the
1253 * 16 pixel values in our wm programs.
1254 *
1255 * When the matching 'else' instruction is reached (presumably by
1256 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1258 *
1259 * When the matching 'endif' instruction is reached, the flags are
1260 * popped off. If the stack is now empty, normal execution resumes.
1261 */
/* Emit an IF with the given execution size.  The jump target fields are
 * left zero here and patched later by patch_IF_ELSE() (or the whole
 * instruction is rewritten by convert_IF_ELSE_to_ADD() in SPF mode).
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6: IF operates on the instruction pointer. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: jump count lives in bits1; dest is an immediate word. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: null operands; JIP/UIP are filled in at patch time. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the predicate leak onto subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1302
1303 /* This function is only used for gen6-style IF instructions with an
1304 * embedded comparison (conditional modifier). It is not used on gen7.
1305 */
/* Emit a gen6-style IF with an embedded comparison: src0 and src1 are
 * compared using \p conditional instead of consuming an existing flag
 * value.  The jump count is patched later via patch_IF_ELSE().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* On gen6 branch instructions, this field holds the condition code. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1334
1335 /**
1336 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1337 */
/* In single-program-flow mode, rewrite the already-emitted IF (and
 * optional ELSE) instructions as predicated ADDs on the instruction
 * pointer.  Jump distances are in bytes; the "* 16" below converts an
 * instruction count into bytes (each instruction is 16 bytes).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IF jumps just past the ELSE; ELSE jumps to the end. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1374
1375 /**
1376 * Patch IF and ELSE instructions with appropriate jump targets.
1377 */
/* Back-patch the jump targets of a completed IF / (optional) ELSE /
 * ENDIF triple.  The encoding of the jump target differs per
 * generation: pre-gen6 uses bits3.if_else.jump_count (and pre-gen6 IF
 * without an ELSE becomes IFF), gen6 uses bits1.branch_gen6.jump_count,
 * and gen7+ uses the JIP/UIP pair in bits3.break_cont.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7+: with no ELSE, both JIP and UIP point to the ENDIF. */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 /* ELSE pops the mask entry that IF pushed. */
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1464
/* Emit an ELSE.  Like brw_IF(), the jump target fields are left zero
 * here and filled in later by patch_IF_ELSE() when the matching ENDIF
 * is emitted.  The ELSE is pushed onto the IF stack above its IF.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6: ELSE operates on the instruction pointer. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: JIP/UIP are patched later. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1497
/* Close the innermost IF/ELSE block: pop the IF (and optional ELSE)
 * off the stack, emit an ENDIF where the hardware needs one, and patch
 * all the jump targets.  On gen4/5 in SPF mode no ENDIF is emitted;
 * the IF/ELSE are rewritten as conditional ADDs instead.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1576
/* Emit a BREAK.  The jump target is filled in later (pre-gen6 via
 * brw_patch_break_cont(); gen6+ when the containing block is resolved).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop the mask entries of any IF blocks the break jumps out of. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1599
1600 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1601 {
1602 struct brw_instruction *insn;
1603
1604 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1605 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1606 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1607 brw_set_dest(p, insn, brw_ip_reg());
1608 brw_set_src0(p, insn, brw_ip_reg());
1609 brw_set_src1(p, insn, brw_imm_d(0x0));
1610
1611 insn->header.compression_control = BRW_COMPRESSION_NONE;
1612 insn->header.execution_size = BRW_EXECUTE_8;
1613 return insn;
1614 }
1615
/* Emit a pre-gen6 CONTINUE.  The jump count is left zero and patched
 * later by brw_patch_break_cont().
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   /* Pop the mask entries of any IF blocks the continue jumps out of. */
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1630
/* Emit a gen6+ HALT.  src1 carries the UIP/JIP jump offsets, which are
 * zero here and updated later by the caller.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1648
1649 /* DO/WHILE loop:
1650 *
1651 * The DO/WHILE is just an unterminated loop -- break or continue are
1652 * used for control within the loop. We have a few ways they can be
1653 * done.
1654 *
1655 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1656 * jip and no DO instruction.
1657 *
1658 * For non-uniform control flow pre-gen6, there's a DO instruction to
1659 * push the mask, and a WHILE to jump back, and BREAK to get out and
1660 * pop the mask.
1661 *
1662 * For gen6, there's no more mask stack, so no need for DO. WHILE
1663 * just points back to the first instruction of the loop.
1664 */
/* Open a DO/WHILE loop.  On gen6+ (and in SPF mode) no DO instruction
 * exists; the loop top is simply the next instruction slot, which is
 * recorded on the loop stack.  Pre-gen6 a real DO instruction is
 * emitted and pushed instead.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1692
1693 /**
1694 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1695 * instruction here.
1696 *
1697 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1698 * nesting, since it can always just point to the end of the block/current loop.
1699 */
/* Walk backwards from the WHILE to the matching DO and fill in the jump
 * counts of any BREAK/CONTINUE instructions belonging to this loop.
 * BREAK jumps just past the WHILE; CONTINUE jumps to the WHILE itself.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks on gen5 (2 per instruction). */
   int br = (brw->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1722
/* Close a DO/WHILE loop: emit the WHILE (or, in pre-gen6 SPF mode, an
 * ADD on IP), point its backwards jump at the loop top recorded by
 * brw_DO(), and pop the loop stack.  Pre-gen6, any BREAK/CONTINUE
 * inside the loop are patched here too.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   /* Jump offsets are in 64-bit chunks on gen5+ (2 per instruction). */
   unsigned br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Backwards jump: do_insn precedes insn, so JIP is negative. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: the loop branch is just an unconditional IP adjustment
	  * (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1786
1787 /* To integrate with the above, it makes sense that the comparison
1788 * instruction should populate the flag register. It might be simpler
1789 * just to use the flag reg for most WM tasks?
1790 */
/* Emit a CMP with the given conditional modifier, writing the result to
 * \p dest and updating the flag register.  If dest is the null register,
 * subsequent instructions are set to predicate on the computed flag.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
	  dest.nr == BRW_ARF_NULL) {
	 insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1832
1833 /* Issue 'wait' instruction for n1, host could program MMIO
1834 to wake up thread. */
1835 void brw_WAIT (struct brw_compile *p)
1836 {
1837 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1838 struct brw_reg src = brw_notification_1_reg();
1839
1840 brw_set_dest(p, insn, src);
1841 brw_set_src0(p, insn, src);
1842 brw_set_src1(p, insn, brw_null_reg());
1843 insn->header.execution_size = 0; /* must */
1844 insn->header.predicate_control = 0;
1845 insn->header.compression_control = 0;
1846 }
1847
1848
1849 /***********************************************************************
1850 * Helpers for the various SEND message types:
1851 */
1852
/** Extended math function, float[8].
 *
 * Emits a native MATH instruction on gen6+, or a SEND to the extended
 * math shared function on earlier hardware.
 *
 * \param function    BRW_MATH_FUNCTION_* selector
 * \param msg_reg_nr  message register number; only used on the gen4/5
 *                    SEND path
 * \param data_type   gen4/5 message data type; unused on gen6+
 * \param precision   gen4/5 precision control; unused on gen6+
 */
void brw_math( struct brw_compile *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned data_type,
               unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      /* MRF destinations are only valid on gen7+. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
      }

      /* The integer-divide functions take integer sources; every other
       * math function is float-only.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
          function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
                           insn,
                           function,
                           src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           data_type);
   }
}
1917
/** Extended math function, float[8].
 *
 * Two-source variant (gen6+ native MATH instruction only; there is no
 * gen4/5 fallback path here).
 *
 * \param function BRW_MATH_FUNCTION_* selector
 */
void brw_math2(struct brw_compile *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   /* MRF destinations are only valid on gen7+. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* The integer-divide functions take integer sources; every other
    * math function is float-only.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1967
1968
1969 /**
1970 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1971 * using a constant offset per channel.
1972 *
1973 * The offset must be aligned to oword size (16 bytes). Used for
1974 * register spilling.
1975 */
1976 void brw_oword_block_write_scratch(struct brw_compile *p,
1977 struct brw_reg mrf,
1978 int num_regs,
1979 unsigned offset)
1980 {
1981 struct brw_context *brw = p->brw;
1982 uint32_t msg_control, msg_type;
1983 int mlen;
1984
1985 if (brw->gen >= 6)
1986 offset /= 16;
1987
1988 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1989
1990 if (num_regs == 1) {
1991 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1992 mlen = 2;
1993 } else {
1994 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1995 mlen = 3;
1996 }
1997
1998 /* Set up the message header. This is g0, with g0.2 filled with
1999 * the offset. We don't want to leave our offset around in g0 or
2000 * it'll screw up texture samples, so set it up inside the message
2001 * reg.
2002 */
2003 {
2004 brw_push_insn_state(p);
2005 brw_set_mask_control(p, BRW_MASK_DISABLE);
2006 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2007
2008 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2009
2010 /* set message header global offset field (reg 0, element 2) */
2011 brw_MOV(p,
2012 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2013 mrf.nr,
2014 2), BRW_REGISTER_TYPE_UD),
2015 brw_imm_ud(offset));
2016
2017 brw_pop_insn_state(p);
2018 }
2019
2020 {
2021 struct brw_reg dest;
2022 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2023 int send_commit_msg;
2024 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2025 BRW_REGISTER_TYPE_UW);
2026
2027 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
2028 insn->header.compression_control = BRW_COMPRESSION_NONE;
2029 src_header = vec16(src_header);
2030 }
2031 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2032 insn->header.destreg__conditionalmod = mrf.nr;
2033
2034 /* Until gen6, writes followed by reads from the same location
2035 * are not guaranteed to be ordered unless write_commit is set.
2036 * If set, then a no-op write is issued to the destination
2037 * register to set a dependency, and a read from the destination
2038 * can be used to ensure the ordering.
2039 *
2040 * For gen6, only writes between different threads need ordering
2041 * protection. Our use of DP writes is all about register
2042 * spilling within a thread.
2043 */
2044 if (brw->gen >= 6) {
2045 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2046 send_commit_msg = 0;
2047 } else {
2048 dest = src_header;
2049 send_commit_msg = 1;
2050 }
2051
2052 brw_set_dest(p, insn, dest);
2053 if (brw->gen >= 6) {
2054 brw_set_src0(p, insn, mrf);
2055 } else {
2056 brw_set_src0(p, insn, brw_null_reg());
2057 }
2058
2059 if (brw->gen >= 6)
2060 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2061 else
2062 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2063
2064 brw_set_dp_write_message(p,
2065 insn,
2066 255, /* binding table index (255=stateless) */
2067 msg_control,
2068 msg_type,
2069 mlen,
2070 true, /* header_present */
2071 0, /* not a render target */
2072 send_commit_msg, /* response_length */
2073 0, /* eot */
2074 send_commit_msg);
2075 }
2076 }
2077
2078
2079 /**
2080 * Read a block of owords (half a GRF each) from the scratch buffer
2081 * using a constant index per channel.
2082 *
2083 * Offset must be aligned to oword size (16 bytes). Used for register
2084 * spilling.
2085 */
2086 void
2087 brw_oword_block_read_scratch(struct brw_compile *p,
2088 struct brw_reg dest,
2089 struct brw_reg mrf,
2090 int num_regs,
2091 unsigned offset)
2092 {
2093 struct brw_context *brw = p->brw;
2094 uint32_t msg_control;
2095 int rlen;
2096
2097 if (brw->gen >= 6)
2098 offset /= 16;
2099
2100 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2101 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2102
2103 if (num_regs == 1) {
2104 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2105 rlen = 1;
2106 } else {
2107 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2108 rlen = 2;
2109 }
2110
2111 {
2112 brw_push_insn_state(p);
2113 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2114 brw_set_mask_control(p, BRW_MASK_DISABLE);
2115
2116 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2117
2118 /* set message header global offset field (reg 0, element 2) */
2119 brw_MOV(p,
2120 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2121 mrf.nr,
2122 2), BRW_REGISTER_TYPE_UD),
2123 brw_imm_ud(offset));
2124
2125 brw_pop_insn_state(p);
2126 }
2127
2128 {
2129 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2130
2131 assert(insn->header.predicate_control == 0);
2132 insn->header.compression_control = BRW_COMPRESSION_NONE;
2133 insn->header.destreg__conditionalmod = mrf.nr;
2134
2135 brw_set_dest(p, insn, dest); /* UW? */
2136 if (brw->gen >= 6) {
2137 brw_set_src0(p, insn, mrf);
2138 } else {
2139 brw_set_src0(p, insn, brw_null_reg());
2140 }
2141
2142 brw_set_dp_read_message(p,
2143 insn,
2144 255, /* binding table index (255=stateless) */
2145 msg_control,
2146 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2147 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2148 1, /* msg_length */
2149 true, /* header_present */
2150 rlen);
2151 }
2152 }
2153
/**
 * Read a block of registers from the scratch buffer using the gen7
 * dedicated scratch-read data port message.
 *
 * The scratch base comes from g0.5 in the message header, so only g0
 * needs to be sent as payload.
 *
 * \param num_regs number of registers to read: 1, 2 or 4
 * \param offset   byte offset into scratch; must be a multiple of
 *                 REG_SIZE (32 bytes)
 */
void
gen7_block_read_scratch(struct brw_compile *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   /* The remaining descriptor bits are ORed in after the base
    * descriptor is written, so ordering matters here.
    */
   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}
2195
/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 *
 * \param mrf              message register used for the header (retyped UD)
 * \param offset           byte offset into the buffer (16-byte aligned)
 * \param bind_table_index binding table slot of the constant buffer
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Build the header with predication/compression/masking disabled so
    * the offset write is unconditional and uncompressed.
    */
   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2254
2255
2256 void brw_fb_WRITE(struct brw_compile *p,
2257 int dispatch_width,
2258 unsigned msg_reg_nr,
2259 struct brw_reg src0,
2260 unsigned msg_control,
2261 unsigned binding_table_index,
2262 unsigned msg_length,
2263 unsigned response_length,
2264 bool eot,
2265 bool header_present)
2266 {
2267 struct brw_context *brw = p->brw;
2268 struct brw_instruction *insn;
2269 unsigned msg_type;
2270 struct brw_reg dest;
2271
2272 if (dispatch_width == 16)
2273 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2274 else
2275 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2276
2277 if (brw->gen >= 6) {
2278 insn = next_insn(p, BRW_OPCODE_SENDC);
2279 } else {
2280 insn = next_insn(p, BRW_OPCODE_SEND);
2281 }
2282 insn->header.compression_control = BRW_COMPRESSION_NONE;
2283
2284 if (brw->gen >= 6) {
2285 /* headerless version, just submit color payload */
2286 src0 = brw_message_reg(msg_reg_nr);
2287
2288 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2289 } else {
2290 insn->header.destreg__conditionalmod = msg_reg_nr;
2291
2292 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2293 }
2294
2295 brw_set_dest(p, insn, dest);
2296 brw_set_src0(p, insn, src0);
2297 brw_set_dp_write_message(p,
2298 insn,
2299 binding_table_index,
2300 msg_control,
2301 msg_type,
2302 msg_length,
2303 header_present,
2304 eot, /* last render target write */
2305 response_length,
2306 eot,
2307 0 /* send_commit_msg */);
2308 }
2309
2310
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 *
 * \param msg_reg_nr message register; pass -1 to skip the implied-move
 *                   resolution (compared against the unsigned parameter,
 *                   i.e. all-ones)
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send).  The hardware behavior is undefined if this instruction is
    *     set as compressed.  However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
      insn->header.compression_control = BRW_COMPRESSION_NONE;

   /* Pre-gen6 names the message register in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2368
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 *
 * Emit a URB_WRITE message.
 *
 * \param flags           enum brw_urb_write_flags bitmask
 * \param msg_length      payload length in registers
 * \param response_length registers expected back
 * \param offset, swizzle forwarded into the URB message descriptor
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 names the message register in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2419
2420 static int
2421 brw_find_next_block_end(struct brw_compile *p, int start_offset)
2422 {
2423 int offset;
2424 void *store = p->store;
2425
2426 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2427 offset = next_offset(store, offset)) {
2428 struct brw_instruction *insn = store + offset;
2429
2430 switch (insn->header.opcode) {
2431 case BRW_OPCODE_ENDIF:
2432 case BRW_OPCODE_ELSE:
2433 case BRW_OPCODE_WHILE:
2434 case BRW_OPCODE_HALT:
2435 return offset;
2436 }
2437 }
2438
2439 return 0;
2440 }
2441
2442 /* There is no DO instruction on gen6, so to find the end of the loop
2443 * we have to see if the loop is jumping back before our start
2444 * instruction.
2445 */
2446 static int
2447 brw_find_loop_end(struct brw_compile *p, int start_offset)
2448 {
2449 struct brw_context *brw = p->brw;
2450 int offset;
2451 int scale = 8;
2452 void *store = p->store;
2453
2454 /* Always start after the instruction (such as a WHILE) we're trying to fix
2455 * up.
2456 */
2457 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2458 offset = next_offset(store, offset)) {
2459 struct brw_instruction *insn = store + offset;
2460
2461 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2462 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2463 : insn->bits3.break_cont.jip;
2464 if (offset + jip * scale <= start_offset)
2465 return offset;
2466 }
2467 }
2468 assert(!"not reached");
2469 return start_offset;
2470 }
2471
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Gen6+ only; earlier hardware patches jumps differently and returns
 * immediately here.  All JIP/UIP values are stored in units of 8 bytes
 * (the `scale` below).
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;
   void *store = p->store;

   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

      case BRW_OPCODE_ENDIF:
         /* block_end_offset == 0 means no later flow-control instruction
          * was found; jump to the next instruction (2 half-units).
          */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same.  In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
         /* other opcodes need no fixup and fall out of the switch */
      }
   }
}
2548
2549 void brw_ff_sync(struct brw_compile *p,
2550 struct brw_reg dest,
2551 unsigned msg_reg_nr,
2552 struct brw_reg src0,
2553 bool allocate,
2554 unsigned response_length,
2555 bool eot)
2556 {
2557 struct brw_context *brw = p->brw;
2558 struct brw_instruction *insn;
2559
2560 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2561
2562 insn = next_insn(p, BRW_OPCODE_SEND);
2563 brw_set_dest(p, insn, dest);
2564 brw_set_src0(p, insn, src0);
2565 brw_set_src1(p, insn, brw_imm_d(0));
2566
2567 if (brw->gen < 6)
2568 insn->header.destreg__conditionalmod = msg_reg_nr;
2569
2570 brw_set_ff_sync_message(p,
2571 insn,
2572 allocate,
2573 response_length,
2574 eot);
2575 }
2576
2577 /**
2578 * Emit the SEND instruction necessary to generate stream output data on Gen6
2579 * (for transform feedback).
2580 *
2581 * If send_commit_msg is true, this is the last piece of stream output data
2582 * from this thread, so send the data as a committed write. According to the
2583 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2584 *
2585 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2586 * writes are complete by sending the final write as a committed write."
2587 */
2588 void
2589 brw_svb_write(struct brw_compile *p,
2590 struct brw_reg dest,
2591 unsigned msg_reg_nr,
2592 struct brw_reg src0,
2593 unsigned binding_table_index,
2594 bool send_commit_msg)
2595 {
2596 struct brw_instruction *insn;
2597
2598 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2599
2600 insn = next_insn(p, BRW_OPCODE_SEND);
2601 brw_set_dest(p, insn, dest);
2602 brw_set_src0(p, insn, src0);
2603 brw_set_src1(p, insn, brw_imm_d(0));
2604 brw_set_dp_write_message(p, insn,
2605 binding_table_index,
2606 0, /* msg_control: ignored */
2607 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2608 1, /* msg_length */
2609 true, /* header_present */
2610 0, /* last_render_target: ignored */
2611 send_commit_msg, /* response_length */
2612 0, /* end_of_thread */
2613 send_commit_msg); /* send_commit_msg */
2614 }
2615
/**
 * Fill in the SEND descriptor for an untyped atomic operation.
 *
 * Haswell uses data cache port 1 with separate align1/align16 message
 * types; other gen7 parts use the legacy data cache SFID.  The extra
 * descriptor bits (SIMD mode, return-data flag, atomic op) are ORed in
 * after brw_set_message_descriptor() writes the base descriptor, so the
 * ordering below matters.
 *
 * \param atomic_op BRW_AOP_* code, packed into descriptor bits 8..11
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (insn->header.execution_size != BRW_EXECUTE_16)
            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }

   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   insn->bits3.ud |= atomic_op << 8;
}
2659
2660 void
2661 brw_untyped_atomic(struct brw_compile *p,
2662 struct brw_reg dest,
2663 struct brw_reg mrf,
2664 unsigned atomic_op,
2665 unsigned bind_table_index,
2666 unsigned msg_length,
2667 unsigned response_length) {
2668 struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2669
2670 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2671 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2672 brw_set_src1(p, insn, brw_imm_d(0));
2673 brw_set_dp_untyped_atomic_message(
2674 p, insn, atomic_op, bind_table_index, msg_length, response_length,
2675 insn->header.access_mode == BRW_ALIGN_1);
2676 }
2677
/**
 * Fill in the SEND descriptor for an untyped surface read.
 *
 * The number of channels to return is derived from response_length and
 * the instruction's dispatch width, then encoded as a mask of channels
 * to *drop* in descriptor bits 8..11.  Haswell uses data cache port 1;
 * other gen7 parts use the legacy data cache SFID.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
                                        struct brw_instruction *insn,
                                        unsigned bind_table_index,
                                        unsigned msg_length,
                                        unsigned response_length,
                                        bool header_present)
{
   const unsigned dispatch_width =
      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
   /* Each returned channel occupies one register per 8 channels of
    * dispatch width.
    */
   const unsigned num_channels = response_length / (dispatch_width / 8);

   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
   }

   if (insn->header.access_mode == BRW_ALIGN_1) {
      if (dispatch_width == 16)
         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
      else
         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
   }

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;

   /* Set mask of 32-bit channels to drop. */
   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
}
2716
2717 void
2718 brw_untyped_surface_read(struct brw_compile *p,
2719 struct brw_reg dest,
2720 struct brw_reg mrf,
2721 unsigned bind_table_index,
2722 unsigned msg_length,
2723 unsigned response_length)
2724 {
2725 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2726
2727 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2728 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2729 brw_set_dp_untyped_surface_read_message(
2730 p, insn, bind_table_index, msg_length, response_length,
2731 insn->header.access_mode == BRW_ALIGN_1);
2732 }
2733
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 *
 * \param payload    register pair holding the offset/value message payload
 * \param surf_index binding table index of the shader-time buffer
 */
void brw_shader_time_add(struct brw_compile *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   struct brw_context *brw = p->brw;
   assert(brw->gen >= 7);

   /* Emit the SEND itself under align1 / mask-disable state, restoring
    * the caller's state afterwards.
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
                                     2 /* message length */,
                                     0 /* response length */,
                                     false /* header present */);
}