f831413cd477d30f9e7aeb6e3129a8b9ce0e1f47
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 unsigned msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102 /**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107 unsigned
108 brw_reg_type_to_hw_type(const struct brw_context *brw,
109 enum brw_reg_type type, unsigned file)
110 {
111 if (file == BRW_IMMEDIATE_VALUE) {
112 const static int imm_hw_types[] = {
113 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
115 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
117 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
118 [BRW_REGISTER_TYPE_UB] = -1,
119 [BRW_REGISTER_TYPE_B] = -1,
120 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
123 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
127 };
128 assert(type < ARRAY_SIZE(imm_hw_types));
129 assert(imm_hw_types[type] != -1);
130 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131 return imm_hw_types[type];
132 } else {
133 /* Non-immediate registers */
134 const static int hw_types[] = {
135 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
137 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
139 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
141 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
142 [BRW_REGISTER_TYPE_UV] = -1,
143 [BRW_REGISTER_TYPE_VF] = -1,
144 [BRW_REGISTER_TYPE_V] = -1,
145 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
149 };
150 assert(type < ARRAY_SIZE(hw_types));
151 assert(hw_types[type] != -1);
152 assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154 return hw_types[type];
155 }
156 }
157
/**
 * Encode \p dest as the destination operand of \p insn.
 *
 * Fills in the register file, hardware type, address mode, register and
 * subregister numbers (or indirect offset), horizontal stride and, in
 * align16 mode, the writemask.  Finally derives the instruction's
 * execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/MRF register numbers occupy 7 bits in the encoding. */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Stride 0 is not a valid destination stride; promote to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subregister numbers are in 16-byte units. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
	     dest.file == BRW_MESSAGE_REGISTER_FILE) {
	    assert(dest.dw1.bits.writemask != 0);
	 }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      /* Register-indirect addressing. */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
219
220 extern int reg_type_size[];
221
222 static void
223 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
224 {
225 int hstride_for_reg[] = {0, 1, 2, 4};
226 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
227 int width_for_reg[] = {1, 2, 4, 8, 16};
228 int execsize_for_reg[] = {1, 2, 4, 8, 16};
229 int width, hstride, vstride, execsize;
230
231 if (reg.file == BRW_IMMEDIATE_VALUE) {
232 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
233 * mean the destination has to be 128-bit aligned and the
234 * destination horiz stride has to be a word.
235 */
236 if (reg.type == BRW_REGISTER_TYPE_V) {
237 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
238 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
239 }
240
241 return;
242 }
243
244 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245 reg.file == BRW_ARF_NULL)
246 return;
247
248 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
249 hstride = hstride_for_reg[reg.hstride];
250
251 if (reg.vstride == 0xf) {
252 vstride = -1;
253 } else {
254 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
255 vstride = vstride_for_reg[reg.vstride];
256 }
257
258 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
259 width = width_for_reg[reg.width];
260
261 assert(insn->header.execution_size >= 0 &&
262 insn->header.execution_size < Elements(execsize_for_reg));
263 execsize = execsize_for_reg[insn->header.execution_size];
264
265 /* Restrictions from 3.3.10: Register Region Restrictions. */
266 /* 3. */
267 assert(execsize >= width);
268
269 /* 4. */
270 if (execsize == width && hstride != 0) {
271 assert(vstride == -1 || vstride == width * hstride);
272 }
273
274 /* 5. */
275 if (execsize == width && hstride == 0) {
276 /* no restriction on vstride. */
277 }
278
279 /* 6. */
280 if (width == 1) {
281 assert(hstride == 0);
282 }
283
284 /* 7. */
285 if (execsize == 1 && width == 1) {
286 assert(hstride == 0);
287 assert(vstride == 0);
288 }
289
290 /* 8. */
291 if (vstride == 0 && hstride == 0) {
292 assert(width == 1);
293 }
294
295 /* 10. Check destination issues. */
296 }
297
298 void
299 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
300 struct brw_reg reg)
301 {
302 struct brw_context *brw = p->brw;
303
304 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
305 assert(reg.nr < 128);
306
307 gen7_convert_mrf_to_grf(p, &reg);
308
309 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
310 insn->header.opcode == BRW_OPCODE_SENDC)) {
311 /* Any source modifiers or regions will be ignored, since this just
312 * identifies the MRF/GRF to start reading the message contents from.
313 * Check for some likely failures.
314 */
315 assert(!reg.negate);
316 assert(!reg.abs);
317 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
318 }
319
320 validate_reg(insn, reg);
321
322 insn->bits1.da1.src0_reg_file = reg.file;
323 insn->bits1.da1.src0_reg_type =
324 brw_reg_type_to_hw_type(brw, reg.type, reg.file);
325 insn->bits2.da1.src0_abs = reg.abs;
326 insn->bits2.da1.src0_negate = reg.negate;
327 insn->bits2.da1.src0_address_mode = reg.address_mode;
328
329 if (reg.file == BRW_IMMEDIATE_VALUE) {
330 insn->bits3.ud = reg.dw1.ud;
331
332 /* The Bspec's section titled "Non-present Operands" claims that if src0
333 * is an immediate that src1's type must be the same as that of src0.
334 *
335 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
336 * that do not follow this rule. E.g., from the IVB/HSW table:
337 *
338 * DataTypeIndex 18-Bit Mapping Mapped Meaning
339 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
340 *
341 * And from the SNB table:
342 *
343 * DataTypeIndex 18-Bit Mapping Mapped Meaning
344 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
345 *
346 * Neither of these cause warnings from the simulator when used,
347 * compacted or otherwise. In fact, all compaction mappings that have an
348 * immediate in src0 use a:ud for src1.
349 *
350 * The GM45 instruction compaction tables do not contain mapped meanings
351 * so it's not clear whether it has the restriction. We'll assume it was
352 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
353 */
354 insn->bits1.da1.src1_reg_file = 0; /* arf */
355 if (brw->gen < 6) {
356 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
357 } else {
358 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
359 }
360
361 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
362 * for immediate values. Presumably the hardware engineers realized
363 * that the only useful floating-point value that could be represented
364 * in this format is 0.0, which can also be represented as a VF-typed
365 * immediate, so they gave us the previously mentioned mapping on IVB+.
366 *
367 * Strangely, we do have a mapping for imm:f in src1, so we don't need
368 * to do this there.
369 *
370 * If we see a 0.0:F, change the type to VF so that it can be compacted.
371 */
372 if (insn->bits3.ud == 0x0 &&
373 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
374 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
375 }
376 }
377 else
378 {
379 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
380 if (insn->header.access_mode == BRW_ALIGN_1) {
381 insn->bits2.da1.src0_subreg_nr = reg.subnr;
382 insn->bits2.da1.src0_reg_nr = reg.nr;
383 }
384 else {
385 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
386 insn->bits2.da16.src0_reg_nr = reg.nr;
387 }
388 }
389 else {
390 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
391
392 if (insn->header.access_mode == BRW_ALIGN_1) {
393 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
394 }
395 else {
396 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
397 }
398 }
399
400 if (insn->header.access_mode == BRW_ALIGN_1) {
401 if (reg.width == BRW_WIDTH_1 &&
402 insn->header.execution_size == BRW_EXECUTE_1) {
403 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
404 insn->bits2.da1.src0_width = BRW_WIDTH_1;
405 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
406 }
407 else {
408 insn->bits2.da1.src0_horiz_stride = reg.hstride;
409 insn->bits2.da1.src0_width = reg.width;
410 insn->bits2.da1.src0_vert_stride = reg.vstride;
411 }
412 }
413 else {
414 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
415 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
416 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
417 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
418
419 /* This is an oddity of the fact we're using the same
420 * descriptions for registers in align_16 as align_1:
421 */
422 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
423 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
424 else
425 insn->bits2.da16.src0_vert_stride = reg.vstride;
426 }
427 }
428 }
429
430
431 void
432 brw_set_src1(struct brw_compile *p,
433 struct brw_instruction *insn,
434 struct brw_reg reg)
435 {
436 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
437
438 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
439 assert(reg.nr < 128);
440
441 gen7_convert_mrf_to_grf(p, &reg);
442
443 validate_reg(insn, reg);
444
445 insn->bits1.da1.src1_reg_file = reg.file;
446 insn->bits1.da1.src1_reg_type =
447 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
448 insn->bits3.da1.src1_abs = reg.abs;
449 insn->bits3.da1.src1_negate = reg.negate;
450
451 /* Only src1 can be immediate in two-argument instructions.
452 */
453 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
454
455 if (reg.file == BRW_IMMEDIATE_VALUE) {
456 insn->bits3.ud = reg.dw1.ud;
457 }
458 else {
459 /* This is a hardware restriction, which may or may not be lifted
460 * in the future:
461 */
462 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
463 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
464
465 if (insn->header.access_mode == BRW_ALIGN_1) {
466 insn->bits3.da1.src1_subreg_nr = reg.subnr;
467 insn->bits3.da1.src1_reg_nr = reg.nr;
468 }
469 else {
470 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
471 insn->bits3.da16.src1_reg_nr = reg.nr;
472 }
473
474 if (insn->header.access_mode == BRW_ALIGN_1) {
475 if (reg.width == BRW_WIDTH_1 &&
476 insn->header.execution_size == BRW_EXECUTE_1) {
477 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
478 insn->bits3.da1.src1_width = BRW_WIDTH_1;
479 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
480 }
481 else {
482 insn->bits3.da1.src1_horiz_stride = reg.hstride;
483 insn->bits3.da1.src1_width = reg.width;
484 insn->bits3.da1.src1_vert_stride = reg.vstride;
485 }
486 }
487 else {
488 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
489 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
490 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
491 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
492
493 /* This is an oddity of the fact we're using the same
494 * descriptions for registers in align_16 as align_1:
495 */
496 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
497 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
498 else
499 insn->bits3.da16.src1_vert_stride = reg.vstride;
500 }
501 }
502 }
503
504 /**
505 * Set the Message Descriptor and Extended Message Descriptor fields
506 * for SEND messages.
507 *
508 * \note This zeroes out the Function Control bits, so it must be called
509 * \b before filling out any message-specific data. Callers can
510 * choose not to fill in irrelevant bits; they will be zero.
511 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Zero out src1 (which holds the message descriptor bits) before
    * filling in the generation-specific fields below.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Original (pre-Ironlake) descriptor layout. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
546
547 static void brw_set_math_message( struct brw_compile *p,
548 struct brw_instruction *insn,
549 unsigned function,
550 unsigned integer_type,
551 bool low_precision,
552 unsigned dataType )
553 {
554 struct brw_context *brw = p->brw;
555 unsigned msg_length;
556 unsigned response_length;
557
558 /* Infer message length from the function */
559 switch (function) {
560 case BRW_MATH_FUNCTION_POW:
561 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
562 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
563 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
564 msg_length = 2;
565 break;
566 default:
567 msg_length = 1;
568 break;
569 }
570
571 /* Infer response length from the function */
572 switch (function) {
573 case BRW_MATH_FUNCTION_SINCOS:
574 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
575 response_length = 2;
576 break;
577 default:
578 response_length = 1;
579 break;
580 }
581
582
583 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
584 msg_length, response_length, false, false);
585 if (brw->gen == 5) {
586 insn->bits3.math_gen5.function = function;
587 insn->bits3.math_gen5.int_type = integer_type;
588 insn->bits3.math_gen5.precision = low_precision;
589 insn->bits3.math_gen5.saturate = insn->header.saturate;
590 insn->bits3.math_gen5.data_type = dataType;
591 insn->bits3.math_gen5.snapshot = 0;
592 } else {
593 insn->bits3.math.function = function;
594 insn->bits3.math.int_type = integer_type;
595 insn->bits3.math.precision = low_precision;
596 insn->bits3.math.saturate = insn->header.saturate;
597 insn->bits3.math.data_type = dataType;
598 }
599 insn->header.saturate = 0;
600 }
601
602
/**
 * Fill in the URB message descriptor for an FF_SYNC message
 * (fixed-function synchronization; Gen5 URB opcode 1).
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   /* FF_SYNC is a single-phase (header-only) URB message. */
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
618
/**
 * Fill in the message descriptor for a URB write, handling the differing
 * descriptor layouts of Gen4/G4x, Gen5/6, and Gen7.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
	 assert(msg_length == 2); /* header + one OWORD of data */
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
	 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes do not support transposed swizzling. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
661
/**
 * Fill in the message descriptor for a data-port write, selecting the
 * appropriate shared function (SFID) and descriptor layout per generation.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 unsigned last_render_target,
			 unsigned response_length,
			 unsigned end_of_thread,
			 unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Descriptor bit layouts differ per generation; pick the matching one. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
719
/**
 * Fill in the message descriptor for a data-port read, selecting the
 * appropriate shared function (SFID) and descriptor layout per generation.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
			bool header_present,
			unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* On Gen6 the cache target selects the SFID directly. */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Descriptor bit layouts differ per generation; pick the matching one. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
776
/**
 * Fill in the message descriptor for a sampler message, using the
 * descriptor layout of the running generation (return_format only exists
 * on original Gen4).
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
815
816
817 #define next_insn brw_next_insn
/**
 * Append a new instruction with \p opcode to the program store and return
 * a pointer to it for further encoding.
 *
 * The instruction is initialized from the template in p->current (which
 * carries predication/compression/etc. state), after which one-shot state
 * in the template is reset.  The store grows by doubling when full.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, unsigned opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
   }

   /* Each native (uncompacted) instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
844
845 static struct brw_instruction *brw_alu1( struct brw_compile *p,
846 unsigned opcode,
847 struct brw_reg dest,
848 struct brw_reg src )
849 {
850 struct brw_instruction *insn = next_insn(p, opcode);
851 brw_set_dest(p, insn, dest);
852 brw_set_src0(p, insn, src);
853 return insn;
854 }
855
856 static struct brw_instruction *brw_alu2(struct brw_compile *p,
857 unsigned opcode,
858 struct brw_reg dest,
859 struct brw_reg src0,
860 struct brw_reg src1 )
861 {
862 struct brw_instruction *insn = next_insn(p, opcode);
863 brw_set_dest(p, insn, dest);
864 brw_set_src0(p, insn, src0);
865 brw_set_src1(p, insn, src1);
866 return insn;
867 }
868
869 static int
870 get_3src_subreg_nr(struct brw_reg reg)
871 {
872 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
873 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
874 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
875 } else {
876 return reg.subnr / 4;
877 }
878 }
879
/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...):
 * <opcode> dest, src0, src1, src2.
 *
 * Three-source instructions use their own align16-only encoding: all
 * sources must be GRFs with direct addressing, and the destination must be
 * a GRF or MRF.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   /* The 3-src dest file field is a single bit: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number is split across two instruction words. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
963
964
965 /***********************************************************************
966 * Convenience routines.
967 */
/* Generate a brw_OP() wrapper emitting a one-source ALU instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);	\
}

/* Generate a brw_OP() wrapper emitting a two-source ALU instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Generate a brw_OP() wrapper emitting a three-source ALU instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but asserts all four operands are float (MAD/LRP require
 * this; see the type note in brw_alu3()).
 */
#define ALU3F(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)			\
{							\
   assert(dest.type == BRW_REGISTER_TYPE_F);		\
   assert(src0.type == BRW_REGISTER_TYPE_F);		\
   assert(src1.type == BRW_REGISTER_TYPE_F);		\
   assert(src2.type == BRW_REGISTER_TYPE_F);		\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
1033
1034
/* Moves, selection and logical ops */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
/* Shifts */
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
/* Half-float conversions (gen7+) */
ALU1(F32TO16)
ALU1(F16TO32)
/* Arithmetic */
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
/* Dot products and plane equations */
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
/* Three-source ops (float-only for MAD/LRP) */
ALU3F(MAD)
ALU3F(LRP)
/* Bitfield manipulation (gen7+) */
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
/* Extended-precision add/subtract with carry/borrow (gen7+) */
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1071
1072
1073 struct brw_instruction *brw_ADD(struct brw_compile *p,
1074 struct brw_reg dest,
1075 struct brw_reg src0,
1076 struct brw_reg src1)
1077 {
1078 /* 6.2.2: add */
1079 if (src0.type == BRW_REGISTER_TYPE_F ||
1080 (src0.file == BRW_IMMEDIATE_VALUE &&
1081 src0.type == BRW_REGISTER_TYPE_VF)) {
1082 assert(src1.type != BRW_REGISTER_TYPE_UD);
1083 assert(src1.type != BRW_REGISTER_TYPE_D);
1084 }
1085
1086 if (src1.type == BRW_REGISTER_TYPE_F ||
1087 (src1.file == BRW_IMMEDIATE_VALUE &&
1088 src1.type == BRW_REGISTER_TYPE_VF)) {
1089 assert(src0.type != BRW_REGISTER_TYPE_UD);
1090 assert(src0.type != BRW_REGISTER_TYPE_D);
1091 }
1092
1093 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1094 }
1095
1096 struct brw_instruction *brw_AVG(struct brw_compile *p,
1097 struct brw_reg dest,
1098 struct brw_reg src0,
1099 struct brw_reg src1)
1100 {
1101 assert(dest.type == src0.type);
1102 assert(src0.type == src1.type);
1103 switch (src0.type) {
1104 case BRW_REGISTER_TYPE_B:
1105 case BRW_REGISTER_TYPE_UB:
1106 case BRW_REGISTER_TYPE_W:
1107 case BRW_REGISTER_TYPE_UW:
1108 case BRW_REGISTER_TYPE_D:
1109 case BRW_REGISTER_TYPE_UD:
1110 break;
1111 default:
1112 assert(!"Bad type for brw_AVG");
1113 }
1114
1115 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1116 }
1117
1118 struct brw_instruction *brw_MUL(struct brw_compile *p,
1119 struct brw_reg dest,
1120 struct brw_reg src0,
1121 struct brw_reg src1)
1122 {
1123 /* 6.32.38: mul */
1124 if (src0.type == BRW_REGISTER_TYPE_D ||
1125 src0.type == BRW_REGISTER_TYPE_UD ||
1126 src1.type == BRW_REGISTER_TYPE_D ||
1127 src1.type == BRW_REGISTER_TYPE_UD) {
1128 assert(dest.type != BRW_REGISTER_TYPE_F);
1129 }
1130
1131 if (src0.type == BRW_REGISTER_TYPE_F ||
1132 (src0.file == BRW_IMMEDIATE_VALUE &&
1133 src0.type == BRW_REGISTER_TYPE_VF)) {
1134 assert(src1.type != BRW_REGISTER_TYPE_UD);
1135 assert(src1.type != BRW_REGISTER_TYPE_D);
1136 }
1137
1138 if (src1.type == BRW_REGISTER_TYPE_F ||
1139 (src1.file == BRW_IMMEDIATE_VALUE &&
1140 src1.type == BRW_REGISTER_TYPE_VF)) {
1141 assert(src0.type != BRW_REGISTER_TYPE_UD);
1142 assert(src0.type != BRW_REGISTER_TYPE_D);
1143 }
1144
1145 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1146 src0.nr != BRW_ARF_ACCUMULATOR);
1147 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1148 src1.nr != BRW_ARF_ACCUMULATOR);
1149
1150 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1151 }
1152
1153
1154 void brw_NOP(struct brw_compile *p)
1155 {
1156 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1157 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1158 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1159 brw_set_src1(p, insn, brw_imm_ud(0x0));
1160 }
1161
1162
1163
1164
1165
1166 /***********************************************************************
1167 * Comparisons, if/else/endif
1168 */
1169
/* Emit a JMPI (jump indexed) instruction.
 *
 * JMPI is a scalar branch: execution size is forced to 1 with
 * compression and masking disabled.  The caller supplies the jump
 * target (typically an offset in src1).
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Clear the default predication state so subsequent instructions
    * don't inherit the predicate used for the jump.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1185
/* Record the position of an IF (or ELSE) instruction so that the
 * matching brw_ENDIF() can find and patch it later.
 *
 * Positions are stored as indices into p->store rather than raw
 * pointers because next_insn() may reallocate (and move) the store.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow the stack when the *next* push would overflow it. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1198
/* Pop the most recently pushed IF/ELSE instruction off the if-stack
 * and return a pointer to it (valid until the next store realloc).
 */
static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}
1205
1206 static void
1207 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1208 {
1209 if (p->loop_stack_array_size < p->loop_stack_depth) {
1210 p->loop_stack_array_size *= 2;
1211 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1212 p->loop_stack_array_size);
1213 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1214 p->loop_stack_array_size);
1215 }
1216
1217 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1218 p->loop_stack_depth++;
1219 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1220 }
1221
/* Return the DO instruction (or loop start on gen6+) of the innermost
 * loop currently being emitted.
 */
static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}
1227
1228 /* EU takes the value from the flag register and pushes it onto some
1229 * sort of a stack (presumably merging with any flag value already on
1230 * the stack). Within an if block, the flags at the top of the stack
1231 * control execution on each channel of the unit, eg. on each of the
1232 * 16 pixel values in our wm programs.
1233 *
1234 * When the matching 'else' instruction is reached (presumably by
1235 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1237 *
1238 * When the matching 'endif' instruction is reached, the flags are
1239 * popped off. If the stack is now empty, normal execution resumes.
1240 */
/* Emit an IF instruction with the given execution size.
 *
 * The operand encoding is generation-specific:
 *  - gen4/5: IP-relative, with a jump count in src1 (patched later by
 *    patch_IF_ELSE(), or converted to an ADD in SPF mode).
 *  - gen6:   jump count lives in bits1.branch_gen6.
 *  - gen7+:  JIP/UIP live in bits3.break_cont.
 * All branch targets are filled in by the matching brw_ENDIF().
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* JIP/UIP are patched in by brw_ENDIF(). */
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the IF's predication leak onto later instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1281
1282 /* This function is only used for gen6-style IF instructions with an
1283 * embedded comparison (conditional modifier). It is not used on gen7.
1284 */
/* Emit a gen6 IF with an embedded comparison: the conditional modifier
 * compares src0 against src1 directly, avoiding a separate CMP.
 * The jump count is patched later by brw_ENDIF().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded-compare form must not itself be compressed or
    * predicated; the conditional modifier does the work.
    */
   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1313
1314 /**
1315 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1316 */
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs
 * that jump by modifying IP directly.  Jump distances are in bytes:
 * each instruction is 16 bytes, hence the "* 16" below.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* Jump past the ELSE itself (+1), landing on its successor. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1353
1354 /**
1355 * Patch IF and ELSE instructions with appropriate jump targets.
1356 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF() once the position of the ENDIF is known.
 * Jump distances are in 64-bit chunks ("br" units): one per
 * instruction on gen4, two on gen5+ (each instruction is 128 bits).
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1443
/* Emit an ELSE instruction for the innermost open IF.
 *
 * Jump targets are left zero here and patched by the matching
 * brw_ENDIF(); the operand encoding mirrors brw_IF()'s per-gen layout.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Push onto the same stack as IF so brw_ENDIF() pops ELSE first. */
   push_if_stack(p, insn);
}
1476
/* Close the innermost IF (and optional ELSE), emitting an ENDIF and
 * patching the branch targets of the whole construct — or, on gen4/5
 * in SPF mode, converting the IF/ELSE into IP-modifying ADDs instead.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-gen operand encoding for the ENDIF itself. */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1555
/* Emit a BREAK instruction for the innermost loop.
 *
 * On pre-gen6 the break must pop one mask-stack entry per IF currently
 * open inside the loop; gen6+ has no mask stack and the jump targets
 * are filled in later (see brw_patch_break_cont / brw_set_uip_jip).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop one mask-stack entry per open IF inside this loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1578
1579 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1580 {
1581 struct brw_instruction *insn;
1582
1583 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1584 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1585 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1586 brw_set_dest(p, insn, brw_ip_reg());
1587 brw_set_src0(p, insn, brw_ip_reg());
1588 brw_set_src1(p, insn, brw_imm_d(0x0));
1589
1590 insn->header.compression_control = BRW_COMPRESSION_NONE;
1591 insn->header.execution_size = BRW_EXECUTE_8;
1592 return insn;
1593 }
1594
/* Emit a pre-gen6 CONTINUE instruction.
 *
 * The jump count is patched by brw_patch_break_cont() when the loop's
 * WHILE is emitted.  Like BREAK, it must pop one mask-stack entry per
 * IF currently open inside the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1609
/* Emit a gen6+ HALT instruction.
 *
 * HALT stops execution of the enabled channels; src1 carries UIP/JIP,
 * which are patched later (see brw_set_uip_jip callers).
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1627
1628 /* DO/WHILE loop:
1629 *
1630 * The DO/WHILE is just an unterminated loop -- break or continue are
1631 * used for control within the loop. We have a few ways they can be
1632 * done.
1633 *
1634 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1635 * jip and no DO instruction.
1636 *
1637 * For non-uniform control flow pre-gen6, there's a DO instruction to
1638 * push the mask, and a WHILE to jump back, and BREAK to get out and
1639 * pop the mask.
1640 *
1641 * For gen6, there's no more mask stack, so no need for DO. WHILE
1642 * just points back to the first instruction of the loop.
1643 */
/* Open a DO/WHILE loop.
 *
 * On gen6+ (and in single-program-flow mode) no DO instruction exists;
 * the loop top is simply the next instruction slot, which is pushed on
 * the loop stack for brw_WHILE() to branch back to.  Pre-gen6 emits an
 * actual DO instruction that pushes the execution mask.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1671
1672 /**
1673 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1674 * instruction here.
1675 *
1676 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1677 * nesting, since it can always just point to the end of the block/current loop.
1678 */
/* Walk the instructions of the innermost loop (between its DO and the
 * given WHILE) and fill in jump counts for any BREAK/CONT that hasn't
 * been patched yet.  BREAK jumps past the WHILE (+1); CONT jumps to
 * the WHILE itself.  br is the per-instruction jump-count unit (1 on
 * gen4, 2 on gen5 — see patch_IF_ELSE).
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (brw->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1701
/* Close a DO/WHILE loop: emit the WHILE (or, in pre-gen6 SPF mode, an
 * IP-modifying ADD) that branches back to the loop top recorded by
 * brw_DO(), and patch any pending BREAK/CONT on pre-gen6.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   unsigned br = 1;

   /* Jump counts are in 64-bit chunks: 2 per instruction on gen5+. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* gen7+: backward JIP to the loop top, null operands. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      /* gen6: backward jump count in bits1. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: a scalar ADD on IP serves as the backward branch. */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Fill in jump counts for this loop's BREAK/CONT instructions. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Don't let predication leak onto subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1765
1766 /* To integrate with the above, it makes sense that the comparison
1767 * instruction should populate the flag register. It might be simpler
1768 * just to use the flag reg for most WM tasks?
1769 */
/* Emit a CMP instruction that writes per-channel results to dest and
 * sets the flag register according to the conditional modifier.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1811
1812 /* Issue 'wait' instruction for n1, host could program MMIO
1813 to wake up thread. */
/* Emit a WAIT on notification register n1.  The thread sleeps until the
 * host (via MMIO) signals the notification; dest and src0 must both be
 * n1, and the hardware requires scalar, unpredicated, uncompressed
 * execution.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1826
1827
1828 /***********************************************************************
1829 * Helpers for the various SEND message types:
1830 */
1831
1832 /** Extended math function, float[8].
1833 */
/* Emit an extended math operation.
 *
 * On gen6+ this is a native MATH instruction whose function code is
 * packed into the CondModifier/ThreadCtrl fields; msg_reg_nr, data_type
 * and precision are ignored there.  On gen4/5 it is a SEND to the
 * shared math function unit, with msg_reg_nr selecting the message
 * register.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned data_type,
	       unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer divide takes integer sources; everything else is float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1896
1897 /** Extended math function, float[8].
1898 */
1899 void brw_math2(struct brw_compile *p,
1900 struct brw_reg dest,
1901 unsigned function,
1902 struct brw_reg src0,
1903 struct brw_reg src1)
1904 {
1905 struct brw_context *brw = p->brw;
1906 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1907
1908 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1909 (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1910 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1911 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1912
1913 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1914 if (brw->gen == 6) {
1915 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1916 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1917 }
1918
1919 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1920 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1921 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1922 assert(src0.type != BRW_REGISTER_TYPE_F);
1923 assert(src1.type != BRW_REGISTER_TYPE_F);
1924 } else {
1925 assert(src0.type == BRW_REGISTER_TYPE_F);
1926 assert(src1.type == BRW_REGISTER_TYPE_F);
1927 }
1928
1929 /* Source modifiers are ignored for extended math instructions on Gen6. */
1930 if (brw->gen == 6) {
1931 assert(!src0.negate);
1932 assert(!src0.abs);
1933 assert(!src1.negate);
1934 assert(!src1.abs);
1935 }
1936
1937 /* Math is the same ISA format as other opcodes, except that CondModifier
1938 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1939 */
1940 insn->header.destreg__conditionalmod = function;
1941
1942 brw_set_dest(p, insn, dest);
1943 brw_set_src0(p, insn, src0);
1944 brw_set_src1(p, insn, src1);
1945 }
1946
1947
1948 /**
1949 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1950 * using a constant offset per channel.
1951 *
1952 * The offset must be aligned to oword size (16 bytes). Used for
1953 * register spilling.
1954 */
1955 void brw_oword_block_write_scratch(struct brw_compile *p,
1956 struct brw_reg mrf,
1957 int num_regs,
1958 unsigned offset)
1959 {
1960 struct brw_context *brw = p->brw;
1961 uint32_t msg_control, msg_type;
1962 int mlen;
1963
1964 if (brw->gen >= 6)
1965 offset /= 16;
1966
1967 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1968
1969 if (num_regs == 1) {
1970 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1971 mlen = 2;
1972 } else {
1973 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1974 mlen = 3;
1975 }
1976
1977 /* Set up the message header. This is g0, with g0.2 filled with
1978 * the offset. We don't want to leave our offset around in g0 or
1979 * it'll screw up texture samples, so set it up inside the message
1980 * reg.
1981 */
1982 {
1983 brw_push_insn_state(p);
1984 brw_set_mask_control(p, BRW_MASK_DISABLE);
1985 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1986
1987 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1988
1989 /* set message header global offset field (reg 0, element 2) */
1990 brw_MOV(p,
1991 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1992 mrf.nr,
1993 2), BRW_REGISTER_TYPE_UD),
1994 brw_imm_ud(offset));
1995
1996 brw_pop_insn_state(p);
1997 }
1998
1999 {
2000 struct brw_reg dest;
2001 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2002 int send_commit_msg;
2003 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2004 BRW_REGISTER_TYPE_UW);
2005
2006 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
2007 insn->header.compression_control = BRW_COMPRESSION_NONE;
2008 src_header = vec16(src_header);
2009 }
2010 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2011 insn->header.destreg__conditionalmod = mrf.nr;
2012
2013 /* Until gen6, writes followed by reads from the same location
2014 * are not guaranteed to be ordered unless write_commit is set.
2015 * If set, then a no-op write is issued to the destination
2016 * register to set a dependency, and a read from the destination
2017 * can be used to ensure the ordering.
2018 *
2019 * For gen6, only writes between different threads need ordering
2020 * protection. Our use of DP writes is all about register
2021 * spilling within a thread.
2022 */
2023 if (brw->gen >= 6) {
2024 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2025 send_commit_msg = 0;
2026 } else {
2027 dest = src_header;
2028 send_commit_msg = 1;
2029 }
2030
2031 brw_set_dest(p, insn, dest);
2032 if (brw->gen >= 6) {
2033 brw_set_src0(p, insn, mrf);
2034 } else {
2035 brw_set_src0(p, insn, brw_null_reg());
2036 }
2037
2038 if (brw->gen >= 6)
2039 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2040 else
2041 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2042
2043 brw_set_dp_write_message(p,
2044 insn,
2045 255, /* binding table index (255=stateless) */
2046 msg_control,
2047 msg_type,
2048 mlen,
2049 true, /* header_present */
2050 0, /* not a render target */
2051 send_commit_msg, /* response_length */
2052 0, /* eot */
2053 send_commit_msg);
2054 }
2055 }
2056
2057
2058 /**
2059 * Read a block of owords (half a GRF each) from the scratch buffer
2060 * using a constant index per channel.
2061 *
2062 * Offset must be aligned to oword size (16 bytes). Used for register
2063 * spilling.
2064 */
2065 void
2066 brw_oword_block_read_scratch(struct brw_compile *p,
2067 struct brw_reg dest,
2068 struct brw_reg mrf,
2069 int num_regs,
2070 unsigned offset)
2071 {
2072 struct brw_context *brw = p->brw;
2073 uint32_t msg_control;
2074 int rlen;
2075
2076 if (brw->gen >= 6)
2077 offset /= 16;
2078
2079 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2080 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2081
2082 if (num_regs == 1) {
2083 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2084 rlen = 1;
2085 } else {
2086 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2087 rlen = 2;
2088 }
2089
2090 {
2091 brw_push_insn_state(p);
2092 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093 brw_set_mask_control(p, BRW_MASK_DISABLE);
2094
2095 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2096
2097 /* set message header global offset field (reg 0, element 2) */
2098 brw_MOV(p,
2099 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2100 mrf.nr,
2101 2), BRW_REGISTER_TYPE_UD),
2102 brw_imm_ud(offset));
2103
2104 brw_pop_insn_state(p);
2105 }
2106
2107 {
2108 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2109
2110 assert(insn->header.predicate_control == 0);
2111 insn->header.compression_control = BRW_COMPRESSION_NONE;
2112 insn->header.destreg__conditionalmod = mrf.nr;
2113
2114 brw_set_dest(p, insn, dest); /* UW? */
2115 if (brw->gen >= 6) {
2116 brw_set_src0(p, insn, mrf);
2117 } else {
2118 brw_set_src0(p, insn, brw_null_reg());
2119 }
2120
2121 brw_set_dp_read_message(p,
2122 insn,
2123 255, /* binding table index (255=stateless) */
2124 msg_control,
2125 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2126 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2127 1, /* msg_length */
2128 true, /* header_present */
2129 rlen);
2130 }
2131 }
2132
2133 void
2134 gen7_block_read_scratch(struct brw_compile *p,
2135 struct brw_reg dest,
2136 int num_regs,
2137 unsigned offset)
2138 {
2139 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2140
2141 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2142
2143 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2144 insn->header.compression_control = BRW_COMPRESSION_NONE;
2145
2146 brw_set_dest(p, insn, dest);
2147
2148 /* The HW requires that the header is present; this is to get the g0.5
2149 * scratch offset.
2150 */
2151 bool header_present = true;
2152 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2153
2154 brw_set_message_descriptor(p, insn,
2155 GEN7_SFID_DATAPORT_DATA_CACHE,
2156 1, /* mlen: just g0 */
2157 num_regs,
2158 header_present,
2159 false);
2160
2161 insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;
2162
2163 assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
2164 insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;
2165
2166 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2167 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2168 * is 32 bytes, which happens to be the size of a register.
2169 */
2170 offset /= REG_SIZE;
2171 assert(offset < (1 << 12));
2172 insn->bits3.ud |= offset;
2173 }
2174
2175 /**
2176 * Read a float[4] vector from the data port Data Cache (const buffer).
2177 * Location (in buffer) should be a multiple of 16.
2178 * Used for fetching shader constants.
2179 */
2180 void brw_oword_block_read(struct brw_compile *p,
2181 struct brw_reg dest,
2182 struct brw_reg mrf,
2183 uint32_t offset,
2184 uint32_t bind_table_index)
2185 {
2186 struct brw_context *brw = p->brw;
2187
2188 /* On newer hardware, offset is in units of owords. */
2189 if (brw->gen >= 6)
2190 offset /= 16;
2191
2192 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2193
2194 brw_push_insn_state(p);
2195 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2196 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2197 brw_set_mask_control(p, BRW_MASK_DISABLE);
2198
2199 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2200
2201 /* set message header global offset field (reg 0, element 2) */
2202 brw_MOV(p,
2203 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2204 mrf.nr,
2205 2), BRW_REGISTER_TYPE_UD),
2206 brw_imm_ud(offset));
2207
2208 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2209 insn->header.destreg__conditionalmod = mrf.nr;
2210
2211 /* cast dest to a uword[8] vector */
2212 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2213
2214 brw_set_dest(p, insn, dest);
2215 if (brw->gen >= 6) {
2216 brw_set_src0(p, insn, mrf);
2217 } else {
2218 brw_set_src0(p, insn, brw_null_reg());
2219 }
2220
2221 brw_set_dp_read_message(p,
2222 insn,
2223 bind_table_index,
2224 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2225 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2226 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2227 1, /* msg_length */
2228 true, /* header_present */
2229 1); /* response_length (1 reg, 2 owords!) */
2230
2231 brw_pop_insn_state(p);
2232 }
2233
2234
2235 void brw_fb_WRITE(struct brw_compile *p,
2236 int dispatch_width,
2237 unsigned msg_reg_nr,
2238 struct brw_reg src0,
2239 unsigned msg_control,
2240 unsigned binding_table_index,
2241 unsigned msg_length,
2242 unsigned response_length,
2243 bool eot,
2244 bool header_present)
2245 {
2246 struct brw_context *brw = p->brw;
2247 struct brw_instruction *insn;
2248 unsigned msg_type;
2249 struct brw_reg dest;
2250
2251 if (dispatch_width == 16)
2252 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2253 else
2254 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2255
2256 if (brw->gen >= 6) {
2257 insn = next_insn(p, BRW_OPCODE_SENDC);
2258 } else {
2259 insn = next_insn(p, BRW_OPCODE_SEND);
2260 }
2261 insn->header.compression_control = BRW_COMPRESSION_NONE;
2262
2263 if (brw->gen >= 6) {
2264 /* headerless version, just submit color payload */
2265 src0 = brw_message_reg(msg_reg_nr);
2266
2267 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2268 } else {
2269 insn->header.destreg__conditionalmod = msg_reg_nr;
2270
2271 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2272 }
2273
2274 brw_set_dest(p, insn, dest);
2275 brw_set_src0(p, insn, src0);
2276 brw_set_dp_write_message(p,
2277 insn,
2278 binding_table_index,
2279 msg_control,
2280 msg_type,
2281 msg_length,
2282 header_present,
2283 eot, /* last render target write */
2284 response_length,
2285 eot,
2286 0 /* send_commit_msg */);
2287 }
2288
2289
2290 /**
2291 * Texture sample instruction.
2292 * Note: the msg_type plus msg_length values determine exactly what kind
2293 * of sampling operation is performed. See volume 4, page 161 of docs.
2294 */
2295 void brw_SAMPLE(struct brw_compile *p,
2296 struct brw_reg dest,
2297 unsigned msg_reg_nr,
2298 struct brw_reg src0,
2299 unsigned binding_table_index,
2300 unsigned sampler,
2301 unsigned msg_type,
2302 unsigned response_length,
2303 unsigned msg_length,
2304 unsigned header_present,
2305 unsigned simd_mode,
2306 unsigned return_format)
2307 {
2308 struct brw_context *brw = p->brw;
2309 struct brw_instruction *insn;
2310
2311 if (msg_reg_nr != -1)
2312 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2313
2314 insn = next_insn(p, BRW_OPCODE_SEND);
2315 insn->header.predicate_control = 0; /* XXX */
2316
2317 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2318 *
2319 * "Instruction compression is not allowed for this instruction (that
2320 * is, send). The hardware behavior is undefined if this instruction is
2321 * set as compressed. However, compress control can be set to "SecHalf"
2322 * to affect the EMask generation."
2323 *
2324 * No similar wording is found in later PRMs, but there are examples
2325 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2326 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2327 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2328 */
2329 if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
2330 insn->header.compression_control = BRW_COMPRESSION_NONE;
2331
2332 if (brw->gen < 6)
2333 insn->header.destreg__conditionalmod = msg_reg_nr;
2334
2335 brw_set_dest(p, insn, dest);
2336 brw_set_src0(p, insn, src0);
2337 brw_set_sampler_message(p, insn,
2338 binding_table_index,
2339 sampler,
2340 msg_type,
2341 response_length,
2342 msg_length,
2343 header_present,
2344 simd_mode,
2345 return_format);
2346 }
2347
2348 /* All these variables are pretty confusing - we might be better off
2349 * using bitmasks and macros for this, in the old style. Or perhaps
2350 * just having the caller instantiate the fields in dword3 itself.
2351 */
2352 void brw_urb_WRITE(struct brw_compile *p,
2353 struct brw_reg dest,
2354 unsigned msg_reg_nr,
2355 struct brw_reg src0,
2356 enum brw_urb_write_flags flags,
2357 unsigned msg_length,
2358 unsigned response_length,
2359 unsigned offset,
2360 unsigned swizzle)
2361 {
2362 struct brw_context *brw = p->brw;
2363 struct brw_instruction *insn;
2364
2365 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2366
2367 if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2368 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2369 brw_push_insn_state(p);
2370 brw_set_access_mode(p, BRW_ALIGN_1);
2371 brw_set_mask_control(p, BRW_MASK_DISABLE);
2372 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2373 BRW_REGISTER_TYPE_UD),
2374 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2375 brw_imm_ud(0xff00));
2376 brw_pop_insn_state(p);
2377 }
2378
2379 insn = next_insn(p, BRW_OPCODE_SEND);
2380
2381 assert(msg_length < BRW_MAX_MRF);
2382
2383 brw_set_dest(p, insn, dest);
2384 brw_set_src0(p, insn, src0);
2385 brw_set_src1(p, insn, brw_imm_d(0));
2386
2387 if (brw->gen < 6)
2388 insn->header.destreg__conditionalmod = msg_reg_nr;
2389
2390 brw_set_urb_message(p,
2391 insn,
2392 flags,
2393 msg_length,
2394 response_length,
2395 offset,
2396 swizzle);
2397 }
2398
2399 static int
2400 brw_find_next_block_end(struct brw_compile *p, int start_offset)
2401 {
2402 int offset;
2403 void *store = p->store;
2404
2405 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2406 offset = next_offset(store, offset)) {
2407 struct brw_instruction *insn = store + offset;
2408
2409 switch (insn->header.opcode) {
2410 case BRW_OPCODE_ENDIF:
2411 case BRW_OPCODE_ELSE:
2412 case BRW_OPCODE_WHILE:
2413 case BRW_OPCODE_HALT:
2414 return offset;
2415 }
2416 }
2417
2418 return 0;
2419 }
2420
2421 /* There is no DO instruction on gen6, so to find the end of the loop
2422 * we have to see if the loop is jumping back before our start
2423 * instruction.
2424 */
2425 static int
2426 brw_find_loop_end(struct brw_compile *p, int start_offset)
2427 {
2428 struct brw_context *brw = p->brw;
2429 int offset;
2430 int scale = 8;
2431 void *store = p->store;
2432
2433 /* Always start after the instruction (such as a WHILE) we're trying to fix
2434 * up.
2435 */
2436 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2437 offset = next_offset(store, offset)) {
2438 struct brw_instruction *insn = store + offset;
2439
2440 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2441 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2442 : insn->bits3.break_cont.jip;
2443 if (offset + jip * scale <= start_offset)
2444 return offset;
2445 }
2446 }
2447 assert(!"not reached");
2448 return start_offset;
2449 }
2450
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Branch offsets are stored in units of `scale` (8) bytes.  Gen < 6 has
 * no UIP/JIP fields, so the pass is skipped there.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8;
   void *store = p->store;

   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      /* Nearest enclosing block terminator (ENDIF/ELSE/WHILE/HALT),
       * or 0 if none was found.
       */
      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* With no following block end, jump to the next instruction. */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2527
2528 void brw_ff_sync(struct brw_compile *p,
2529 struct brw_reg dest,
2530 unsigned msg_reg_nr,
2531 struct brw_reg src0,
2532 bool allocate,
2533 unsigned response_length,
2534 bool eot)
2535 {
2536 struct brw_context *brw = p->brw;
2537 struct brw_instruction *insn;
2538
2539 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2540
2541 insn = next_insn(p, BRW_OPCODE_SEND);
2542 brw_set_dest(p, insn, dest);
2543 brw_set_src0(p, insn, src0);
2544 brw_set_src1(p, insn, brw_imm_d(0));
2545
2546 if (brw->gen < 6)
2547 insn->header.destreg__conditionalmod = msg_reg_nr;
2548
2549 brw_set_ff_sync_message(p,
2550 insn,
2551 allocate,
2552 response_length,
2553 eot);
2554 }
2555
2556 /**
2557 * Emit the SEND instruction necessary to generate stream output data on Gen6
2558 * (for transform feedback).
2559 *
2560 * If send_commit_msg is true, this is the last piece of stream output data
2561 * from this thread, so send the data as a committed write. According to the
2562 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2563 *
2564 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2565 * writes are complete by sending the final write as a committed write."
2566 */
2567 void
2568 brw_svb_write(struct brw_compile *p,
2569 struct brw_reg dest,
2570 unsigned msg_reg_nr,
2571 struct brw_reg src0,
2572 unsigned binding_table_index,
2573 bool send_commit_msg)
2574 {
2575 struct brw_instruction *insn;
2576
2577 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2578
2579 insn = next_insn(p, BRW_OPCODE_SEND);
2580 brw_set_dest(p, insn, dest);
2581 brw_set_src0(p, insn, src0);
2582 brw_set_src1(p, insn, brw_imm_d(0));
2583 brw_set_dp_write_message(p, insn,
2584 binding_table_index,
2585 0, /* msg_control: ignored */
2586 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2587 1, /* msg_length */
2588 true, /* header_present */
2589 0, /* last_render_target: ignored */
2590 send_commit_msg, /* response_length */
2591 0, /* end_of_thread */
2592 send_commit_msg); /* send_commit_msg */
2593 }
2594
2595 static void
2596 brw_set_dp_untyped_atomic_message(struct brw_compile *p,
2597 struct brw_instruction *insn,
2598 unsigned atomic_op,
2599 unsigned bind_table_index,
2600 unsigned msg_length,
2601 unsigned response_length,
2602 bool header_present)
2603 {
2604 if (p->brw->is_haswell) {
2605 brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2606 msg_length, response_length,
2607 header_present, false);
2608
2609
2610 if (insn->header.access_mode == BRW_ALIGN_1) {
2611 if (insn->header.execution_size != BRW_EXECUTE_16)
2612 insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2613
2614 insn->bits3.gen7_dp.msg_type =
2615 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2616 } else {
2617 insn->bits3.gen7_dp.msg_type =
2618 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
2619 }
2620
2621 } else {
2622 brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2623 msg_length, response_length,
2624 header_present, false);
2625
2626 insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2627
2628 if (insn->header.execution_size != BRW_EXECUTE_16)
2629 insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2630 }
2631
2632 if (response_length)
2633 insn->bits3.ud |= 1 << 13; /* Return data expected */
2634
2635 insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2636 insn->bits3.ud |= atomic_op << 8;
2637 }
2638
2639 void
2640 brw_untyped_atomic(struct brw_compile *p,
2641 struct brw_reg dest,
2642 struct brw_reg mrf,
2643 unsigned atomic_op,
2644 unsigned bind_table_index,
2645 unsigned msg_length,
2646 unsigned response_length) {
2647 struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2648
2649 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2650 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2651 brw_set_src1(p, insn, brw_imm_d(0));
2652 brw_set_dp_untyped_atomic_message(
2653 p, insn, atomic_op, bind_table_index, msg_length, response_length,
2654 insn->header.access_mode == BRW_ALIGN_1);
2655 }
2656
2657 static void
2658 brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2659 struct brw_instruction *insn,
2660 unsigned bind_table_index,
2661 unsigned msg_length,
2662 unsigned response_length,
2663 bool header_present)
2664 {
2665 const unsigned dispatch_width =
2666 (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
2667 const unsigned num_channels = response_length / (dispatch_width / 8);
2668
2669 if (p->brw->is_haswell) {
2670 brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2671 msg_length, response_length,
2672 header_present, false);
2673
2674 insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
2675 } else {
2676 brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2677 msg_length, response_length,
2678 header_present, false);
2679
2680 insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
2681 }
2682
2683 if (insn->header.access_mode == BRW_ALIGN_1) {
2684 if (dispatch_width == 16)
2685 insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
2686 else
2687 insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
2688 }
2689
2690 insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2691
2692 /* Set mask of 32-bit channels to drop. */
2693 insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
2694 }
2695
2696 void
2697 brw_untyped_surface_read(struct brw_compile *p,
2698 struct brw_reg dest,
2699 struct brw_reg mrf,
2700 unsigned bind_table_index,
2701 unsigned msg_length,
2702 unsigned response_length)
2703 {
2704 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2705
2706 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2707 brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2708 brw_set_dp_untyped_surface_read_message(
2709 p, insn, bind_table_index, msg_length, response_length,
2710 insn->header.access_mode == BRW_ALIGN_1);
2711 }
2712
2713 /**
2714 * This instruction is generated as a single-channel align1 instruction by
2715 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2716 *
2717 * We can't use the typed atomic op in the FS because that has the execution
2718 * mask ANDed with the pixel mask, but we just want to write the one dword for
2719 * all the pixels.
2720 *
2721 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2722 * one u32. So we use the same untyped atomic write message as the pixel
2723 * shader.
2724 *
2725 * The untyped atomic operation requires a BUFFER surface type with RAW
2726 * format, and is only accessible through the legacy DATA_CACHE dataport
2727 * messages.
2728 */
2729 void brw_shader_time_add(struct brw_compile *p,
2730 struct brw_reg payload,
2731 uint32_t surf_index)
2732 {
2733 struct brw_context *brw = p->brw;
2734 assert(brw->gen >= 7);
2735
2736 brw_push_insn_state(p);
2737 brw_set_access_mode(p, BRW_ALIGN_1);
2738 brw_set_mask_control(p, BRW_MASK_DISABLE);
2739 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2740 brw_pop_insn_state(p);
2741
2742 /* We use brw_vec1_reg and unmasked because we want to increment the given
2743 * offset only once.
2744 */
2745 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2746 BRW_ARF_NULL, 0));
2747 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2748 payload.nr, 0));
2749 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2750 2 /* message length */,
2751 0 /* response length */,
2752 false /* header present */);
2753 }