i965/eu: Allow 3-src float ops with doubles
[mesa.git] src/mesa/drivers/dri/i965/brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct brw_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
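
/* A minimal usage sketch (message register number hypothetical): resolving
 * a GRF payload into m2 before emitting a SEND on Gen6+, so the implied
 * move is made explicit:
 *
 *    struct brw_reg payload = brw_vec8_grf(4, 0);
 *    gen6_resolve_implied_move(p, &payload, 2);
 *    ... payload now refers to m2 and can be used as the SEND source ...
 */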

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This
    * is to enable loading of a new thread into the same slot while the
    * message with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct brw_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
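
/* For example, assuming GEN7_MRF_HACK_START sits at the r112 base implied
 * by the PRM quote above, a reference to m4 on Gen7+ becomes g116; on
 * earlier generations the register is left untouched.
 */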

/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
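
/* For example, an F-typed operand maps to BRW_HW_REG_TYPE_F in both
 * tables, while VF is only encodable as an immediate: the non-immediate
 * table marks it -1, which trips the assert above.
 */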

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though this is ignored in DA16, it still needs to be
          * programmed as '01'.
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}

extern int reg_type_size[];

static void
validate_reg(const struct brw_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
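
/* A worked example against the rules above: with exec_size 8, a region
 * like <8;8,1> satisfies rule 4 (vstride == width * hstride == 8), and a
 * scalar <0;1,0> satisfies rules 6-8 (width 1 forces hstride 0, and
 * vstride 0 with hstride 0 requires width 1).
 */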

static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is. */
   imm &= ~0xfff;

   /* We get one bit replicated through the top 20 bits. */
   return imm == 0 || imm == 0xfffff000;
}
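
/* Concretely: 0x00000123 and 0xfffff923 survive the mask-and-replicate
 * check above and can be compacted, while 0x00001234 cannot, since bit 12
 * is not replicated through the top 20 bits.
 */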

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if
       * src0 is an immediate, src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule.  E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these causes warnings from the simulator when used,
       * compacted or otherwise.  In fact, all compaction mappings that have
       * an immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped
       * meanings so it's not clear whether it has the restriction.  We'll
       * assume it was lifted on SNB.  (FINISHME: decode the GM45 tables and
       * check.)
       */
      brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen < 6) {
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      } else {
         brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values.  Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is
       * suitable, set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact that we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
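
/* To illustrate the compaction rewrites above (a sketch, not emitted
 * code): brw_set_src0(p, inst, brw_imm_f(0.0f)) leaves the immediate bits
 * at 0x0 but retypes src0 as imm:vf, and a D/D instruction with a
 * compactable immediate and no conditional modifier is retyped to UD/UD.
 */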


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert(reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact that we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead and, if set here, would
    * clobber the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
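
/* Typical usage sketch (lengths hypothetical): since this helper zeroes
 * the Function Control bits, call it before any message-specific setters:
 *
 *    brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_message_descriptor(p, send, BRW_SFID_URB,
 *                               msg_length, response_length,
 *                               true, false);
 *    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_HWORD);
 */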

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct brw_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (devinfo->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (devinfo->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}
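
/* Each native (uncompacted) Gen instruction is 16 bytes, hence the fixed
 * next_insn_offset increment.  Note that the store may be reallocated
 * here, so pointers into p->store must be re-derived after calling this
 * (see the note in brw_ENDIF below).
 */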

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/DF/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
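
/* E.g., a subnr of 8 bytes selects 32-bit component 2.  Operands of the
 * supported types are at least 4-byte aligned, so the division loses
 * nothing.
 */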

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1)                           \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                        \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}

#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
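
/* With the DF cases above, generators can emit 3-src float ops on doubles
 * as well, e.g. (a sketch; register choices hypothetical):
 *
 *    brw_MAD(p, retype(dst, BRW_REGISTER_TYPE_DF),
 *            retype(a, BRW_REGISTER_TYPE_DF),
 *            retype(b, BRW_REGISTER_TYPE_DF),
 *            retype(c, BRW_REGISTER_TYPE_DF));
 *
 * brw_alu3() then derives BRW_3SRC_TYPE_DF for both the sources and the
 * destination from dest.type.
 */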

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
              struct brw_reg dest,                                   \
              struct brw_reg src)                                    \
{                                                                    \
   const struct brw_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}
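
/* Roughly, on pre-gen6 a call like brw_RNDZ(p, dst, src) expands to
 * (a sketch in assembly-ish notation):
 *
 *    rndz.r  dst, src        (also sets the per-channel increment bits)
 *    (+f0)   add dst, dst, 1.0
 *
 * while gen6+ emits only the first instruction.
 */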


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)


brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    "Because this instruction does not have a 16-bit floating-point
       *     type, the source data type must be Word (W). The destination
       *     type must be F (Float)."
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction: */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                       : BRW_EXECUTE_8);
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms
    * prior to Gen6, flow control instructions cause an implied thread
    * switch, so this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow
    * mode (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON,
    * IP may not be updated by non-flow control instructions.").  And on
    * later platforms, there is no significant benefit to converting control
    * flow instructions to conditional ADDs.  So we only do this trick on
    * Gen4 and Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

1607 /*
1608     * A single next_insn() may change the base address of the instruction
1609     * store memory (p->store), so call it first, before deriving any
1610     * instruction pointer from an index into the store.
1611 */
1612 if (emit_endif)
1613 insn = next_insn(p, BRW_OPCODE_ENDIF);
1614
1615 /* Pop the IF and (optional) ELSE instructions from the stack */
1616 p->if_depth_in_loop[p->loop_stack_depth]--;
1617 tmp = pop_if_stack(p);
1618 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1619 else_inst = tmp;
1620 tmp = pop_if_stack(p);
1621 }
1622 if_inst = tmp;
1623
1624 if (!emit_endif) {
1625 /* ENDIF is useless; don't bother emitting it. */
1626 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1627 return;
1628 }
1629
1630 if (devinfo->gen < 6) {
1631 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1632 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1633 brw_set_src1(p, insn, brw_imm_d(0x0));
1634 } else if (devinfo->gen == 6) {
1635 brw_set_dest(p, insn, brw_imm_w(0));
1636 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1637 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1638 } else if (devinfo->gen == 7) {
1639 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1640 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1641 brw_set_src1(p, insn, brw_imm_w(0));
1642 } else {
1643 brw_set_src0(p, insn, brw_imm_d(0));
1644 }
1645
1646 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1647 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1648 if (devinfo->gen < 6)
1649 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1650
1651 /* Also pop item off the stack in the endif instruction: */
1652 if (devinfo->gen < 6) {
1653 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1654 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1655 } else if (devinfo->gen == 6) {
1656 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1657 } else {
1658 brw_inst_set_jip(devinfo, insn, 2);
1659 }
1660 patch_IF_ELSE(p, if_inst, else_inst, insn);
1661 }
1662
1663 brw_inst *
1664 brw_BREAK(struct brw_codegen *p)
1665 {
1666 const struct brw_device_info *devinfo = p->devinfo;
1667 brw_inst *insn;
1668
1669 insn = next_insn(p, BRW_OPCODE_BREAK);
1670 if (devinfo->gen >= 8) {
1671 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1672 brw_set_src0(p, insn, brw_imm_d(0x0));
1673 } else if (devinfo->gen >= 6) {
1674 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1675 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1676 brw_set_src1(p, insn, brw_imm_d(0x0));
1677 } else {
1678 brw_set_dest(p, insn, brw_ip_reg());
1679 brw_set_src0(p, insn, brw_ip_reg());
1680 brw_set_src1(p, insn, brw_imm_d(0x0));
1681 brw_inst_set_gen4_pop_count(devinfo, insn,
1682 p->if_depth_in_loop[p->loop_stack_depth]);
1683 }
1684 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1685 brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
1686 : BRW_EXECUTE_8);
1687
1688 return insn;
1689 }
1690
1691 brw_inst *
1692 brw_CONT(struct brw_codegen *p)
1693 {
1694 const struct brw_device_info *devinfo = p->devinfo;
1695 brw_inst *insn;
1696
1697 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1698 brw_set_dest(p, insn, brw_ip_reg());
1699 if (devinfo->gen >= 8) {
1700 brw_set_src0(p, insn, brw_imm_d(0x0));
1701 } else {
1702 brw_set_src0(p, insn, brw_ip_reg());
1703 brw_set_src1(p, insn, brw_imm_d(0x0));
1704 }
1705
1706 if (devinfo->gen < 6) {
1707 brw_inst_set_gen4_pop_count(devinfo, insn,
1708 p->if_depth_in_loop[p->loop_stack_depth]);
1709 }
1710 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1711 brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
1712 : BRW_EXECUTE_8);
1713 return insn;
1714 }
1715
1716 brw_inst *
1717 gen6_HALT(struct brw_codegen *p)
1718 {
1719 const struct brw_device_info *devinfo = p->devinfo;
1720 brw_inst *insn;
1721
1722 insn = next_insn(p, BRW_OPCODE_HALT);
1723 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1724 if (devinfo->gen >= 8) {
1725 brw_set_src0(p, insn, brw_imm_d(0x0));
1726 } else {
1727 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1728 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1729 }
1730
1731 if (p->compressed) {
1732 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16);
1733 } else {
1734 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1735 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8);
1736 }
1737 return insn;
1738 }
1739
1740 /* DO/WHILE loop:
1741 *
1742  * The DO/WHILE is just an unterminated loop -- BREAK or CONTINUE are
1743  * used for control within the loop.  There are a few ways this can be
1744  * done.
1745  *
1746  * For uniform control flow, the WHILE is just a jump back, so we emit
1747  * ADD ip, ip, jip and no DO instruction.
1748  *
1749  * For non-uniform control flow pre-gen6, there's a DO instruction to
1750  * push the mask, a WHILE to jump back, and BREAK to get out and
1751  * pop the mask.
1752  *
1753  * From gen6 on, there's no more mask stack, so there is no need for DO.
1754  * WHILE just points back to the first instruction of the loop.
1755 */
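/* A minimal usage sketch (illustrative only, not lifted from a caller):
 * a non-uniform loop is emitted by bracketing the body with
 * brw_DO()/brw_WHILE() and letting the later fixup passes patch the
 * jump targets:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, using brw_BREAK(p) / brw_CONT(p) as needed ...
 *    brw_WHILE(p);
 *
 * On gen6+ the WHILE's jump target is known when it is emitted, while
 * BREAK/CONT targets are filled in afterwards by brw_set_uip_jip().
 */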
1756 brw_inst *
1757 brw_DO(struct brw_codegen *p, unsigned execute_size)
1758 {
1759 const struct brw_device_info *devinfo = p->devinfo;
1760
1761 if (devinfo->gen >= 6 || p->single_program_flow) {
1762 push_loop_stack(p, &p->store[p->nr_insn]);
1763 return &p->store[p->nr_insn];
1764 } else {
1765 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1766
1767 push_loop_stack(p, insn);
1768
1769 /* Override the defaults for this instruction:
1770 */
1771 brw_set_dest(p, insn, brw_null_reg());
1772 brw_set_src0(p, insn, brw_null_reg());
1773 brw_set_src1(p, insn, brw_null_reg());
1774
1775 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1776 brw_inst_set_exec_size(devinfo, insn, execute_size);
1777 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1778
1779 return insn;
1780 }
1781 }
1782
1783 /**
1784 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1785 * instruction here.
1786 *
1787  * For gen6+, see brw_set_uip_jip(), which doesn't need to track the loop
1788  * nesting, since it can always just point to the end of the current block/loop.
1789 */
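/* As a worked example of the arithmetic below (gen4, jump scale 1): a
 * BREAK sitting two instructions before the WHILE gets jump_count
 * 1 * (2 + 1) = 3 and lands one instruction past the WHILE, while a
 * CONTINUE in the same spot gets jump_count 2 and lands on the WHILE
 * itself.
 */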
1790 static void
1791 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1792 {
1793 const struct brw_device_info *devinfo = p->devinfo;
1794 brw_inst *do_inst = get_inner_do_insn(p);
1795 brw_inst *inst;
1796 unsigned br = brw_jump_scale(devinfo);
1797
1798 assert(devinfo->gen < 6);
1799
1800 for (inst = while_inst - 1; inst != do_inst; inst--) {
1801       /* If the jump count is != 0, this instruction has already been
1802        * patched because it's part of a loop inside the one we're
1803        * patching.
1804 */
1805 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1806 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1807 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1808 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1809 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1810 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1811 }
1812 }
1813 }
1814
1815 brw_inst *
1816 brw_WHILE(struct brw_codegen *p)
1817 {
1818 const struct brw_device_info *devinfo = p->devinfo;
1819 brw_inst *insn, *do_insn;
1820 unsigned br = brw_jump_scale(devinfo);
1821
1822 if (devinfo->gen >= 6) {
1823 insn = next_insn(p, BRW_OPCODE_WHILE);
1824 do_insn = get_inner_do_insn(p);
1825
1826 if (devinfo->gen >= 8) {
1827 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1828 brw_set_src0(p, insn, brw_imm_d(0));
1829 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1830 } else if (devinfo->gen == 7) {
1831 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1832 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1833 brw_set_src1(p, insn, brw_imm_w(0));
1834 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1835 } else {
1836 brw_set_dest(p, insn, brw_imm_w(0));
1837 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1838 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1839 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1840 }
1841
1842 brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
1843 : BRW_EXECUTE_8);
1844 } else {
1845 if (p->single_program_flow) {
1846 insn = next_insn(p, BRW_OPCODE_ADD);
1847 do_insn = get_inner_do_insn(p);
1848
1849 brw_set_dest(p, insn, brw_ip_reg());
1850 brw_set_src0(p, insn, brw_ip_reg());
1851 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1852 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1853 } else {
1854 insn = next_insn(p, BRW_OPCODE_WHILE);
1855 do_insn = get_inner_do_insn(p);
1856
1857 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1858
1859 brw_set_dest(p, insn, brw_ip_reg());
1860 brw_set_src0(p, insn, brw_ip_reg());
1861 brw_set_src1(p, insn, brw_imm_d(0));
1862
1863 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1864 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1865 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1866
1867 brw_patch_break_cont(p, insn);
1868 }
1869 }
1870 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1871
1872 p->loop_stack_depth--;
1873
1874 return insn;
1875 }
1876
1877 /* FORWARD JUMPS:
1878 */
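/* A sketch of the expected calling pattern (illustrative only): emit a
 * JMPI with a zero immediate, remember its index, emit the instructions
 * to be skipped, then land the jump at the current end of the program:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ... instructions to jump over ...
 *    brw_land_fwd_jump(p, jmp);
 */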
1879 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1880 {
1881 const struct brw_device_info *devinfo = p->devinfo;
1882 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1883 unsigned jmpi = 1;
1884
1885 if (devinfo->gen >= 5)
1886 jmpi = 2;
1887
1888 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1889 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1890
1891 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1892 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1893 }
1894
1895 /* To integrate with the above, it makes sense for the comparison
1896  * instruction to populate the flag register.  It might be simpler
1897  * just to use the flag reg for most WM tasks.
1898 */
1899 void brw_CMP(struct brw_codegen *p,
1900 struct brw_reg dest,
1901 unsigned conditional,
1902 struct brw_reg src0,
1903 struct brw_reg src1)
1904 {
1905 const struct brw_device_info *devinfo = p->devinfo;
1906 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1907
1908 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1909 brw_set_dest(p, insn, dest);
1910 brw_set_src0(p, insn, src0);
1911 brw_set_src1(p, insn, src1);
1912
1913 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1914 * page says:
1915 * "Any CMP instruction with a null destination must use a {switch}."
1916 *
1917 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1918 * mentioned on their work-arounds pages.
1919 */
1920 if (devinfo->gen == 7) {
1921 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1922 dest.nr == BRW_ARF_NULL) {
1923 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1924 }
1925 }
1926 }
1927
1928 /***********************************************************************
1929 * Helpers for the various SEND message types:
1930 */
1931
1932 /** Extended math function, float[8].
1933 */
1934 void gen4_math(struct brw_codegen *p,
1935 struct brw_reg dest,
1936 unsigned function,
1937 unsigned msg_reg_nr,
1938 struct brw_reg src,
1939 unsigned precision )
1940 {
1941 const struct brw_device_info *devinfo = p->devinfo;
1942 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1943 unsigned data_type;
1944 if (has_scalar_region(src)) {
1945 data_type = BRW_MATH_DATA_SCALAR;
1946 } else {
1947 data_type = BRW_MATH_DATA_VECTOR;
1948 }
1949
1950 assert(devinfo->gen < 6);
1951
1952 /* Example code doesn't set predicate_control for send
1953 * instructions.
1954 */
1955    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1956 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1957
1958 brw_set_dest(p, insn, dest);
1959 brw_set_src0(p, insn, src);
1960 brw_set_math_message(p,
1961 insn,
1962 function,
1963 src.type == BRW_REGISTER_TYPE_D,
1964 precision,
1965 data_type);
1966 }
1967
1968 void gen6_math(struct brw_codegen *p,
1969 struct brw_reg dest,
1970 unsigned function,
1971 struct brw_reg src0,
1972 struct brw_reg src1)
1973 {
1974 const struct brw_device_info *devinfo = p->devinfo;
1975 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1976
1977 assert(devinfo->gen >= 6);
1978
1979 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1980 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1981 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
1982 (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
1983
1984 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1985 if (devinfo->gen == 6) {
1986 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1987 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1988 }
1989
1990 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1991 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1992 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1993 assert(src0.type != BRW_REGISTER_TYPE_F);
1994 assert(src1.type != BRW_REGISTER_TYPE_F);
1995 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1996 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1997 } else {
1998 assert(src0.type == BRW_REGISTER_TYPE_F);
1999 assert(src1.type == BRW_REGISTER_TYPE_F);
2000 if (function == BRW_MATH_FUNCTION_POW) {
2001 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2002 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2003 } else {
2004 assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2005 src1.nr == BRW_ARF_NULL);
2006 }
2007 }
2008
2009 /* Source modifiers are ignored for extended math instructions on Gen6. */
2010 if (devinfo->gen == 6) {
2011 assert(!src0.negate);
2012 assert(!src0.abs);
2013 assert(!src1.negate);
2014 assert(!src1.abs);
2015 }
2016
2017 brw_inst_set_math_function(devinfo, insn, function);
2018
2019 brw_set_dest(p, insn, dest);
2020 brw_set_src0(p, insn, src0);
2021 brw_set_src1(p, insn, src1);
2022 }
2023
2024 /**
2025 * Return the right surface index to access the thread scratch space using
2026 * stateless dataport messages.
2027 */
2028 unsigned
2029 brw_scratch_surface_idx(const struct brw_codegen *p)
2030 {
2031 /* The scratch space is thread-local so IA coherency is unnecessary. */
2032 if (p->devinfo->gen >= 8)
2033 return GEN8_BTI_STATELESS_NON_COHERENT;
2034 else
2035 return BRW_BTI_STATELESS;
2036 }
2037
2038 /**
2039  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2040 * using a constant offset per channel.
2041 *
2042 * The offset must be aligned to oword size (16 bytes). Used for
2043 * register spilling.
2044 */
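/* A minimal spill sketch (illustrative only): write two GRFs worth of
 * data through MRF m1 at byte offset 64 of the scratch buffer, and read
 * them back into g4 later with the matching read helper:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *    ...
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(4, 0),
 *                                 brw_message_reg(1), 2, 64);
 */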
2045 void brw_oword_block_write_scratch(struct brw_codegen *p,
2046 struct brw_reg mrf,
2047 int num_regs,
2048 unsigned offset)
2049 {
2050 const struct brw_device_info *devinfo = p->devinfo;
2051 uint32_t msg_control, msg_type;
2052 int mlen;
2053
2054 if (devinfo->gen >= 6)
2055 offset /= 16;
2056
2057 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2058
2059 if (num_regs == 1) {
2060 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2061 mlen = 2;
2062 } else {
2063 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2064 mlen = 3;
2065 }
2066
2067 /* Set up the message header. This is g0, with g0.2 filled with
2068 * the offset. We don't want to leave our offset around in g0 or
2069 * it'll screw up texture samples, so set it up inside the message
2070 * reg.
2071 */
2072 {
2073 brw_push_insn_state(p);
2074 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2075 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2076 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2077
2078 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2079
2080 /* set message header global offset field (reg 0, element 2) */
2081 brw_MOV(p,
2082 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2083 mrf.nr,
2084 2), BRW_REGISTER_TYPE_UD),
2085 brw_imm_ud(offset));
2086
2087 brw_pop_insn_state(p);
2088 }
2089
2090 {
2091 struct brw_reg dest;
2092 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2093 int send_commit_msg;
2094 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2095 BRW_REGISTER_TYPE_UW);
2096
2097 if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
2098 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2099 src_header = vec16(src_header);
2100 }
2101 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2102 if (devinfo->gen < 6)
2103 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2104
2105 /* Until gen6, writes followed by reads from the same location
2106 * are not guaranteed to be ordered unless write_commit is set.
2107 * If set, then a no-op write is issued to the destination
2108 * register to set a dependency, and a read from the destination
2109 * can be used to ensure the ordering.
2110 *
2111 * For gen6, only writes between different threads need ordering
2112 * protection. Our use of DP writes is all about register
2113 * spilling within a thread.
2114 */
2115 if (devinfo->gen >= 6) {
2116 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2117 send_commit_msg = 0;
2118 } else {
2119 dest = src_header;
2120 send_commit_msg = 1;
2121 }
2122
2123 brw_set_dest(p, insn, dest);
2124 if (devinfo->gen >= 6) {
2125 brw_set_src0(p, insn, mrf);
2126 } else {
2127 brw_set_src0(p, insn, brw_null_reg());
2128 }
2129
2130 if (devinfo->gen >= 6)
2131 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2132 else
2133 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2134
2135 brw_set_dp_write_message(p,
2136 insn,
2137 brw_scratch_surface_idx(p),
2138 msg_control,
2139 msg_type,
2140 mlen,
2141 true, /* header_present */
2142 0, /* not a render target */
2143 send_commit_msg, /* response_length */
2144 0, /* eot */
2145 send_commit_msg);
2146 }
2147 }
2148
2149
2150 /**
2151 * Read a block of owords (half a GRF each) from the scratch buffer
2152  * using a constant offset per channel.
2153  *
2154  * The offset must be aligned to oword size (16 bytes).  Used for reading
2155  * back spilled registers.
2156 */
2157 void
2158 brw_oword_block_read_scratch(struct brw_codegen *p,
2159 struct brw_reg dest,
2160 struct brw_reg mrf,
2161 int num_regs,
2162 unsigned offset)
2163 {
2164 const struct brw_device_info *devinfo = p->devinfo;
2165 uint32_t msg_control;
2166 int rlen;
2167
2168 if (devinfo->gen >= 6)
2169 offset /= 16;
2170
2171 if (p->devinfo->gen >= 7) {
2172 /* On gen 7 and above, we no longer have message registers and we can
2173 * send from any register we want. By using the destination register
2174 * for the message, we guarantee that the implied message write won't
2175 * accidentally overwrite anything. This has been a problem because
2176 * the MRF registers and source for the final FB write are both fixed
2177 * and may overlap.
2178 */
2179 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2180 } else {
2181 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2182 }
2183 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2184
2185 if (num_regs == 1) {
2186 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2187 rlen = 1;
2188 } else {
2189 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2190 rlen = 2;
2191 }
2192
2193 {
2194 brw_push_insn_state(p);
2195 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2196 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2197 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2198
2199 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2200
2201 /* set message header global offset field (reg 0, element 2) */
2202 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2203
2204 brw_pop_insn_state(p);
2205 }
2206
2207 {
2208 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2209
2210       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2211 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2212
2213 brw_set_dest(p, insn, dest); /* UW? */
2214 if (devinfo->gen >= 6) {
2215 brw_set_src0(p, insn, mrf);
2216 } else {
2217 brw_set_src0(p, insn, brw_null_reg());
2218 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2219 }
2220
2221 brw_set_dp_read_message(p,
2222 insn,
2223 brw_scratch_surface_idx(p),
2224 msg_control,
2225 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2226 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2227 1, /* msg_length */
2228 true, /* header_present */
2229 rlen);
2230 }
2231 }
2232
2233 void
2234 gen7_block_read_scratch(struct brw_codegen *p,
2235 struct brw_reg dest,
2236 int num_regs,
2237 unsigned offset)
2238 {
2239 const struct brw_device_info *devinfo = p->devinfo;
2240 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2241 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2242
2243 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2244 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2245
2246 /* The HW requires that the header is present; this is to get the g0.5
2247 * scratch offset.
2248 */
2249 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2250
2251    /* According to the docs, offset is "A 12-bit HWord offset into the
2252     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2253 * is 32 bytes, which happens to be the size of a register.
2254 */
2255 offset /= REG_SIZE;
2256 assert(offset < (1 << 12));
2257
2258 gen7_set_dp_scratch_message(p, insn,
2259 false, /* scratch read */
2260 false, /* OWords */
2261 false, /* invalidate after read */
2262 num_regs,
2263 offset,
2264 1, /* mlen: just g0 */
2265 num_regs, /* rlen */
2266 true); /* header present */
2267 }
2268
2269 /**
2270 * Read a float[4] vector from the data port Data Cache (const buffer).
2271 * Location (in buffer) should be a multiple of 16.
2272 * Used for fetching shader constants.
2273 */
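/* For example (illustrative only): fetch one register's worth of
 * constants from the buffer bound at binding table index 2, at byte
 * offset 32, into g6:
 *
 *    brw_oword_block_read(p, brw_vec8_grf(6, 0), brw_message_reg(1), 32, 2);
 */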
2274 void brw_oword_block_read(struct brw_codegen *p,
2275 struct brw_reg dest,
2276 struct brw_reg mrf,
2277 uint32_t offset,
2278 uint32_t bind_table_index)
2279 {
2280 const struct brw_device_info *devinfo = p->devinfo;
2281
2282 /* On newer hardware, offset is in units of owords. */
2283 if (devinfo->gen >= 6)
2284 offset /= 16;
2285
2286 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2287
2288 brw_push_insn_state(p);
2289 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2290 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2291 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2292 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2293
2294 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2295
2296 /* set message header global offset field (reg 0, element 2) */
2297 brw_MOV(p,
2298 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2299 mrf.nr,
2300 2), BRW_REGISTER_TYPE_UD),
2301 brw_imm_ud(offset));
2302
2303 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2304
2305 /* cast dest to a uword[8] vector */
2306 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2307
2308 brw_set_dest(p, insn, dest);
2309 if (devinfo->gen >= 6) {
2310 brw_set_src0(p, insn, mrf);
2311 } else {
2312 brw_set_src0(p, insn, brw_null_reg());
2313 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2314 }
2315
2316 brw_set_dp_read_message(p,
2317 insn,
2318 bind_table_index,
2319 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2320 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2321 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2322 1, /* msg_length */
2323 true, /* header_present */
2324 1); /* response_length (1 reg, 2 owords!) */
2325
2326 brw_pop_insn_state(p);
2327 }
2328
2329
2330 void brw_fb_WRITE(struct brw_codegen *p,
2331 int dispatch_width,
2332 struct brw_reg payload,
2333 struct brw_reg implied_header,
2334 unsigned msg_control,
2335 unsigned binding_table_index,
2336 unsigned msg_length,
2337 unsigned response_length,
2338 bool eot,
2339 bool last_render_target,
2340 bool header_present)
2341 {
2342 const struct brw_device_info *devinfo = p->devinfo;
2343 brw_inst *insn;
2344 unsigned msg_type;
2345 struct brw_reg dest, src0;
2346
2347 if (dispatch_width == 16)
2348 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2349 else
2350 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2351
2352 if (devinfo->gen >= 6) {
2353 insn = next_insn(p, BRW_OPCODE_SENDC);
2354 } else {
2355 insn = next_insn(p, BRW_OPCODE_SEND);
2356 }
2357 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2358
2359 if (devinfo->gen >= 6) {
2360 /* headerless version, just submit color payload */
2361 src0 = payload;
2362
2363 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2364 } else {
2365 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2366 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2367 src0 = implied_header;
2368
2369 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2370 }
2371
2372 brw_set_dest(p, insn, dest);
2373 brw_set_src0(p, insn, src0);
2374 brw_set_dp_write_message(p,
2375 insn,
2376 binding_table_index,
2377 msg_control,
2378 msg_type,
2379 msg_length,
2380 header_present,
2381 last_render_target,
2382 response_length,
2383 eot,
2384 0 /* send_commit_msg */);
2385 }
2386
2387
2388 /**
2389 * Texture sample instruction.
2390 * Note: the msg_type plus msg_length values determine exactly what kind
2391 * of sampling operation is performed. See volume 4, page 161 of docs.
2392 */
2393 void brw_SAMPLE(struct brw_codegen *p,
2394 struct brw_reg dest,
2395 unsigned msg_reg_nr,
2396 struct brw_reg src0,
2397 unsigned binding_table_index,
2398 unsigned sampler,
2399 unsigned msg_type,
2400 unsigned response_length,
2401 unsigned msg_length,
2402 unsigned header_present,
2403 unsigned simd_mode,
2404 unsigned return_format)
2405 {
2406 const struct brw_device_info *devinfo = p->devinfo;
2407 brw_inst *insn;
2408
2409 if (msg_reg_nr != -1)
2410 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2411
2412 insn = next_insn(p, BRW_OPCODE_SEND);
2413 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2414
2415 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2416 *
2417 * "Instruction compression is not allowed for this instruction (that
2418 * is, send). The hardware behavior is undefined if this instruction is
2419 * set as compressed. However, compress control can be set to "SecHalf"
2420 * to affect the EMask generation."
2421 *
2422 * No similar wording is found in later PRMs, but there are examples
2423 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2424 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2425 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2426 */
2427 if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
2428 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2429
2430 if (devinfo->gen < 6)
2431 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2432
2433 brw_set_dest(p, insn, dest);
2434 brw_set_src0(p, insn, src0);
2435 brw_set_sampler_message(p, insn,
2436 binding_table_index,
2437 sampler,
2438 msg_type,
2439 response_length,
2440 msg_length,
2441 header_present,
2442 simd_mode,
2443 return_format);
2444 }
2445
2446 /* Adjust the message header's sampler state pointer to
2447 * select the correct group of 16 samplers.
2448 */
2449 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2450 struct brw_reg header,
2451 struct brw_reg sampler_index)
2452 {
2453 /* The "Sampler Index" field can only store values between 0 and 15.
2454 * However, we can add an offset to the "Sampler State Pointer"
2455 * field, effectively selecting a different set of 16 samplers.
2456 *
2457 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2458     * offset, and each sampler state is only 16 bytes, so we can't
2459 * exclusively use the offset - we have to use both.
2460 */
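   /* Worked example (not from the PRM): sampler 20 selects the second
    * group of 16, so the code below adds 16 * (20 / 16) * 16 = 256 bytes
    * to the "Sampler State Pointer" in the header, and the message's
    * 4-bit index field is then assumed to address sampler 20 % 16 = 4
    * within that group.
    */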
2461
2462 const struct brw_device_info *devinfo = p->devinfo;
2463
2464 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2465 const int sampler_state_size = 16; /* 16 bytes */
2466 uint32_t sampler = sampler_index.ud;
2467
2468 if (sampler >= 16) {
2469 assert(devinfo->is_haswell || devinfo->gen >= 8);
2470 brw_ADD(p,
2471 get_element_ud(header, 3),
2472 get_element_ud(brw_vec8_grf(0, 0), 3),
2473 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2474 }
2475 } else {
2476 /* Non-const sampler array indexing case */
2477 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2478 return;
2479 }
2480
2481 struct brw_reg temp = get_element_ud(header, 3);
2482
2483 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2484 brw_SHL(p, temp, temp, brw_imm_ud(4));
2485 brw_ADD(p,
2486 get_element_ud(header, 3),
2487 get_element_ud(brw_vec8_grf(0, 0), 3),
2488 temp);
2489 }
2490 }
2491
2492 /* All these variables are pretty confusing - we might be better off
2493 * using bitmasks and macros for this, in the old style. Or perhaps
2494 * just having the caller instantiate the fields in dword3 itself.
2495 */
2496 void brw_urb_WRITE(struct brw_codegen *p,
2497 struct brw_reg dest,
2498 unsigned msg_reg_nr,
2499 struct brw_reg src0,
2500 enum brw_urb_write_flags flags,
2501 unsigned msg_length,
2502 unsigned response_length,
2503 unsigned offset,
2504 unsigned swizzle)
2505 {
2506 const struct brw_device_info *devinfo = p->devinfo;
2507 brw_inst *insn;
2508
2509 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2510
2511 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2512 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2513 brw_push_insn_state(p);
2514 brw_set_default_access_mode(p, BRW_ALIGN_1);
2515 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2516 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2517 BRW_REGISTER_TYPE_UD),
2518 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2519 brw_imm_ud(0xff00));
2520 brw_pop_insn_state(p);
2521 }
2522
2523 insn = next_insn(p, BRW_OPCODE_SEND);
2524
2525 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2526
2527 brw_set_dest(p, insn, dest);
2528 brw_set_src0(p, insn, src0);
2529 brw_set_src1(p, insn, brw_imm_d(0));
2530
2531 if (devinfo->gen < 6)
2532 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2533
2534 brw_set_urb_message(p,
2535 insn,
2536 flags,
2537 msg_length,
2538 response_length,
2539 offset,
2540 swizzle);
2541 }
2542
2543 struct brw_inst *
2544 brw_send_indirect_message(struct brw_codegen *p,
2545 unsigned sfid,
2546 struct brw_reg dst,
2547 struct brw_reg payload,
2548 struct brw_reg desc)
2549 {
2550 const struct brw_device_info *devinfo = p->devinfo;
2551 struct brw_inst *send;
2552 int setup;
2553
2554 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2555
2556 assert(desc.type == BRW_REGISTER_TYPE_UD);
2557
2558 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2559     * in the indirect case) by its index in the instruction store.  The
2560 * pointer returned by next_insn() may become invalid if emitting the SEND
2561 * in the indirect case reallocs the store.
2562 */
2563
2564 if (desc.file == BRW_IMMEDIATE_VALUE) {
2565 setup = p->nr_insn;
2566 send = next_insn(p, BRW_OPCODE_SEND);
2567 brw_set_src1(p, send, desc);
2568
2569 } else {
2570 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2571
2572 brw_push_insn_state(p);
2573 brw_set_default_access_mode(p, BRW_ALIGN_1);
2574 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2575 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2576
2577 /* Load the indirect descriptor to an address register using OR so the
2578 * caller can specify additional descriptor bits with the usual
2579 * brw_set_*_message() helper functions.
2580 */
2581 setup = p->nr_insn;
2582 brw_OR(p, addr, desc, brw_imm_ud(0));
2583
2584 brw_pop_insn_state(p);
2585
2586 send = next_insn(p, BRW_OPCODE_SEND);
2587 brw_set_src1(p, send, addr);
2588 }
2589
2590 if (dst.width < BRW_EXECUTE_8)
2591 brw_inst_set_exec_size(devinfo, send, dst.width);
2592
2593 brw_set_dest(p, send, dst);
2594 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2595 brw_inst_set_sfid(devinfo, send, sfid);
2596
2597 return &p->store[setup];
2598 }
2599
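/* Emit a SEND performing a surface access through the shared function given
 * by sfid.  If the surface index lives in a register rather than an
 * immediate, it is masked and loaded into an address register first.
 * Returns the setup instruction so the caller can set message-specific
 * descriptor bits on it.
 */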
2600 static struct brw_inst *
2601 brw_send_indirect_surface_message(struct brw_codegen *p,
2602 unsigned sfid,
2603 struct brw_reg dst,
2604 struct brw_reg payload,
2605 struct brw_reg surface,
2606 unsigned message_len,
2607 unsigned response_len,
2608 bool header_present)
2609 {
2610 const struct brw_device_info *devinfo = p->devinfo;
2611 struct brw_inst *insn;
2612
2613 if (surface.file != BRW_IMMEDIATE_VALUE) {
2614 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2615
2616 brw_push_insn_state(p);
2617 brw_set_default_access_mode(p, BRW_ALIGN_1);
2618 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2619 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2620
2621 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2622 * some surface array is accessed out of bounds.
2623 */
2624 insn = brw_AND(p, addr,
2625 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2626 BRW_GET_SWZ(surface.swizzle, 0)),
2627 brw_imm_ud(0xff));
2628
2629 brw_pop_insn_state(p);
2630
2631 surface = addr;
2632 }
2633
2634 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2635 brw_inst_set_mlen(devinfo, insn, message_len);
2636 brw_inst_set_rlen(devinfo, insn, response_len);
2637 brw_inst_set_header_present(devinfo, insn, header_present);
2638
2639 return insn;
2640 }
2641
2642 static int
2643 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2644 {
2645 int offset;
2646 void *store = p->store;
2647 const struct brw_device_info *devinfo = p->devinfo;
2648
2649 int depth = 0;
2650
2651 for (offset = next_offset(devinfo, store, start_offset);
2652 offset < p->next_insn_offset;
2653 offset = next_offset(devinfo, store, offset)) {
2654 brw_inst *insn = store + offset;
2655
2656 switch (brw_inst_opcode(devinfo, insn)) {
2657 case BRW_OPCODE_IF:
2658 depth++;
2659 break;
2660 case BRW_OPCODE_ENDIF:
2661 if (depth == 0)
2662 return offset;
2663 depth--;
2664 break;
2665 case BRW_OPCODE_ELSE:
2666 case BRW_OPCODE_WHILE:
2667 case BRW_OPCODE_HALT:
2668 if (depth == 0)
2669 return offset;
2670 }
2671 }
2672
2673 return 0;
2674 }
2675
2676 /* There is no DO instruction on gen6, so to find the end of the loop
2677 * we have to see if the loop is jumping back before our start
2678 * instruction.
2679 */
2680 static int
2681 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2682 {
2683 const struct brw_device_info *devinfo = p->devinfo;
2684 int offset;
2685 int scale = 16 / brw_jump_scale(devinfo);
2686 void *store = p->store;
2687
2688 assert(devinfo->gen >= 6);
2689
2690 /* Always start after the instruction (such as a WHILE) we're trying to fix
2691 * up.
2692 */
2693 for (offset = next_offset(devinfo, store, start_offset);
2694 offset < p->next_insn_offset;
2695 offset = next_offset(devinfo, store, offset)) {
2696 brw_inst *insn = store + offset;
2697
2698 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2699 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2700 : brw_inst_jip(devinfo, insn);
2701 if (offset + jip * scale <= start_offset)
2702 return offset;
2703 }
2704 }
2705 assert(!"not reached");
2706 return start_offset;
2707 }
2708
2709 /* After program generation, go back and update the UIP and JIP of
2710 * BREAK, CONT, and HALT instructions to their correct locations.
2711 */
2712 void
2713 brw_set_uip_jip(struct brw_codegen *p)
2714 {
2715 const struct brw_device_info *devinfo = p->devinfo;
2716 int offset;
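   /* brw_jump_scale() is the number of jump-field units per 16-byte
    * instruction, so dividing a byte distance by scale converts it into
    * the units the JIP/UIP fields expect.
    */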
2717 int br = brw_jump_scale(devinfo);
2718 int scale = 16 / br;
2719 void *store = p->store;
2720
2721 if (devinfo->gen < 6)
2722 return;
2723
2724 for (offset = 0; offset < p->next_insn_offset;
2725 offset = next_offset(devinfo, store, offset)) {
2726 brw_inst *insn = store + offset;
2727
2728 if (brw_inst_cmpt_control(devinfo, insn)) {
2729 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2730 assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
2731 brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
2732 brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
2733 continue;
2734 }
2735
2736 int block_end_offset = brw_find_next_block_end(p, offset);
2737 switch (brw_inst_opcode(devinfo, insn)) {
2738 case BRW_OPCODE_BREAK:
2739 assert(block_end_offset != 0);
2740 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2741 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2742 brw_inst_set_uip(devinfo, insn,
2743 (brw_find_loop_end(p, offset) - offset +
2744 (devinfo->gen == 6 ? 16 : 0)) / scale);
2745 break;
2746 case BRW_OPCODE_CONTINUE:
2747 assert(block_end_offset != 0);
2748 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2749 brw_inst_set_uip(devinfo, insn,
2750 (brw_find_loop_end(p, offset) - offset) / scale);
2751
2752 assert(brw_inst_uip(devinfo, insn) != 0);
2753 assert(brw_inst_jip(devinfo, insn) != 0);
2754 break;
2755
2756 case BRW_OPCODE_ENDIF: {
2757 int32_t jump = (block_end_offset == 0) ?
2758 1 * br : (block_end_offset - offset) / scale;
2759 if (devinfo->gen >= 7)
2760 brw_inst_set_jip(devinfo, insn, jump);
2761 else
2762 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2763 break;
2764 }
2765
2766 case BRW_OPCODE_HALT:
2767 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2768 *
2769 * "In case of the halt instruction not inside any conditional
2770 * code block, the value of <JIP> and <UIP> should be the
2771 * same. In case of the halt instruction inside conditional code
2772 * block, the <UIP> should be the end of the program, and the
2773 * <JIP> should be end of the most inner conditional code block."
2774 *
2775 * The uip will have already been set by whoever set up the
2776 * instruction.
2777 */
2778 if (block_end_offset == 0) {
2779 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2780 } else {
2781 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2782 }
2783 assert(brw_inst_uip(devinfo, insn) != 0);
2784 assert(brw_inst_jip(devinfo, insn) != 0);
2785 break;
2786 }
2787 }
2788 }
2789
2790 void brw_ff_sync(struct brw_codegen *p,
2791 struct brw_reg dest,
2792 unsigned msg_reg_nr,
2793 struct brw_reg src0,
2794 bool allocate,
2795 unsigned response_length,
2796 bool eot)
2797 {
2798 const struct brw_device_info *devinfo = p->devinfo;
2799 brw_inst *insn;
2800
2801 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2802
2803 insn = next_insn(p, BRW_OPCODE_SEND);
2804 brw_set_dest(p, insn, dest);
2805 brw_set_src0(p, insn, src0);
2806 brw_set_src1(p, insn, brw_imm_d(0));
2807
2808 if (devinfo->gen < 6)
2809 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2810
2811 brw_set_ff_sync_message(p,
2812 insn,
2813 allocate,
2814 response_length,
2815 eot);
2816 }
2817
2818 /**
2819 * Emit the SEND instruction necessary to generate stream output data on Gen6
2820 * (for transform feedback).
2821 *
2822 * If send_commit_msg is true, this is the last piece of stream output data
2823 * from this thread, so send the data as a committed write. According to the
2824 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2825 *
2826 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2827 * writes are complete by sending the final write as a committed write."
2828 */
2829 void
2830 brw_svb_write(struct brw_codegen *p,
2831 struct brw_reg dest,
2832 unsigned msg_reg_nr,
2833 struct brw_reg src0,
2834 unsigned binding_table_index,
2835 bool send_commit_msg)
2836 {
2837 brw_inst *insn;
2838
2839 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2840
2841 insn = next_insn(p, BRW_OPCODE_SEND);
2842 brw_set_dest(p, insn, dest);
2843 brw_set_src0(p, insn, src0);
2844 brw_set_src1(p, insn, brw_imm_d(0));
2845 brw_set_dp_write_message(p, insn,
2846 binding_table_index,
2847 0, /* msg_control: ignored */
2848 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2849 1, /* msg_length */
2850 true, /* header_present */
2851 0, /* last_render_target: ignored */
2852 send_commit_msg, /* response_length */
2853 0, /* end_of_thread */
2854 send_commit_msg); /* send_commit_msg */
2855 }
2856
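/* Response length in registers for a surface read or atomic: a single
 * register in Align16 SIMD4x2 mode, one register per channel in SIMD8,
 * and two registers per channel in SIMD16 (compressed) dispatch.
 */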
2857 static unsigned
2858 brw_surface_payload_size(struct brw_codegen *p,
2859 unsigned num_channels,
2860 bool has_simd4x2,
2861 bool has_simd16)
2862 {
2863 if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2864 return 1;
2865 else if (has_simd16 && p->compressed)
2866 return 2 * num_channels;
2867 else
2868 return num_channels;
2869 }
2870
2871 static void
2872 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2873 brw_inst *insn,
2874 unsigned atomic_op,
2875 bool response_expected)
2876 {
2877 const struct brw_device_info *devinfo = p->devinfo;
2878 unsigned msg_control =
2879 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2880 (response_expected ? 1 << 5 : 0); /* Return data expected */
2881
2882 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2883 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2884 if (!p->compressed)
2885 msg_control |= 1 << 4; /* SIMD8 mode */
2886
2887 brw_inst_set_dp_msg_type(devinfo, insn,
2888 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2889 } else {
2890 brw_inst_set_dp_msg_type(devinfo, insn,
2891 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2892 }
2893 } else {
2894 brw_inst_set_dp_msg_type(devinfo, insn,
2895 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2896
2897 if (!p->compressed)
2898 msg_control |= 1 << 4; /* SIMD8 mode */
2899 }
2900
2901 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2902 }
2903
2904 void
2905 brw_untyped_atomic(struct brw_codegen *p,
2906 struct brw_reg dst,
2907 struct brw_reg payload,
2908 struct brw_reg surface,
2909 unsigned atomic_op,
2910 unsigned msg_length,
2911 bool response_expected)
2912 {
2913 const struct brw_device_info *devinfo = p->devinfo;
2914 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2915 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2916 GEN7_SFID_DATAPORT_DATA_CACHE);
2917 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2918 /* Mask out unused components -- This is especially important in Align16
2919 * mode on generations that don't have native support for SIMD4x2 atomics,
2920 * because unused but enabled components will cause the dataport to perform
2921 * additional atomic operations on the addresses that happen to be in the
2922 * uninitialized Y, Z and W coordinates of the payload.
2923 */
2924 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2925 struct brw_inst *insn = brw_send_indirect_surface_message(
2926 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2927 brw_surface_payload_size(p, response_expected,
2928 devinfo->gen >= 8 || devinfo->is_haswell, true),
2929 align1);
2930
2931 brw_set_dp_untyped_atomic_message(
2932 p, insn, atomic_op, response_expected);
2933 }
2934
2935 static void
2936 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2937 struct brw_inst *insn,
2938 unsigned num_channels)
2939 {
2940 const struct brw_device_info *devinfo = p->devinfo;
2941 /* Set mask of 32-bit channels to drop. */
2942 unsigned msg_control = 0xf & (0xf << num_channels);
2943
2944 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2945 if (p->compressed)
2946 msg_control |= 1 << 4; /* SIMD16 mode */
2947 else
2948 msg_control |= 2 << 4; /* SIMD8 mode */
2949 }
2950
2951 brw_inst_set_dp_msg_type(devinfo, insn,
2952 (devinfo->gen >= 8 || devinfo->is_haswell ?
2953 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2954 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2955 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2956 }
2957
2958 void
2959 brw_untyped_surface_read(struct brw_codegen *p,
2960 struct brw_reg dst,
2961 struct brw_reg payload,
2962 struct brw_reg surface,
2963 unsigned msg_length,
2964 unsigned num_channels)
2965 {
2966 const struct brw_device_info *devinfo = p->devinfo;
2967 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2968 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2969 GEN7_SFID_DATAPORT_DATA_CACHE);
2970 struct brw_inst *insn = brw_send_indirect_surface_message(
2971 p, sfid, dst, payload, surface, msg_length,
2972 brw_surface_payload_size(p, num_channels, true, true),
2973 false);
2974
2975 brw_set_dp_untyped_surface_read_message(
2976 p, insn, num_channels);
2977 }
2978
2979 static void
2980 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2981 struct brw_inst *insn,
2982 unsigned num_channels)
2983 {
2984 const struct brw_device_info *devinfo = p->devinfo;
2985 /* Set mask of 32-bit channels to drop. */
2986 unsigned msg_control = 0xf & (0xf << num_channels);
2987
2988 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2989 if (p->compressed)
2990 msg_control |= 1 << 4; /* SIMD16 mode */
2991 else
2992 msg_control |= 2 << 4; /* SIMD8 mode */
2993 } else {
2994 if (devinfo->gen >= 8 || devinfo->is_haswell)
2995 msg_control |= 0 << 4; /* SIMD4x2 mode */
2996 else
2997 msg_control |= 2 << 4; /* SIMD8 mode */
2998 }
2999
3000 brw_inst_set_dp_msg_type(devinfo, insn,
3001 devinfo->gen >= 8 || devinfo->is_haswell ?
3002 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3003 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3004 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3005 }
3006
3007 void
3008 brw_untyped_surface_write(struct brw_codegen *p,
3009 struct brw_reg payload,
3010 struct brw_reg surface,
3011 unsigned msg_length,
3012 unsigned num_channels)
3013 {
3014 const struct brw_device_info *devinfo = p->devinfo;
3015 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3016 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3017 GEN7_SFID_DATAPORT_DATA_CACHE);
3018 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3019 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3020 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3021 WRITEMASK_X : WRITEMASK_XYZW;
3022 struct brw_inst *insn = brw_send_indirect_surface_message(
3023 p, sfid, brw_writemask(brw_null_reg(), mask),
3024 payload, surface, msg_length, 0, align1);
3025
3026 brw_set_dp_untyped_surface_write_message(
3027 p, insn, num_channels);
3028 }
3029
3030 static void
3031 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3032 struct brw_inst *insn,
3033 unsigned atomic_op,
3034 bool response_expected)
3035 {
3036 const struct brw_device_info *devinfo = p->devinfo;
3037 unsigned msg_control =
3038 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3039 (response_expected ? 1 << 5 : 0); /* Return data expected */
3040
3041 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3042 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3043 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3044 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3045
3046 brw_inst_set_dp_msg_type(devinfo, insn,
3047 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3048 } else {
3049 brw_inst_set_dp_msg_type(devinfo, insn,
3050 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3051 }
3052
3053 } else {
3054 brw_inst_set_dp_msg_type(devinfo, insn,
3055 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3056
3057 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3058 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3059 }
3060
3061 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3062 }
3063
3064 void
3065 brw_typed_atomic(struct brw_codegen *p,
3066 struct brw_reg dst,
3067 struct brw_reg payload,
3068 struct brw_reg surface,
3069 unsigned atomic_op,
3070 unsigned msg_length,
3071 bool response_expected) {
3072 const struct brw_device_info *devinfo = p->devinfo;
3073 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3074 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3075 GEN6_SFID_DATAPORT_RENDER_CACHE);
3076 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3077 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3078 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3079 struct brw_inst *insn = brw_send_indirect_surface_message(
3080 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3081 brw_surface_payload_size(p, response_expected,
3082 devinfo->gen >= 8 || devinfo->is_haswell, false),
3083 true);
3084
3085 brw_set_dp_typed_atomic_message(
3086 p, insn, atomic_op, response_expected);
3087 }
3088
3089 static void
3090 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3091 struct brw_inst *insn,
3092 unsigned num_channels)
3093 {
3094 const struct brw_device_info *devinfo = p->devinfo;
3095 /* Set mask of unused channels. */
3096 unsigned msg_control = 0xf & (0xf << num_channels);
3097
3098 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3099 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3100 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3101 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3102 else
3103 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3104 }
3105
3106 brw_inst_set_dp_msg_type(devinfo, insn,
3107 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3108 } else {
3109 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3110 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3111 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3112 }
3113
3114 brw_inst_set_dp_msg_type(devinfo, insn,
3115 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3116 }
3117
3118 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3119 }
3120
3121 void
3122 brw_typed_surface_read(struct brw_codegen *p,
3123 struct brw_reg dst,
3124 struct brw_reg payload,
3125 struct brw_reg surface,
3126 unsigned msg_length,
3127 unsigned num_channels)
3128 {
3129 const struct brw_device_info *devinfo = p->devinfo;
3130 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3131 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3132 GEN6_SFID_DATAPORT_RENDER_CACHE);
3133 struct brw_inst *insn = brw_send_indirect_surface_message(
3134 p, sfid, dst, payload, surface, msg_length,
3135 brw_surface_payload_size(p, num_channels,
3136 devinfo->gen >= 8 || devinfo->is_haswell, false),
3137 true);
3138
3139 brw_set_dp_typed_surface_read_message(
3140 p, insn, num_channels);
3141 }
3142
3143 static void
3144 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3145 struct brw_inst *insn,
3146 unsigned num_channels)
3147 {
3148 const struct brw_device_info *devinfo = p->devinfo;
3149 /* Set mask of unused channels. */
3150 unsigned msg_control = 0xf & (0xf << num_channels);
3151
3152 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3153 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3154 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3155 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3156 else
3157 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3158 }
3159
3160 brw_inst_set_dp_msg_type(devinfo, insn,
3161 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3162
3163 } else {
3164 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3165 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3166 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3167 }
3168
3169 brw_inst_set_dp_msg_type(devinfo, insn,
3170 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3171 }
3172
3173 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3174 }
3175
3176 void
3177 brw_typed_surface_write(struct brw_codegen *p,
3178 struct brw_reg payload,
3179 struct brw_reg surface,
3180 unsigned msg_length,
3181 unsigned num_channels)
3182 {
3183 const struct brw_device_info *devinfo = p->devinfo;
3184 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3185 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3186 GEN6_SFID_DATAPORT_RENDER_CACHE);
3187 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3188 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3189 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3190 WRITEMASK_X : WRITEMASK_XYZW);
3191 struct brw_inst *insn = brw_send_indirect_surface_message(
3192 p, sfid, brw_writemask(brw_null_reg(), mask),
3193 payload, surface, msg_length, 0, true);
3194
3195 brw_set_dp_typed_surface_write_message(
3196 p, insn, num_channels);
3197 }
3198
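/* Set up the descriptor for a memory fence message targeting the given
 * shared function (render cache or data cache).  With commit_enable set,
 * the fence writes back a one-register response once the flush completes,
 * which callers can consume to stall until that point.
 */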
3199 static void
3200 brw_set_memory_fence_message(struct brw_codegen *p,
3201 struct brw_inst *insn,
3202 enum brw_message_target sfid,
3203 bool commit_enable)
3204 {
3205 const struct brw_device_info *devinfo = p->devinfo;
3206
3207 brw_set_message_descriptor(p, insn, sfid,
3208 1 /* message length */,
3209 (commit_enable ? 1 : 0) /* response length */,
3210 true /* header present */,
3211 false);
3212
3213 switch (sfid) {
3214 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3215 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3216 break;
3217 case GEN7_SFID_DATAPORT_DATA_CACHE:
3218 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3219 break;
3220 default:
3221 unreachable("Not reached");
3222 }
3223
3224 if (commit_enable)
3225 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3226 }
3227
3228 void
3229 brw_memory_fence(struct brw_codegen *p,
3230 struct brw_reg dst)
3231 {
3232 const struct brw_device_info *devinfo = p->devinfo;
3233 const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
3234 struct brw_inst *insn;
3235
3236 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3237 * message doesn't write anything back.
3238 */
3239 insn = next_insn(p, BRW_OPCODE_SEND);
3240 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3241 brw_set_dest(p, insn, dst);
3242 brw_set_src0(p, insn, dst);
3243 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3244 commit_enable);
3245
3246 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3247 /* IVB does typed surface access through the render cache, so we need to
3248 * flush it too. Use a different register so both flushes can be
3249 * pipelined by the hardware.
3250 */
3251 insn = next_insn(p, BRW_OPCODE_SEND);
3252 brw_set_dest(p, insn, offset(dst, 1));
3253 brw_set_src0(p, insn, offset(dst, 1));
3254 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3255 commit_enable);
3256
3257 /* Now write the response of the second message into the response of the
3258 * first to trigger a pipeline stall -- This way future render and data
3259 * cache messages will be properly ordered with respect to past data and
3260 * render cache messages.
3261 */
3262 brw_push_insn_state(p);
3263 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3264 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3265 brw_MOV(p, dst, offset(dst, 1));
3266 brw_pop_insn_state(p);
3267 }
3268 }
3269
3270 void
3271 brw_pixel_interpolator_query(struct brw_codegen *p,
3272 struct brw_reg dest,
3273 struct brw_reg mrf,
3274 bool noperspective,
3275 unsigned mode,
3276 struct brw_reg data,
3277 unsigned msg_length,
3278 unsigned response_length)
3279 {
3280 const struct brw_device_info *devinfo = p->devinfo;
3281 struct brw_inst *insn;
3282 const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3283
3284 /* brw_send_indirect_message will automatically use a direct send message
3285 * if data is actually immediate.
3286 */
3287 insn = brw_send_indirect_message(p,
3288 GEN7_SFID_PIXEL_INTERPOLATOR,
3289 dest,
3290 mrf,
3291 vec1(data));
3292 brw_inst_set_mlen(devinfo, insn, msg_length);
3293 brw_inst_set_rlen(devinfo, insn, response_length);
3294
3295 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3296 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3297 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3298 brw_inst_set_pi_message_type(devinfo, insn, mode);
3299 }

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just
          * find the first bit set in the mask register.  The same register
          * exists on HSW already but it reads back as all ones when the
          * current instruction has execution masking disabled, so it's
          * kind of useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value
          * of this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
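         /* A worked example (hypothetical mask): if only channels 3 and 5
          * are live, every enabled channel passes the Z test on 0 == 0, so
          * f1.0 ends up as 0b101000 and the FBL below returns 3.
          */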
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
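         /* On Gen8 the negate modifier on a logical operation acts as a
          * bitwise NOT, so if mask bit 0 is set (channel 0 live) dst.x
          * becomes ~mask & 1 == 0, and 1 otherwise.
          */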
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination first with execution masking disabled
          * and then with it enabled, to find out which of the channels is
          * active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial: the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
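      /* The <0,1,0> region replicates the single align1 component at
       * subregister offset i; in align16, a <0,4,1> region starting at
       * component 4*i replicates the whole four-component channel.
       */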
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
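         /* For example (hypothetical source region): a packed float has
          * type_sz == 4 and an encoded hstride of 1, giving a shift of
          * log2(4) + 1 - 1 == 2, i.e. a byte offset of idx << 2.
          */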

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
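         /* E.g. (hypothetical values) with src in g20, offset is
          * 20 * 32 == 640 >= 512, so 512 gets added into the address
          * register here and the indirect MOV below uses the remaining
          * immediate 640 % 512 == 128.
          */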
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we just want to
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the
    * given offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
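
/* A hypothetical call site (the names are illustrative, not from this
 * file): the generator assembles the buffer offset and the cycle delta
 * into a two-register payload starting at g1 and then emits:
 *
 *    brw_shader_time_add(p, brw_vec8_grf(1, 0), shader_time_surf_index);
 */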

/**
 * Emit the SEND message for a barrier.
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
}

/**
 * Emit the wait instruction for a barrier.
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
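
/* A minimal usage sketch (hypothetical, assuming a compute-shader thread
 * whose barrier header has already been copied into m1 -- building that
 * header is outside the scope of this file):
 *
 *    brw_barrier(p, brw_message_reg(1));   // signal the gateway
 *    brw_WAIT(p);                          // sleep until n0 is notified
 *
 * The WAIT blocks the thread on the notification register until the
 * gateway signals it, i.e. once every thread in the group has reached the
 * barrier.
 */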