/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}

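/**
 * Gen7+ hardware has no MRF registers; message payloads live at the top of
 * the GRF file instead.  Rewrite any MRF reference into the corresponding
 * high GRF so the rest of the emitter can keep pretending MRFs exist.
 */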
static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}

/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q);
      return hw_types[type];
   }
}

/**
 * Return the element size given a hardware register type and file.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
                        unsigned type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const unsigned imm_hw_sizes[] = {
         [BRW_HW_REG_TYPE_UD]      = 4,
         [BRW_HW_REG_TYPE_D]       = 4,
         [BRW_HW_REG_TYPE_UW]      = 2,
         [BRW_HW_REG_TYPE_W]       = 2,
         [BRW_HW_REG_IMM_TYPE_UV]  = 2,
         [BRW_HW_REG_IMM_TYPE_VF]  = 4,
         [BRW_HW_REG_IMM_TYPE_V]   = 2,
         [BRW_HW_REG_TYPE_F]       = 4,
         [GEN8_HW_REG_TYPE_UQ]     = 8,
         [GEN8_HW_REG_TYPE_Q]      = 8,
         [GEN8_HW_REG_IMM_TYPE_DF] = 8,
         [GEN8_HW_REG_IMM_TYPE_HF] = 2,
      };
      assert(type < ARRAY_SIZE(imm_hw_sizes));
      assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV);
      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
      return imm_hw_sizes[type];
   } else {
      /* Non-immediate registers */
      static const unsigned hw_sizes[] = {
         [BRW_HW_REG_TYPE_UD]          = 4,
         [BRW_HW_REG_TYPE_D]           = 4,
         [BRW_HW_REG_TYPE_UW]          = 2,
         [BRW_HW_REG_TYPE_W]           = 2,
         [BRW_HW_REG_NON_IMM_TYPE_UB]  = 1,
         [BRW_HW_REG_NON_IMM_TYPE_B]   = 1,
         [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8,
         [BRW_HW_REG_TYPE_F]           = 4,
         [GEN8_HW_REG_TYPE_UQ]         = 8,
         [GEN8_HW_REG_TYPE_Q]          = 8,
         [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2,
      };
      assert(type < ARRAY_SIZE(hw_sizes));
      assert(devinfo->gen >= 7 ||
             (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F));
      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
      return hw_sizes[type];
   }
}

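/**
 * Encode \p dest as the destination operand of \p inst, shrinking the
 * instruction's execution size to match registers narrower than the
 * current default.
 */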
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          * Although Dst.HorzStride is a don't care for Align16, HW needs
          * this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though the stride is ignored in DA16 mode, it still needs
          * to be programmed as '01'.
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}

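/**
 * Sanity-check a source register region against the restrictions from the
 * PRM's Section 3.3.10 ("Register Region Restrictions"); the numbered
 * comments below refer to the rules in that section.
 */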
static void
validate_reg(const struct gen_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         unsigned UNUSED elem_size = brw_element_size(devinfo, inst, dst);
         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
                elem_size == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}

static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is. */
   imm &= ~0xfff;

   /* We get one bit replicated through the top 20 bits. */
   return imm == 0 || imm == 0xfffff000;
}

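/**
 * Encode \p reg as the first source operand of \p inst.  Immediates are
 * stored here too, with some type fix-ups so that the instruction remains
 * eligible for compaction.
 */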
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule.  E.g., from the IVB/HSW table:
       *
       *   DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *         3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *   DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *         8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise.  In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction.  We'll assume it was
       * lifted on SNB.  (FINISHME: decode the GM45 tables and check.)
       *
       * Don't do any of this for 64-bit immediates, since the src1 fields
       * overlap with the immediate and setting them would overwrite the
       * immediate we set.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         if (devinfo->gen < 6) {
            brw_inst_set_src1_reg_type(devinfo, inst,
                                       brw_inst_src0_reg_type(devinfo, inst));
         } else {
            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         }
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values.  Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F &&
          brw_inst_dst_reg_type(devinfo, inst) != GEN7_HW_REG_NON_IMM_TYPE_DF) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact that we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}


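/**
 * Encode \p reg as the second source operand of \p inst.  Note that src1
 * cannot be an MRF or an accumulator, cannot use indirect addressing, and
 * only accepts 32-bit immediates.
 */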
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be an immediate in two-argument instructions. */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact that we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC and, if set here, would clobber
    * the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

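/**
 * Set up the message descriptor for a dataport write.  On Gen6+,
 * \p target_cache doubles as the SFID; earlier parts always send through
 * the shared DATAPORT_WRITE SFID.
 */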
void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned target_cache,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_WRITE);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_READ);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

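/**
 * Set up a Gen7+ scratch block read/write message through the data cache
 * SFID.  The block size is encoded as log2(num_regs) on Gen8+ and as
 * num_regs - 1 on Gen7.
 */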
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

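/**
 * Reserve space for a new instruction at the end of the store, doubling the
 * store if it is full.  The new instruction is initialized from the current
 * default state (p->current) before its opcode is set.
 */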
#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                  \
void brw_##OP(struct brw_codegen *p,                               \
              struct brw_reg dest,                                 \
              struct brw_reg src)                                  \
{                                                                  \
   const struct gen_device_info *devinfo = p->devinfo;             \
   brw_inst *rnd, *add;                                            \
   rnd = next_insn(p, BRW_OPCODE_##OP);                            \
   brw_set_dest(p, rnd, dest);                                     \
   brw_set_src0(p, rnd, src);                                      \
                                                                   \
   if (devinfo->gen < 6) {                                         \
      /* turn on round-increments */                               \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));               \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                               \
}


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)


brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

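/* The IF/ELSE and loop helpers below remember open control-flow blocks as
 * indices into p->store rather than as pointers, since the store may be
 * reallocated by next_insn() while a block is still open.
 */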
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

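/**
 * Emit an ELSE instruction.  Jump targets are left as zero here and are
 * filled in later by brw_ENDIF() via patch_IF_ELSE().
 */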
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

1633 void
1634 brw_ENDIF(struct brw_codegen *p)
1635 {
1636 const struct gen_device_info *devinfo = p->devinfo;
1637 brw_inst *insn = NULL;
1638 brw_inst *else_inst = NULL;
1639 brw_inst *if_inst = NULL;
1640 brw_inst *tmp;
1641 bool emit_endif = true;
1642
1643 /* In single program flow mode, we can express IF and ELSE instructions
1644 * equivalently as ADD instructions that operate on IP. On platforms prior
1645 * to Gen6, flow control instructions cause an implied thread switch, so
1646 * this is a significant savings.
1647 *
1648 * However, on Gen6, writing to IP doesn't work in single program flow mode
1649 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1650 * not be updated by non-flow control instructions."). And on later
1651 * platforms, there is no significant benefit to converting control flow
1652 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1653 * Gen5.
1654 */
1655 if (devinfo->gen < 6 && p->single_program_flow)
1656 emit_endif = false;
1657
1658 /*
1659  * A single next_insn() call may change the base address of the
1660  * instruction store (p->store), so call it first, before taking any
1661  * pointer into the store from an instruction index.
1662 */
1663 if (emit_endif)
1664 insn = next_insn(p, BRW_OPCODE_ENDIF);
1665
1666 /* Pop the IF and (optional) ELSE instructions from the stack */
1667 p->if_depth_in_loop[p->loop_stack_depth]--;
1668 tmp = pop_if_stack(p);
1669 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1670 else_inst = tmp;
1671 tmp = pop_if_stack(p);
1672 }
1673 if_inst = tmp;
1674
1675 if (!emit_endif) {
1676 /* ENDIF is useless; don't bother emitting it. */
1677 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1678 return;
1679 }
1680
1681 if (devinfo->gen < 6) {
1682 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1683 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1684 brw_set_src1(p, insn, brw_imm_d(0x0));
1685 } else if (devinfo->gen == 6) {
1686 brw_set_dest(p, insn, brw_imm_w(0));
1687 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1688 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1689 } else if (devinfo->gen == 7) {
1690 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1691 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1692 brw_set_src1(p, insn, brw_imm_w(0));
1693 } else {
1694 brw_set_src0(p, insn, brw_imm_d(0));
1695 }
1696
1697 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1698 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1699 if (devinfo->gen < 6)
1700 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1701
1702    /* Also pop one item off the stack as part of the ENDIF instruction: */
1703 if (devinfo->gen < 6) {
1704 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1705 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1706 } else if (devinfo->gen == 6) {
1707 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1708 } else {
1709 brw_inst_set_jip(devinfo, insn, 2);
1710 }
1711 patch_IF_ELSE(p, if_inst, else_inst, insn);
1712 }
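
/* A minimal emission sketch (hypothetical caller fragment, assuming p is an
 * initialized brw_codegen context):
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ... "then" block ...
 *    brw_ELSE(p);
 *    ... "else" block ...
 *    brw_ENDIF(p);   <- pops the if-stack and patches IF/ELSE jump targets
 */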
1713
1714 brw_inst *
1715 brw_BREAK(struct brw_codegen *p)
1716 {
1717 const struct gen_device_info *devinfo = p->devinfo;
1718 brw_inst *insn;
1719
1720 insn = next_insn(p, BRW_OPCODE_BREAK);
1721 if (devinfo->gen >= 8) {
1722 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1723 brw_set_src0(p, insn, brw_imm_d(0x0));
1724 } else if (devinfo->gen >= 6) {
1725 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1726 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1727 brw_set_src1(p, insn, brw_imm_d(0x0));
1728 } else {
1729 brw_set_dest(p, insn, brw_ip_reg());
1730 brw_set_src0(p, insn, brw_ip_reg());
1731 brw_set_src1(p, insn, brw_imm_d(0x0));
1732 brw_inst_set_gen4_pop_count(devinfo, insn,
1733 p->if_depth_in_loop[p->loop_stack_depth]);
1734 }
1735 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1736 brw_inst_set_exec_size(devinfo, insn,
1737 brw_inst_exec_size(devinfo, p->current));
1738
1739 return insn;
1740 }
1741
1742 brw_inst *
1743 brw_CONT(struct brw_codegen *p)
1744 {
1745 const struct gen_device_info *devinfo = p->devinfo;
1746 brw_inst *insn;
1747
1748 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1749 brw_set_dest(p, insn, brw_ip_reg());
1750 if (devinfo->gen >= 8) {
1751 brw_set_src0(p, insn, brw_imm_d(0x0));
1752 } else {
1753 brw_set_src0(p, insn, brw_ip_reg());
1754 brw_set_src1(p, insn, brw_imm_d(0x0));
1755 }
1756
1757 if (devinfo->gen < 6) {
1758 brw_inst_set_gen4_pop_count(devinfo, insn,
1759 p->if_depth_in_loop[p->loop_stack_depth]);
1760 }
1761 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1762 brw_inst_set_exec_size(devinfo, insn,
1763 brw_inst_exec_size(devinfo, p->current));
1764 return insn;
1765 }
1766
1767 brw_inst *
1768 gen6_HALT(struct brw_codegen *p)
1769 {
1770 const struct gen_device_info *devinfo = p->devinfo;
1771 brw_inst *insn;
1772
1773 insn = next_insn(p, BRW_OPCODE_HALT);
1774 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1775 if (devinfo->gen >= 8) {
1776 brw_set_src0(p, insn, brw_imm_d(0x0));
1777 } else {
1778 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1779 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1780 }
1781
1782 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1783 brw_inst_set_exec_size(devinfo, insn,
1784 brw_inst_exec_size(devinfo, p->current));
1785 return insn;
1786 }
1787
1788 /* DO/WHILE loop:
1789 *
1790 * The DO/WHILE is just an unterminated loop -- break or continue are
1791  * used for control within the loop.  There are a few ways the loop
1792  * can be implemented.
1793 *
1794 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1795 * jip and no DO instruction.
1796 *
1797 * For non-uniform control flow pre-gen6, there's a DO instruction to
1798 * push the mask, and a WHILE to jump back, and BREAK to get out and
1799 * pop the mask.
1800 *
1801 * For gen6, there's no more mask stack, so no need for DO. WHILE
1802 * just points back to the first instruction of the loop.
1803 */
1804 brw_inst *
1805 brw_DO(struct brw_codegen *p, unsigned execute_size)
1806 {
1807 const struct gen_device_info *devinfo = p->devinfo;
1808
1809 if (devinfo->gen >= 6 || p->single_program_flow) {
1810 push_loop_stack(p, &p->store[p->nr_insn]);
1811 return &p->store[p->nr_insn];
1812 } else {
1813 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1814
1815 push_loop_stack(p, insn);
1816
1817 /* Override the defaults for this instruction:
1818 */
1819 brw_set_dest(p, insn, brw_null_reg());
1820 brw_set_src0(p, insn, brw_null_reg());
1821 brw_set_src1(p, insn, brw_null_reg());
1822
1823 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1824 brw_inst_set_exec_size(devinfo, insn, execute_size);
1825 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1826
1827 return insn;
1828 }
1829 }
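
/* A minimal loop-emission sketch (hypothetical caller fragment):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ... loop body, with brw_BREAK(p) / brw_CONT(p) where needed ...
 *    brw_WHILE(p);
 *
 * On gen6+ brw_DO() emits no instruction; it only records the loop start
 * that brw_WHILE() later jumps back to.
 */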
1830
1831 /**
1832 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1833 * instruction here.
1834 *
1835 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1836 * nesting, since it can always just point to the end of the block/current loop.
1837 */
1838 static void
1839 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1840 {
1841 const struct gen_device_info *devinfo = p->devinfo;
1842 brw_inst *do_inst = get_inner_do_insn(p);
1843 brw_inst *inst;
1844 unsigned br = brw_jump_scale(devinfo);
1845
1846 assert(devinfo->gen < 6);
1847
1848 for (inst = while_inst - 1; inst != do_inst; inst--) {
1849       /* If the jump count is != 0, this instruction has already been
1850        * patched because it's part of a loop inside the one we're
1851 * patching.
1852 */
1853 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1854 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1855 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1856 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1857 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1858 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1859 }
1860 }
1861 }
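
/* E.g. on gen5 (br = 2), a BREAK at hypothetical index 20 inside a loop
 * whose WHILE lands at index 24 gets jump count 2 * ((24 - 20) + 1) = 10,
 * pointing just past the WHILE; a CONTINUE at the same spot would get
 * 2 * (24 - 20) = 8, pointing at the WHILE itself.
 */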
1862
1863 brw_inst *
1864 brw_WHILE(struct brw_codegen *p)
1865 {
1866 const struct gen_device_info *devinfo = p->devinfo;
1867 brw_inst *insn, *do_insn;
1868 unsigned br = brw_jump_scale(devinfo);
1869
1870 if (devinfo->gen >= 6) {
1871 insn = next_insn(p, BRW_OPCODE_WHILE);
1872 do_insn = get_inner_do_insn(p);
1873
1874 if (devinfo->gen >= 8) {
1875 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1876 brw_set_src0(p, insn, brw_imm_d(0));
1877 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1878 } else if (devinfo->gen == 7) {
1879 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1880 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1881 brw_set_src1(p, insn, brw_imm_w(0));
1882 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1883 } else {
1884 brw_set_dest(p, insn, brw_imm_w(0));
1885 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1886 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1887 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1888 }
1889
1890 brw_inst_set_exec_size(devinfo, insn,
1891 brw_inst_exec_size(devinfo, p->current));
1892
1893 } else {
1894 if (p->single_program_flow) {
1895 insn = next_insn(p, BRW_OPCODE_ADD);
1896 do_insn = get_inner_do_insn(p);
1897
1898 brw_set_dest(p, insn, brw_ip_reg());
1899 brw_set_src0(p, insn, brw_ip_reg());
1900 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1901 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1902 } else {
1903 insn = next_insn(p, BRW_OPCODE_WHILE);
1904 do_insn = get_inner_do_insn(p);
1905
1906 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1907
1908 brw_set_dest(p, insn, brw_ip_reg());
1909 brw_set_src0(p, insn, brw_ip_reg());
1910 brw_set_src1(p, insn, brw_imm_d(0));
1911
1912 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1913 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1914 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1915
1916 brw_patch_break_cont(p, insn);
1917 }
1918 }
1919 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1920
1921 p->loop_stack_depth--;
1922
1923 return insn;
1924 }
1925
1926 /* FORWARD JUMPS:
1927 */
1928 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1929 {
1930 const struct gen_device_info *devinfo = p->devinfo;
1931 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1932 unsigned jmpi = 1;
1933
1934 if (devinfo->gen >= 5)
1935 jmpi = 2;
1936
1937 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1938 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1939
1940 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1941 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1942 }
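
/* Usage sketch (hypothetical caller fragment): remember the index of the
 * JMPI when emitting it, then patch it once the landing point is known:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ... instructions to be skipped ...
 *    brw_land_fwd_jump(p, jmp);
 */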
1943
1944 /* To integrate with the above, it makes sense that the comparison
1945 * instruction should populate the flag register. It might be simpler
1946 * just to use the flag reg for most WM tasks?
1947 */
1948 void brw_CMP(struct brw_codegen *p,
1949 struct brw_reg dest,
1950 unsigned conditional,
1951 struct brw_reg src0,
1952 struct brw_reg src1)
1953 {
1954 const struct gen_device_info *devinfo = p->devinfo;
1955 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1956
1957 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1958 brw_set_dest(p, insn, dest);
1959 brw_set_src0(p, insn, src0);
1960 brw_set_src1(p, insn, src1);
1961
1962 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1963 * page says:
1964 * "Any CMP instruction with a null destination must use a {switch}."
1965 *
1966 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1967 * mentioned on their work-arounds pages.
1968 */
1969 if (devinfo->gen == 7) {
1970 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1971 dest.nr == BRW_ARF_NULL) {
1972 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1973 }
1974 }
1975 }
1976
1977 /***********************************************************************
1978 * Helpers for the various SEND message types:
1979 */
1980
1981 /** Extended math function, float[8].
1982 */
1983 void gen4_math(struct brw_codegen *p,
1984 struct brw_reg dest,
1985 unsigned function,
1986 unsigned msg_reg_nr,
1987 struct brw_reg src,
1988 unsigned precision )
1989 {
1990 const struct gen_device_info *devinfo = p->devinfo;
1991 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1992 unsigned data_type;
1993 if (has_scalar_region(src)) {
1994 data_type = BRW_MATH_DATA_SCALAR;
1995 } else {
1996 data_type = BRW_MATH_DATA_VECTOR;
1997 }
1998
1999 assert(devinfo->gen < 6);
2000
2001 /* Example code doesn't set predicate_control for send
2002     * instructions, so explicitly clear it here.
2003 */
2004 brw_inst_set_pred_control(devinfo, insn, 0);
2005 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2006
2007 brw_set_dest(p, insn, dest);
2008 brw_set_src0(p, insn, src);
2009 brw_set_math_message(p,
2010 insn,
2011 function,
2012 src.type == BRW_REGISTER_TYPE_D,
2013 precision,
2014 data_type);
2015 }
2016
2017 void gen6_math(struct brw_codegen *p,
2018 struct brw_reg dest,
2019 unsigned function,
2020 struct brw_reg src0,
2021 struct brw_reg src1)
2022 {
2023 const struct gen_device_info *devinfo = p->devinfo;
2024 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2025
2026 assert(devinfo->gen >= 6);
2027
2028 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2029 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2030
2031 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2032 if (devinfo->gen == 6) {
2033 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2034 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2035 }
2036
2037 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2038 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2039 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2040 assert(src0.type != BRW_REGISTER_TYPE_F);
2041 assert(src1.type != BRW_REGISTER_TYPE_F);
2042 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2043 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2044 } else {
2045 assert(src0.type == BRW_REGISTER_TYPE_F);
2046 assert(src1.type == BRW_REGISTER_TYPE_F);
2047 }
2048
2049 /* Source modifiers are ignored for extended math instructions on Gen6. */
2050 if (devinfo->gen == 6) {
2051 assert(!src0.negate);
2052 assert(!src0.abs);
2053 assert(!src1.negate);
2054 assert(!src1.abs);
2055 }
2056
2057 brw_inst_set_math_function(devinfo, insn, function);
2058
2059 brw_set_dest(p, insn, dest);
2060 brw_set_src0(p, insn, src0);
2061 brw_set_src1(p, insn, src1);
2062 }
2063
2064 /**
2065 * Return the right surface index to access the thread scratch space using
2066 * stateless dataport messages.
2067 */
2068 unsigned
2069 brw_scratch_surface_idx(const struct brw_codegen *p)
2070 {
2071 /* The scratch space is thread-local so IA coherency is unnecessary. */
2072 if (p->devinfo->gen >= 8)
2073 return GEN8_BTI_STATELESS_NON_COHERENT;
2074 else
2075 return BRW_BTI_STATELESS;
2076 }
2077
2078 /**
2079  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2080 * using a constant offset per channel.
2081 *
2082 * The offset must be aligned to oword size (16 bytes). Used for
2083 * register spilling.
2084 */
2085 void brw_oword_block_write_scratch(struct brw_codegen *p,
2086 struct brw_reg mrf,
2087 int num_regs,
2088 unsigned offset)
2089 {
2090 const struct gen_device_info *devinfo = p->devinfo;
2091 const unsigned target_cache =
2092 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2093 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2094 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2095 uint32_t msg_type;
2096
2097 if (devinfo->gen >= 6)
2098 offset /= 16;
2099
2100 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2101
2102 const unsigned mlen = 1 + num_regs;
2103
2104 /* Set up the message header. This is g0, with g0.2 filled with
2105 * the offset. We don't want to leave our offset around in g0 or
2106 * it'll screw up texture samples, so set it up inside the message
2107 * reg.
2108 */
2109 {
2110 brw_push_insn_state(p);
2111 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2112 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2113 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2114
2115 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2116
2117 /* set message header global offset field (reg 0, element 2) */
2118 brw_MOV(p,
2119 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2120 mrf.nr,
2121 2), BRW_REGISTER_TYPE_UD),
2122 brw_imm_ud(offset));
2123
2124 brw_pop_insn_state(p);
2125 }
2126
2127 {
2128 struct brw_reg dest;
2129 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2130 int send_commit_msg;
2131 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2132 BRW_REGISTER_TYPE_UW);
2133
2134 brw_inst_set_compression(devinfo, insn, false);
2135
2136 if (brw_inst_exec_size(devinfo, insn) >= 16)
2137 src_header = vec16(src_header);
2138
2139 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2140 if (devinfo->gen < 6)
2141 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2142
2143 /* Until gen6, writes followed by reads from the same location
2144 * are not guaranteed to be ordered unless write_commit is set.
2145 * If set, then a no-op write is issued to the destination
2146 * register to set a dependency, and a read from the destination
2147 * can be used to ensure the ordering.
2148 *
2149 * For gen6, only writes between different threads need ordering
2150 * protection. Our use of DP writes is all about register
2151 * spilling within a thread.
2152 */
2153 if (devinfo->gen >= 6) {
2154 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2155 send_commit_msg = 0;
2156 } else {
2157 dest = src_header;
2158 send_commit_msg = 1;
2159 }
2160
2161 brw_set_dest(p, insn, dest);
2162 if (devinfo->gen >= 6) {
2163 brw_set_src0(p, insn, mrf);
2164 } else {
2165 brw_set_src0(p, insn, brw_null_reg());
2166 }
2167
2168 if (devinfo->gen >= 6)
2169 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2170 else
2171 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2172
2173 brw_set_dp_write_message(p,
2174 insn,
2175 brw_scratch_surface_idx(p),
2176 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2177 msg_type,
2178 target_cache,
2179 mlen,
2180 true, /* header_present */
2181 0, /* not a render target */
2182 send_commit_msg, /* response_length */
2183 0, /* eot */
2184 send_commit_msg);
2185 }
2186 }
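
/* Worked example (gen6+, hypothetical values): spilling num_regs = 2 GRFs
 * at byte offset 64 sends mlen = 1 + 2 = 3 registers (header plus payload)
 * and stores 64 / 16 = 4 as the oword offset in the message header.
 */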
2187
2188
2189 /**
2190 * Read a block of owords (half a GRF each) from the scratch buffer
2191 * using a constant index per channel.
2192 *
2193 * Offset must be aligned to oword size (16 bytes). Used for register
2194 * spilling.
2195 */
2196 void
2197 brw_oword_block_read_scratch(struct brw_codegen *p,
2198 struct brw_reg dest,
2199 struct brw_reg mrf,
2200 int num_regs,
2201 unsigned offset)
2202 {
2203 const struct gen_device_info *devinfo = p->devinfo;
2204
2205 if (devinfo->gen >= 6)
2206 offset /= 16;
2207
2208 if (p->devinfo->gen >= 7) {
2209    /* On gen7 and above, we no longer have message registers, and we can
2210 * send from any register we want. By using the destination register
2211 * for the message, we guarantee that the implied message write won't
2212 * accidentally overwrite anything. This has been a problem because
2213 * the MRF registers and source for the final FB write are both fixed
2214 * and may overlap.
2215 */
2216 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2217 } else {
2218 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2219 }
2220 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2221
2222 const unsigned rlen = num_regs;
2223 const unsigned target_cache =
2224 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2225 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2226 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2227
2228 {
2229 brw_push_insn_state(p);
2230 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2231 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2232 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2233
2234 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2235
2236 /* set message header global offset field (reg 0, element 2) */
2237 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2238
2239 brw_pop_insn_state(p);
2240 }
2241
2242 {
2243 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2244
2245 assert(brw_inst_pred_control(devinfo, insn) == 0);
2246 brw_inst_set_compression(devinfo, insn, false);
2247
2248 brw_set_dest(p, insn, dest); /* UW? */
2249 if (devinfo->gen >= 6) {
2250 brw_set_src0(p, insn, mrf);
2251 } else {
2252 brw_set_src0(p, insn, brw_null_reg());
2253 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2254 }
2255
2256 brw_set_dp_read_message(p,
2257 insn,
2258 brw_scratch_surface_idx(p),
2259 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2260 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2261 target_cache,
2262 1, /* msg_length */
2263 true, /* header_present */
2264 rlen);
2265 }
2266 }
2267
2268 void
2269 gen7_block_read_scratch(struct brw_codegen *p,
2270 struct brw_reg dest,
2271 int num_regs,
2272 unsigned offset)
2273 {
2274 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2275 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2276
2277 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2278
2279 /* The HW requires that the header is present; this is to get the g0.5
2280 * scratch offset.
2281 */
2282 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2283
2284 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2285 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2286 * is 32 bytes, which happens to be the size of a register.
2287 */
2288 offset /= REG_SIZE;
2289 assert(offset < (1 << 12));
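   /* For example, a byte offset of 4096 becomes HWord offset
    * 4096 / 32 = 128, well within the 12-bit limit.
    */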
2290
2291 gen7_set_dp_scratch_message(p, insn,
2292 false, /* scratch read */
2293 false, /* OWords */
2294 false, /* invalidate after read */
2295 num_regs,
2296 offset,
2297 1, /* mlen: just g0 */
2298 num_regs, /* rlen */
2299 true); /* header present */
2300 }
2301
2302 /**
2303 * Read float[4] vectors from the data port constant cache.
2304 * Location (in buffer) should be a multiple of 16.
2305 * Used for fetching shader constants.
2306 */
2307 void brw_oword_block_read(struct brw_codegen *p,
2308 struct brw_reg dest,
2309 struct brw_reg mrf,
2310 uint32_t offset,
2311 uint32_t bind_table_index)
2312 {
2313 const struct gen_device_info *devinfo = p->devinfo;
2314 const unsigned target_cache =
2315 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2316 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2317 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
2318
2319 /* On newer hardware, offset is in units of owords. */
2320 if (devinfo->gen >= 6)
2321 offset /= 16;
2322
2323 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2324
2325 brw_push_insn_state(p);
2326 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2327 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2328 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2329
2330 brw_push_insn_state(p);
2331 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2332 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2333
2334 /* set message header global offset field (reg 0, element 2) */
2335 brw_MOV(p,
2336 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2337 mrf.nr,
2338 2), BRW_REGISTER_TYPE_UD),
2339 brw_imm_ud(offset));
2340 brw_pop_insn_state(p);
2341
2342 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2343
2344 /* cast dest to a uword[8] vector */
2345 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2346
2347 brw_set_dest(p, insn, dest);
2348 if (devinfo->gen >= 6) {
2349 brw_set_src0(p, insn, mrf);
2350 } else {
2351 brw_set_src0(p, insn, brw_null_reg());
2352 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2353 }
2354
2355 brw_set_dp_read_message(p, insn, bind_table_index,
2356 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2357 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2358 target_cache,
2359 1, /* msg_length */
2360 true, /* header_present */
2361 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2362
2363 brw_pop_insn_state(p);
2364 }
2365
2366
2367 void brw_fb_WRITE(struct brw_codegen *p,
2368 struct brw_reg payload,
2369 struct brw_reg implied_header,
2370 unsigned msg_control,
2371 unsigned binding_table_index,
2372 unsigned msg_length,
2373 unsigned response_length,
2374 bool eot,
2375 bool last_render_target,
2376 bool header_present)
2377 {
2378 const struct gen_device_info *devinfo = p->devinfo;
2379 const unsigned target_cache =
2380 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2381 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2382 brw_inst *insn;
2383 unsigned msg_type;
2384 struct brw_reg dest, src0;
2385
2386 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2387 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2388 else
2389 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2390
2391 if (devinfo->gen >= 6) {
2392 insn = next_insn(p, BRW_OPCODE_SENDC);
2393 } else {
2394 insn = next_insn(p, BRW_OPCODE_SEND);
2395 }
2396 brw_inst_set_compression(devinfo, insn, false);
2397
2398 if (devinfo->gen >= 6) {
2399 /* headerless version, just submit color payload */
2400 src0 = payload;
2401
2402 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2403 } else {
2404 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2405 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2406 src0 = implied_header;
2407
2408 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2409 }
2410
2411 brw_set_dest(p, insn, dest);
2412 brw_set_src0(p, insn, src0);
2413 brw_set_dp_write_message(p,
2414 insn,
2415 binding_table_index,
2416 msg_control,
2417 msg_type,
2418 target_cache,
2419 msg_length,
2420 header_present,
2421 last_render_target,
2422 response_length,
2423 eot,
2424 0 /* send_commit_msg */);
2425 }
2426
2427 brw_inst *
2428 gen9_fb_READ(struct brw_codegen *p,
2429 struct brw_reg dst,
2430 struct brw_reg payload,
2431 unsigned binding_table_index,
2432 unsigned msg_length,
2433 unsigned response_length,
2434 bool per_sample)
2435 {
2436 const struct gen_device_info *devinfo = p->devinfo;
2437 assert(devinfo->gen >= 9);
2438 const unsigned msg_subtype =
2439 brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
2440 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2441
2442 brw_set_dest(p, insn, dst);
2443 brw_set_src0(p, insn, payload);
2444 brw_set_dp_read_message(p, insn, binding_table_index,
2445 per_sample << 5 | msg_subtype,
2446 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2447 GEN6_SFID_DATAPORT_RENDER_CACHE,
2448 msg_length, true /* header_present */,
2449 response_length);
2450 brw_inst_set_rt_slot_group(devinfo, insn,
2451 brw_inst_qtr_control(devinfo, p->current) / 2);
2452
2453 return insn;
2454 }
2455
2456 /**
2457 * Texture sample instruction.
2458 * Note: the msg_type plus msg_length values determine exactly what kind
2459  * of sampling operation is performed.  See volume 4, page 161 of the docs.
2460 */
2461 void brw_SAMPLE(struct brw_codegen *p,
2462 struct brw_reg dest,
2463 unsigned msg_reg_nr,
2464 struct brw_reg src0,
2465 unsigned binding_table_index,
2466 unsigned sampler,
2467 unsigned msg_type,
2468 unsigned response_length,
2469 unsigned msg_length,
2470 unsigned header_present,
2471 unsigned simd_mode,
2472 unsigned return_format)
2473 {
2474 const struct gen_device_info *devinfo = p->devinfo;
2475 brw_inst *insn;
2476
2477 if (msg_reg_nr != -1)
2478 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2479
2480 insn = next_insn(p, BRW_OPCODE_SEND);
2481 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2482
2483 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2484 *
2485 * "Instruction compression is not allowed for this instruction (that
2486 * is, send). The hardware behavior is undefined if this instruction is
2487 * set as compressed. However, compress control can be set to "SecHalf"
2488 * to affect the EMask generation."
2489 *
2490 * No similar wording is found in later PRMs, but there are examples
2491 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2492 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2493 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2494 */
2495 brw_inst_set_compression(devinfo, insn, false);
2496
2497 if (devinfo->gen < 6)
2498 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2499
2500 brw_set_dest(p, insn, dest);
2501 brw_set_src0(p, insn, src0);
2502 brw_set_sampler_message(p, insn,
2503 binding_table_index,
2504 sampler,
2505 msg_type,
2506 response_length,
2507 msg_length,
2508 header_present,
2509 simd_mode,
2510 return_format);
2511 }
2512
2513 /* Adjust the message header's sampler state pointer to
2514 * select the correct group of 16 samplers.
2515 */
2516 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2517 struct brw_reg header,
2518 struct brw_reg sampler_index)
2519 {
2520 /* The "Sampler Index" field can only store values between 0 and 15.
2521 * However, we can add an offset to the "Sampler State Pointer"
2522 * field, effectively selecting a different set of 16 samplers.
2523 *
2524 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2525     * offset, and each sampler state is only 16 bytes, so we can't
2526     * rely on the offset alone; we have to use both.
2527 */
2528
2529 const struct gen_device_info *devinfo = p->devinfo;
2530
2531 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2532 const int sampler_state_size = 16; /* 16 bytes */
2533 uint32_t sampler = sampler_index.ud;
2534
2535 if (sampler >= 16) {
2536 assert(devinfo->is_haswell || devinfo->gen >= 8);
2537 brw_ADD(p,
2538 get_element_ud(header, 3),
2539 get_element_ud(brw_vec8_grf(0, 0), 3),
2540 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2541 }
2542 } else {
2543 /* Non-const sampler array indexing case */
2544 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2545 return;
2546 }
2547
2548 struct brw_reg temp = get_element_ud(header, 3);
2549
2550 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2551 brw_SHL(p, temp, temp, brw_imm_ud(4));
2552 brw_ADD(p,
2553 get_element_ud(header, 3),
2554 get_element_ud(brw_vec8_grf(0, 0), 3),
2555 temp);
2556 }
2557 }
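
/* Worked example (hypothetical sampler index 20): the immediate path adds
 * 16 * (20 / 16) * 16 = 256 bytes to the Sampler State Pointer, selecting
 * samplers 16-31.  The indirect path computes the same value:
 * (20 & 0xf0) << 4 = 16 << 4 = 256.
 */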
2558
2559 /* All these variables are pretty confusing - we might be better off
2560 * using bitmasks and macros for this, in the old style. Or perhaps
2561 * just having the caller instantiate the fields in dword3 itself.
2562 */
2563 void brw_urb_WRITE(struct brw_codegen *p,
2564 struct brw_reg dest,
2565 unsigned msg_reg_nr,
2566 struct brw_reg src0,
2567 enum brw_urb_write_flags flags,
2568 unsigned msg_length,
2569 unsigned response_length,
2570 unsigned offset,
2571 unsigned swizzle)
2572 {
2573 const struct gen_device_info *devinfo = p->devinfo;
2574 brw_inst *insn;
2575
2576 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2577
2578 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2579 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2580 brw_push_insn_state(p);
2581 brw_set_default_access_mode(p, BRW_ALIGN_1);
2582 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2583 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2584 BRW_REGISTER_TYPE_UD),
2585 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2586 brw_imm_ud(0xff00));
2587 brw_pop_insn_state(p);
2588 }
2589
2590 insn = next_insn(p, BRW_OPCODE_SEND);
2591
2592 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2593
2594 brw_set_dest(p, insn, dest);
2595 brw_set_src0(p, insn, src0);
2596 brw_set_src1(p, insn, brw_imm_d(0));
2597
2598 if (devinfo->gen < 6)
2599 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2600
2601 brw_set_urb_message(p,
2602 insn,
2603 flags,
2604 msg_length,
2605 response_length,
2606 offset,
2607 swizzle);
2608 }
2609
2610 struct brw_inst *
2611 brw_send_indirect_message(struct brw_codegen *p,
2612 unsigned sfid,
2613 struct brw_reg dst,
2614 struct brw_reg payload,
2615 struct brw_reg desc)
2616 {
2617 const struct gen_device_info *devinfo = p->devinfo;
2618 struct brw_inst *send;
2619 int setup;
2620
2621 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2622
2623 assert(desc.type == BRW_REGISTER_TYPE_UD);
2624
2625 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2626 * in the indirect case) by its index in the instruction store. The
2627 * pointer returned by next_insn() may become invalid if emitting the SEND
2628 * in the indirect case reallocs the store.
2629 */
2630
2631 if (desc.file == BRW_IMMEDIATE_VALUE) {
2632 setup = p->nr_insn;
2633 send = next_insn(p, BRW_OPCODE_SEND);
2634 brw_set_src1(p, send, desc);
2635
2636 } else {
2637 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2638
2639 brw_push_insn_state(p);
2640 brw_set_default_access_mode(p, BRW_ALIGN_1);
2641 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2642 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2643
2644 /* Load the indirect descriptor to an address register using OR so the
2645 * caller can specify additional descriptor bits with the usual
2646 * brw_set_*_message() helper functions.
2647 */
2648 setup = p->nr_insn;
2649 brw_OR(p, addr, desc, brw_imm_ud(0));
2650
2651 brw_pop_insn_state(p);
2652
2653 send = next_insn(p, BRW_OPCODE_SEND);
2654 brw_set_src1(p, send, addr);
2655 }
2656
2657 if (dst.width < BRW_EXECUTE_8)
2658 brw_inst_set_exec_size(devinfo, send, dst.width);
2659
2660 brw_set_dest(p, send, dst);
2661 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2662 brw_inst_set_sfid(devinfo, send, sfid);
2663
2664 return &p->store[setup];
2665 }
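
/* Usage sketch (hypothetical caller fragment): emit the send, then OR the
 * remaining descriptor bits in through the returned setup instruction:
 *
 *    struct brw_inst *insn =
 *       brw_send_indirect_message(p, sfid, dst, payload, desc);
 *    brw_inst_set_mlen(p->devinfo, insn, mlen);
 *    brw_inst_set_rlen(p->devinfo, insn, rlen);
 *
 * This mirrors what brw_send_indirect_surface_message() does below.
 */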
2666
2667 static struct brw_inst *
2668 brw_send_indirect_surface_message(struct brw_codegen *p,
2669 unsigned sfid,
2670 struct brw_reg dst,
2671 struct brw_reg payload,
2672 struct brw_reg surface,
2673 unsigned message_len,
2674 unsigned response_len,
2675 bool header_present)
2676 {
2677 const struct gen_device_info *devinfo = p->devinfo;
2678 struct brw_inst *insn;
2679
2680 if (surface.file != BRW_IMMEDIATE_VALUE) {
2681 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2682
2683 brw_push_insn_state(p);
2684 brw_set_default_access_mode(p, BRW_ALIGN_1);
2685 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2686 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2687
2688       /* Mask out invalid bits from the surface index to avoid hangs, e.g. when
2689 * some surface array is accessed out of bounds.
2690 */
2691 insn = brw_AND(p, addr,
2692 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2693 BRW_GET_SWZ(surface.swizzle, 0)),
2694 brw_imm_ud(0xff));
2695
2696 brw_pop_insn_state(p);
2697
2698 surface = addr;
2699 }
2700
2701 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2702 brw_inst_set_mlen(devinfo, insn, message_len);
2703 brw_inst_set_rlen(devinfo, insn, response_len);
2704 brw_inst_set_header_present(devinfo, insn, header_present);
2705
2706 return insn;
2707 }
2708
2709 static bool
2710 while_jumps_before_offset(const struct gen_device_info *devinfo,
2711 brw_inst *insn, int while_offset, int start_offset)
2712 {
2713 int scale = 16 / brw_jump_scale(devinfo);
2714 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2715 : brw_inst_jip(devinfo, insn);
2716 assert(jip < 0);
2717 return while_offset + jip * scale <= start_offset;
2718 }
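
/* E.g. on gen7 (scale = 16 / 2 = 8), a WHILE at byte offset 160 with
 * JIP = -4 jumps back to 160 + (-4) * 8 = 128, so it counts as jumping
 * before any start_offset >= 128.
 */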
2719
2720
2721 static int
2722 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2723 {
2724 int offset;
2725 void *store = p->store;
2726 const struct gen_device_info *devinfo = p->devinfo;
2727
2728 int depth = 0;
2729
2730 for (offset = next_offset(devinfo, store, start_offset);
2731 offset < p->next_insn_offset;
2732 offset = next_offset(devinfo, store, offset)) {
2733 brw_inst *insn = store + offset;
2734
2735 switch (brw_inst_opcode(devinfo, insn)) {
2736 case BRW_OPCODE_IF:
2737 depth++;
2738 break;
2739 case BRW_OPCODE_ENDIF:
2740 if (depth == 0)
2741 return offset;
2742 depth--;
2743 break;
2744 case BRW_OPCODE_WHILE:
2745 /* If the while doesn't jump before our instruction, it's the end
2746 * of a sibling do...while loop. Ignore it.
2747 */
2748 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2749 continue;
2750 /* fallthrough */
2751 case BRW_OPCODE_ELSE:
2752 case BRW_OPCODE_HALT:
2753 if (depth == 0)
2754 return offset;
2755 }
2756 }
2757
2758 return 0;
2759 }
2760
2761 /* There is no DO instruction on gen6 and later, so to find the end of
2762  * the loop we have to see if the loop's WHILE is jumping back before
2763  * our start instruction.
2764 */
2765 static int
2766 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2767 {
2768 const struct gen_device_info *devinfo = p->devinfo;
2769 int offset;
2770 void *store = p->store;
2771
2772 assert(devinfo->gen >= 6);
2773
2774 /* Always start after the instruction (such as a WHILE) we're trying to fix
2775 * up.
2776 */
2777 for (offset = next_offset(devinfo, store, start_offset);
2778 offset < p->next_insn_offset;
2779 offset = next_offset(devinfo, store, offset)) {
2780 brw_inst *insn = store + offset;
2781
2782 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2783 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2784 return offset;
2785 }
2786 }
2787 assert(!"not reached");
2788 return start_offset;
2789 }
2790
2791 /* After program generation, go back and update the jump targets (UIP and
2792  * JIP) of BREAK, CONT, HALT, and ENDIF instructions.
2793 */
2794 void
2795 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2796 {
2797 const struct gen_device_info *devinfo = p->devinfo;
2798 int offset;
2799 int br = brw_jump_scale(devinfo);
2800 int scale = 16 / br;
2801 void *store = p->store;
2802
2803 if (devinfo->gen < 6)
2804 return;
2805
2806 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2807 brw_inst *insn = store + offset;
2808 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2809
2810 int block_end_offset = brw_find_next_block_end(p, offset);
2811 switch (brw_inst_opcode(devinfo, insn)) {
2812 case BRW_OPCODE_BREAK:
2813 assert(block_end_offset != 0);
2814 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2815 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2816 brw_inst_set_uip(devinfo, insn,
2817 (brw_find_loop_end(p, offset) - offset +
2818 (devinfo->gen == 6 ? 16 : 0)) / scale);
2819 break;
2820 case BRW_OPCODE_CONTINUE:
2821 assert(block_end_offset != 0);
2822 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2823 brw_inst_set_uip(devinfo, insn,
2824 (brw_find_loop_end(p, offset) - offset) / scale);
2825
2826 assert(brw_inst_uip(devinfo, insn) != 0);
2827 assert(brw_inst_jip(devinfo, insn) != 0);
2828 break;
2829
2830 case BRW_OPCODE_ENDIF: {
2831 int32_t jump = (block_end_offset == 0) ?
2832 1 * br : (block_end_offset - offset) / scale;
2833 if (devinfo->gen >= 7)
2834 brw_inst_set_jip(devinfo, insn, jump);
2835 else
2836 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2837 break;
2838 }
2839
2840 case BRW_OPCODE_HALT:
2841 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2842 *
2843 * "In case of the halt instruction not inside any conditional
2844 * code block, the value of <JIP> and <UIP> should be the
2845 * same. In case of the halt instruction inside conditional code
2846 * block, the <UIP> should be the end of the program, and the
2847 * <JIP> should be end of the most inner conditional code block."
2848 *
2849 * The uip will have already been set by whoever set up the
2850 * instruction.
2851 */
2852 if (block_end_offset == 0) {
2853 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2854 } else {
2855 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2856 }
2857 assert(brw_inst_uip(devinfo, insn) != 0);
2858 assert(brw_inst_jip(devinfo, insn) != 0);
2859 break;
2860 }
2861 }
2862 }
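
/* A worked sketch of the BREAK case above (gen7, hypothetical offsets,
 * scale = 16 / 2 = 8): a BREAK at byte offset 48 whose block ends at 80
 * and whose loop's WHILE is also at 80 gets JIP = (80 - 48) / 8 = 4 and
 * UIP = (80 - 48 + 0) / 8 = 4.
 */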
2863
2864 void brw_ff_sync(struct brw_codegen *p,
2865 struct brw_reg dest,
2866 unsigned msg_reg_nr,
2867 struct brw_reg src0,
2868 bool allocate,
2869 unsigned response_length,
2870 bool eot)
2871 {
2872 const struct gen_device_info *devinfo = p->devinfo;
2873 brw_inst *insn;
2874
2875 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2876
2877 insn = next_insn(p, BRW_OPCODE_SEND);
2878 brw_set_dest(p, insn, dest);
2879 brw_set_src0(p, insn, src0);
2880 brw_set_src1(p, insn, brw_imm_d(0));
2881
2882 if (devinfo->gen < 6)
2883 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2884
2885 brw_set_ff_sync_message(p,
2886 insn,
2887 allocate,
2888 response_length,
2889 eot);
2890 }
2891
2892 /**
2893 * Emit the SEND instruction necessary to generate stream output data on Gen6
2894 * (for transform feedback).
2895 *
2896 * If send_commit_msg is true, this is the last piece of stream output data
2897 * from this thread, so send the data as a committed write. According to the
2898 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2899 *
2900 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2901 * writes are complete by sending the final write as a committed write."
2902 */
2903 void
2904 brw_svb_write(struct brw_codegen *p,
2905 struct brw_reg dest,
2906 unsigned msg_reg_nr,
2907 struct brw_reg src0,
2908 unsigned binding_table_index,
2909 bool send_commit_msg)
2910 {
2911 const struct gen_device_info *devinfo = p->devinfo;
2912 const unsigned target_cache =
2913 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2914 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2915 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2916 brw_inst *insn;
2917
2918 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2919
2920 insn = next_insn(p, BRW_OPCODE_SEND);
2921 brw_set_dest(p, insn, dest);
2922 brw_set_src0(p, insn, src0);
2923 brw_set_src1(p, insn, brw_imm_d(0));
2924 brw_set_dp_write_message(p, insn,
2925 binding_table_index,
2926 0, /* msg_control: ignored */
2927 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2928 target_cache,
2929 1, /* msg_length */
2930 true, /* header_present */
2931 0, /* last_render_target: ignored */
2932 send_commit_msg, /* response_length */
2933 0, /* end_of_thread */
2934 send_commit_msg); /* send_commit_msg */
2935 }
2936
2937 static unsigned
2938 brw_surface_payload_size(struct brw_codegen *p,
2939 unsigned num_channels,
2940 bool has_simd4x2,
2941 bool has_simd16)
2942 {
2943 if (has_simd4x2 &&
2944 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2945 return 1;
2946 else if (has_simd16 &&
2947 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2948 return 2 * num_channels;
2949 else
2950 return num_channels;
2951 }
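
/* For example (hypothetical state): a SIMD16 Align1 untyped read of four
 * channels returns 2 * 4 = 8 registers, while the same read in SIMD4x2
 * Align16 mode fits in a single register.
 */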
2952
2953 static void
2954 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2955 brw_inst *insn,
2956 unsigned atomic_op,
2957 bool response_expected)
2958 {
2959 const struct gen_device_info *devinfo = p->devinfo;
2960 unsigned msg_control =
2961 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2962 (response_expected ? 1 << 5 : 0); /* Return data expected */
2963
2964 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2965 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2966 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2967 msg_control |= 1 << 4; /* SIMD8 mode */
2968
2969 brw_inst_set_dp_msg_type(devinfo, insn,
2970 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2971 } else {
2972 brw_inst_set_dp_msg_type(devinfo, insn,
2973 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2974 }
2975 } else {
2976 brw_inst_set_dp_msg_type(devinfo, insn,
2977 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2978
2979 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2980 msg_control |= 1 << 4; /* SIMD8 mode */
2981 }
2982
2983 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2984 }
2985
2986 void
2987 brw_untyped_atomic(struct brw_codegen *p,
2988 struct brw_reg dst,
2989 struct brw_reg payload,
2990 struct brw_reg surface,
2991 unsigned atomic_op,
2992 unsigned msg_length,
2993 bool response_expected)
2994 {
2995 const struct gen_device_info *devinfo = p->devinfo;
2996 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2997 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2998 GEN7_SFID_DATAPORT_DATA_CACHE);
2999 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3000 /* Mask out unused components -- This is especially important in Align16
3001 * mode on generations that don't have native support for SIMD4x2 atomics,
3002 * because unused but enabled components will cause the dataport to perform
3003 * additional atomic operations on the addresses that happen to be in the
3004 * uninitialized Y, Z and W coordinates of the payload.
3005 */
3006 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3007 struct brw_inst *insn = brw_send_indirect_surface_message(
3008 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3009 brw_surface_payload_size(p, response_expected,
3010 devinfo->gen >= 8 || devinfo->is_haswell, true),
3011 align1);
3012
3013 brw_set_dp_untyped_atomic_message(
3014 p, insn, atomic_op, response_expected);
3015 }
3016
3017 static void
3018 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
3019 struct brw_inst *insn,
3020 unsigned num_channels)
3021 {
3022 const struct gen_device_info *devinfo = p->devinfo;
3023 /* Set mask of 32-bit channels to drop. */
3024 unsigned msg_control = 0xf & (0xf << num_channels);
3025
3026 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3027 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3028 msg_control |= 1 << 4; /* SIMD16 mode */
3029 else
3030 msg_control |= 2 << 4; /* SIMD8 mode */
3031 }
3032
3033 brw_inst_set_dp_msg_type(devinfo, insn,
3034 (devinfo->gen >= 8 || devinfo->is_haswell ?
3035 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3036 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3037 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3038 }
3039
3040 void
3041 brw_untyped_surface_read(struct brw_codegen *p,
3042 struct brw_reg dst,
3043 struct brw_reg payload,
3044 struct brw_reg surface,
3045 unsigned msg_length,
3046 unsigned num_channels)
3047 {
3048 const struct gen_device_info *devinfo = p->devinfo;
3049 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3050 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3051 GEN7_SFID_DATAPORT_DATA_CACHE);
3052 struct brw_inst *insn = brw_send_indirect_surface_message(
3053 p, sfid, dst, payload, surface, msg_length,
3054 brw_surface_payload_size(p, num_channels, true, true),
3055 false);
3056
3057 brw_set_dp_untyped_surface_read_message(
3058 p, insn, num_channels);
3059 }
3060
3061 static void
3062 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3063 struct brw_inst *insn,
3064 unsigned num_channels)
3065 {
3066 const struct gen_device_info *devinfo = p->devinfo;
3067 /* Set mask of 32-bit channels to drop. */
3068 unsigned msg_control = 0xf & (0xf << num_channels);
3069
3070 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3071 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3072 msg_control |= 1 << 4; /* SIMD16 mode */
3073 else
3074 msg_control |= 2 << 4; /* SIMD8 mode */
3075 } else {
3076 if (devinfo->gen >= 8 || devinfo->is_haswell)
3077 msg_control |= 0 << 4; /* SIMD4x2 mode */
3078 else
3079 msg_control |= 2 << 4; /* SIMD8 mode */
3080 }
3081
3082 brw_inst_set_dp_msg_type(devinfo, insn,
3083 devinfo->gen >= 8 || devinfo->is_haswell ?
3084 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3085 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3086 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3087 }
3088
3089 void
3090 brw_untyped_surface_write(struct brw_codegen *p,
3091 struct brw_reg payload,
3092 struct brw_reg surface,
3093 unsigned msg_length,
3094 unsigned num_channels)
3095 {
3096 const struct gen_device_info *devinfo = p->devinfo;
3097 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3098 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3099 GEN7_SFID_DATAPORT_DATA_CACHE);
3100 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3101 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3102 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3103 WRITEMASK_X : WRITEMASK_XYZW;
3104 struct brw_inst *insn = brw_send_indirect_surface_message(
3105 p, sfid, brw_writemask(brw_null_reg(), mask),
3106 payload, surface, msg_length, 0, align1);
3107
3108 brw_set_dp_untyped_surface_write_message(
3109 p, insn, num_channels);
3110 }
3111
3112 static void
3113 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3114 struct brw_inst *insn,
3115 unsigned atomic_op,
3116 bool response_expected)
3117 {
3118 const struct gen_device_info *devinfo = p->devinfo;
3119 unsigned msg_control =
3120 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3121 (response_expected ? 1 << 5 : 0); /* Return data expected */
3122
3123 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3124 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3125 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3126 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3127
3128 brw_inst_set_dp_msg_type(devinfo, insn,
3129 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3130 } else {
3131 brw_inst_set_dp_msg_type(devinfo, insn,
3132 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3133 }
3134
3135 } else {
3136 brw_inst_set_dp_msg_type(devinfo, insn,
3137 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3138
3139 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3140 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3141 }
3142
3143 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3144 }
3145
3146 void
3147 brw_typed_atomic(struct brw_codegen *p,
3148 struct brw_reg dst,
3149 struct brw_reg payload,
3150 struct brw_reg surface,
3151 unsigned atomic_op,
3152 unsigned msg_length,
3153 bool response_expected) {
3154 const struct gen_device_info *devinfo = p->devinfo;
3155 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3156 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3157 GEN6_SFID_DATAPORT_RENDER_CACHE);
3158 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3159 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3160 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3161 struct brw_inst *insn = brw_send_indirect_surface_message(
3162 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3163 brw_surface_payload_size(p, response_expected,
3164 devinfo->gen >= 8 || devinfo->is_haswell, false),
3165 true);
3166
3167 brw_set_dp_typed_atomic_message(
3168 p, insn, atomic_op, response_expected);
3169 }
3170
3171 static void
3172 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3173 struct brw_inst *insn,
3174 unsigned num_channels)
3175 {
3176 const struct gen_device_info *devinfo = p->devinfo;
3177 /* Set mask of unused channels. */
3178 unsigned msg_control = 0xf & (0xf << num_channels);
3179
3180 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3181 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3182 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3183 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3184 else
3185 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3186 }
3187
3188 brw_inst_set_dp_msg_type(devinfo, insn,
3189 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3190 } else {
3191 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3192 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3193 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3194 }
3195
3196 brw_inst_set_dp_msg_type(devinfo, insn,
3197 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3198 }
3199
3200 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3201 }
3202
3203 void
3204 brw_typed_surface_read(struct brw_codegen *p,
3205 struct brw_reg dst,
3206 struct brw_reg payload,
3207 struct brw_reg surface,
3208 unsigned msg_length,
3209 unsigned num_channels)
3210 {
3211 const struct gen_device_info *devinfo = p->devinfo;
3212 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3213 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3214 GEN6_SFID_DATAPORT_RENDER_CACHE);
3215 struct brw_inst *insn = brw_send_indirect_surface_message(
3216 p, sfid, dst, payload, surface, msg_length,
3217 brw_surface_payload_size(p, num_channels,
3218 devinfo->gen >= 8 || devinfo->is_haswell, false),
3219 true);
3220
3221 brw_set_dp_typed_surface_read_message(
3222 p, insn, num_channels);
3223 }
3224
3225 static void
3226 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3227 struct brw_inst *insn,
3228 unsigned num_channels)
3229 {
3230 const struct gen_device_info *devinfo = p->devinfo;
3231 /* Set mask of unused channels. */
3232 unsigned msg_control = 0xf & (0xf << num_channels);
3233
3234 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3235 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3236 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3237 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3238 else
3239 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3240 }
3241
3242 brw_inst_set_dp_msg_type(devinfo, insn,
3243 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3244
3245 } else {
3246 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3247 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3248 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3249 }
3250
3251 brw_inst_set_dp_msg_type(devinfo, insn,
3252 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3253 }
3254
3255 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3256 }
3257
3258 void
3259 brw_typed_surface_write(struct brw_codegen *p,
3260 struct brw_reg payload,
3261 struct brw_reg surface,
3262 unsigned msg_length,
3263 unsigned num_channels)
3264 {
3265 const struct gen_device_info *devinfo = p->devinfo;
3266 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3267 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3268 GEN6_SFID_DATAPORT_RENDER_CACHE);
3269 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3270 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3271 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3272 WRITEMASK_X : WRITEMASK_XYZW);
3273 struct brw_inst *insn = brw_send_indirect_surface_message(
3274 p, sfid, brw_writemask(brw_null_reg(), mask),
3275 payload, surface, msg_length, 0, true);
3276
3277 brw_set_dp_typed_surface_write_message(
3278 p, insn, num_channels);
3279 }
3280
3281 static void
3282 brw_set_memory_fence_message(struct brw_codegen *p,
3283 struct brw_inst *insn,
3284 enum brw_message_target sfid,
3285 bool commit_enable)
3286 {
3287 const struct gen_device_info *devinfo = p->devinfo;
3288
3289 brw_set_message_descriptor(p, insn, sfid,
3290 1 /* message length */,
3291 (commit_enable ? 1 : 0) /* response length */,
3292 true /* header present */,
3293 false);
3294
3295 switch (sfid) {
3296 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3297 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3298 break;
3299 case GEN7_SFID_DATAPORT_DATA_CACHE:
3300 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3301 break;
3302 default:
3303 unreachable("Not reached");
3304 }
3305
3306 if (commit_enable)
3307 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3308 }
3309
3310 void
3311 brw_memory_fence(struct brw_codegen *p,
3312 struct brw_reg dst)
3313 {
3314 const struct gen_device_info *devinfo = p->devinfo;
3315 const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
3316 struct brw_inst *insn;
3317
3318 brw_push_insn_state(p);
3319 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3320 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3321 dst = vec1(dst);
3322
3323 /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3324 * message doesn't write anything back.
3325 */
3326 insn = next_insn(p, BRW_OPCODE_SEND);
3327 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3328 brw_set_dest(p, insn, dst);
3329 brw_set_src0(p, insn, dst);
3330 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3331 commit_enable);
3332
3333 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3334 /* IVB does typed surface access through the render cache, so we need to
3335 * flush it too. Use a different register so both flushes can be
3336 * pipelined by the hardware.
3337 */
3338 insn = next_insn(p, BRW_OPCODE_SEND);
3339 brw_set_dest(p, insn, offset(dst, 1));
3340 brw_set_src0(p, insn, offset(dst, 1));
3341 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3342 commit_enable);
3343
3344 /* Now write the response of the second message into the response of the
3345 * first to trigger a pipeline stall -- this way future render and data
3346 * cache messages will be properly ordered with respect to past data and
3347 * render cache messages.
3348 */
3349 brw_MOV(p, dst, offset(dst, 1));
3350 }
3351
3352 brw_pop_insn_state(p);
3353 }
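/* Sketch of a typical call; the scratch register is arbitrary. Note that
 * on IVB the register immediately after dst is also written, since the
 * render cache flush above uses offset(dst, 1):
 *
 *    brw_memory_fence(p, brw_vec8_grf(126, 0));
 */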
3354
3355 void
3356 brw_pixel_interpolator_query(struct brw_codegen *p,
3357 struct brw_reg dest,
3358 struct brw_reg mrf,
3359 bool noperspective,
3360 unsigned mode,
3361 struct brw_reg data,
3362 unsigned msg_length,
3363 unsigned response_length)
3364 {
3365 const struct gen_device_info *devinfo = p->devinfo;
3366 struct brw_inst *insn;
3367 const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3368
3369 /* brw_send_indirect_message will automatically use a direct send message
3370 * if data is actually immediate.
3371 */
3372 insn = brw_send_indirect_message(p,
3373 GEN7_SFID_PIXEL_INTERPOLATOR,
3374 dest,
3375 mrf,
3376 vec1(data));
3377 brw_inst_set_mlen(devinfo, insn, msg_length);
3378 brw_inst_set_rlen(devinfo, insn, response_length);
3379
3380 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3381 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3382 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3383 brw_inst_set_pi_message_type(devinfo, insn, mode);
3384 }
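/* Illustrative call, assuming dest, mrf, mode, msg_len and rsp_len were
 * set up by the caller; passing an immediate for data is fine because of
 * the direct-send fallback described above:
 *
 *    brw_pixel_interpolator_query(p, dest, mrf, false, mode,
 *                                 brw_imm_ud(0), msg_len, rsp_len);
 */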
3385
3386 void
3387 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3388 struct brw_reg mask)
3389 {
3390 const struct gen_device_info *devinfo = p->devinfo;
3391 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
3392 const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
3393 brw_inst *inst;
3394
3395 assert(devinfo->gen >= 7);
3396 assert(mask.type == BRW_REGISTER_TYPE_UD);
3397
3398 brw_push_insn_state(p);
3399
3400 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3401 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3402
3403 if (devinfo->gen >= 8) {
3404 /* Getting the first active channel index is easy on Gen8: Just find
3405 * the first bit set in the execution mask. The register exists on
3406 * HSW already but it reads back as all ones when the current
3407 * instruction has execution masking disabled, so it's kind of
3408 * useless.
3409 */
3410 struct brw_reg exec_mask =
3411 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3412
3413 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3414 /* Unfortunately, ce0 does not take into account the thread
3415 * dispatch mask, which may be a problem in cases where it's not
3416 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3417 * some n). Combine ce0 with the given dispatch (or vector) mask
3418 * to mask off those channels which were never dispatched by the
3419 * hardware.
3420 */
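/* For example, for the second quarter (qtr_control == 1) the mask is
 * shifted right by 8 so that bit 0 of the result lines up with
 * channel 8 of the dispatch.
 */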
3421 brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3422 brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3423 exec_mask = vec1(dst);
3424 }
3425
3426 /* Quarter control has the effect of magically shifting the value of
3427 * ce0 so you'll get the first active channel relative to the
3428 * specified quarter control as a result.
3429 */
3430 inst = brw_FBL(p, vec1(dst), exec_mask);
3431 } else {
3432 const struct brw_reg flag = brw_flag_reg(1, 0);
3433
3434 brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3435
3436 /* Run enough instructions returning zero with execution masking and
3437 * a conditional modifier enabled in order to get the full execution
3438 * mask in f1.0. We could use a single 32-wide move here if it
3439 * weren't for the hardware bug that causes channel enables to
3440 * be applied incorrectly to the second half of 32-wide instructions
3441 * on Gen7.
3442 */
3443 const unsigned lower_size = MIN2(16, exec_size);
3444 for (unsigned i = 0; i < exec_size / lower_size; i++) {
3445 inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3446 brw_imm_uw(0));
3447 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3448 brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3449 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3450 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3451 brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3452 }
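/* E.g. a SIMD16 instruction runs this loop exactly once with
 * lower_size == 16, leaving a 16-bit execution mask in f1.0 for the
 * FBL below to scan.
 */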
3453
3454 /* Find the first bit set in the exec_size-wide portion of the flag
3455 * register that was updated by the last sequence of MOV
3456 * instructions.
3457 */
3458 const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3459 brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3460 }
3461 } else {
3462 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3463
3464 if (devinfo->gen >= 8 &&
3465 mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3466 /* In SIMD4x2 mode the first active channel index is just the
3467 * negation of the first bit of the mask register. Note that ce0
3468 * doesn't take into account the dispatch mask, so the Gen7 path
3469 * should be used instead unless you have the guarantee that the
3470 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3471 * for some n).
3472 */
3473 inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3474 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3475 brw_imm_ud(1));
3476
3477 } else {
3478 /* Overwrite the destination without and with execution masking to
3479 * find out which of the channels is active.
3480 */
3481 brw_push_insn_state(p);
3482 brw_set_default_exec_size(p, BRW_EXECUTE_4);
3483 brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3484 brw_imm_ud(1));
3485
3486 inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3487 brw_imm_ud(0));
3488 brw_pop_insn_state(p);
3489 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3490 }
3491 }
3492
3493 brw_pop_insn_state(p);
3494 }
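/* Usage sketch (the destination register is arbitrary); with no tighter
 * dispatch mask available, an all-ones immediate mask works:
 *
 *    brw_find_live_channel(p, retype(brw_vec1_grf(8, 0),
 *                                    BRW_REGISTER_TYPE_UD),
 *                          brw_imm_ud(0xffffffff));
 */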
3495
3496 void
3497 brw_broadcast(struct brw_codegen *p,
3498 struct brw_reg dst,
3499 struct brw_reg src,
3500 struct brw_reg idx)
3501 {
3502 const struct gen_device_info *devinfo = p->devinfo;
3503 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3504 brw_inst *inst;
3505
3506 brw_push_insn_state(p);
3507 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3508 brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3509
3510 assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3511 src.address_mode == BRW_ADDRESS_DIRECT);
3512
3513 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3514 idx.file == BRW_IMMEDIATE_VALUE) {
3515 /* Trivial: the source is already uniform or the index is a constant.
3516 * We will typically not get here if the optimizer is doing its job, but
3517 * asserting would be mean.
3518 */
3519 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3520 brw_MOV(p, dst,
3521 (align1 ? stride(suboffset(src, i), 0, 1, 0) :
3522 stride(suboffset(src, 4 * i), 0, 4, 1)));
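/* In align1 mode the <0;1,0> region above pins the source to the single
 * component i, while in align16 mode the <0;4,1> region repeats the i-th
 * vec4 of the source.
 */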
3523 } else {
3524 if (align1) {
3525 const struct brw_reg addr =
3526 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3527 const unsigned offset = src.nr * REG_SIZE + src.subnr;
3528 /* Limit in bytes of the signed indirect addressing immediate. */
3529 const unsigned limit = 512;
3530
3531 brw_push_insn_state(p);
3532 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3533 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3534
3535 /* Take into account the component size and horizontal stride. */
3536 assert(src.vstride == src.hstride + src.width);
3537 brw_SHL(p, addr, vec1(idx),
3538 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
3539 src.hstride - 1));
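/* E.g. for a 32-bit component with an encoded hstride of 1 this computes
 * addr = idx << (2 + 1 - 1), i.e. idx * 4, the byte offset of channel
 * idx within the register.
 */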
3540
3541 /* We can only address up to limit bytes using the indirect
3542 * addressing immediate, account for the difference if the source
3543 * register is above this limit.
3544 */
3545 if (offset >= limit)
3546 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
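/* E.g. offset == 1280 adds 1024 to the address register here and leaves
 * the remaining 256 bytes to the indirect immediate below.
 */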
3547
3548 brw_pop_insn_state(p);
3549
3550 /* Use indirect addressing to fetch the specified component. */
3551 brw_MOV(p, dst,
3552 retype(brw_vec1_indirect(addr.subnr, offset % limit),
3553 src.type));
3554 } else {
3555 /* In SIMD4x2 mode the index can be either zero or one; replicate it
3556 * to all bits of a flag register,
3557 */
3558 inst = brw_MOV(p,
3559 brw_null_reg(),
3560 stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3561 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3562 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3563 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3564
3565 /* and use predicated SEL to pick the right channel. */
3566 inst = brw_SEL(p, dst,
3567 stride(suboffset(src, 4), 4, 4, 1),
3568 stride(src, 4, 4, 1));
3569 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3570 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3571 }
3572 }
3573
3574 brw_pop_insn_state(p);
3575 }
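/* Hypothetical usage (register numbers invented), assuming align1 mode:
 * copy the component of g2 selected by the index in g3.0 into g4.0:
 *
 *    brw_broadcast(p, brw_vec1_grf(4, 0), brw_vec8_grf(2, 0),
 *                  retype(brw_vec1_grf(3, 0), BRW_REGISTER_TYPE_UD));
 */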
3576
3577 /**
3578 * This instruction is generated as a single-channel align1 instruction by
3579 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3580 *
3581 * We can't use the typed atomic op in the FS because that has the execution
3582 * mask ANDed with the pixel mask, but we just want to write the one dword for
3583 * all the pixels.
3584 *
3585 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3586 * one u32. So we use the same untyped atomic write message as the pixel
3587 * shader.
3588 *
3589 * The untyped atomic operation requires a BUFFER surface type with RAW
3590 * format, and is only accessible through the legacy DATA_CACHE dataport
3591 * messages.
3592 */
3593 void brw_shader_time_add(struct brw_codegen *p,
3594 struct brw_reg payload,
3595 uint32_t surf_index)
3596 {
3597 const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3598 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3599 GEN7_SFID_DATAPORT_DATA_CACHE);
3600 assert(p->devinfo->gen >= 7);
3601
3602 brw_push_insn_state(p);
3603 brw_set_default_access_mode(p, BRW_ALIGN_1);
3604 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3605 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3606 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3607
3608 /* We use brw_vec1_reg and unmasked execution because we want to
3609 * increment the given offset only once.
3610 */
3611 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3612 BRW_ARF_NULL, 0));
3613 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3614 payload.nr, 0));
3615 brw_set_src1(p, send, brw_imm_ud(0));
3616 brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3617 brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3618 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3619
3620 brw_pop_insn_state(p);
3621 }
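/* Sketch of a call; the payload register is hypothetical. The payload is
 * expected to hold the buffer offset and the value to accumulate, and
 * surf_index names the RAW buffer surface set up by the driver:
 *
 *    brw_shader_time_add(p, brw_vec8_grf(10, 0), surf_index);
 */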
3622
3623
3624 /**
3625 * Emit the SEND message for a barrier
3626 */
3627 void
3628 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3629 {
3630 const struct gen_device_info *devinfo = p->devinfo;
3631 struct brw_inst *inst;
3632
3633 assert(devinfo->gen >= 7);
3634
3635 brw_push_insn_state(p);
3636 brw_set_default_access_mode(p, BRW_ALIGN_1);
3637 inst = next_insn(p, BRW_OPCODE_SEND);
3638 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3639 brw_set_src0(p, inst, src);
3640 brw_set_src1(p, inst, brw_null_reg());
3641
3642 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3643 1 /* msg_length */,
3644 0 /* response_length */,
3645 false /* header_present */,
3646 false /* end_of_thread */);
3647
3648 brw_inst_set_gateway_notify(devinfo, inst, 1);
3649 brw_inst_set_gateway_subfuncid(devinfo, inst,
3650 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3651
3652 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3653 brw_pop_insn_state(p);
3654 }
3655
3656
3657 /**
3658 * Emit the wait instruction for a barrier
3659 */
3660 void
3661 brw_WAIT(struct brw_codegen *p)
3662 {
3663 const struct gen_device_info *devinfo = p->devinfo;
3664 struct brw_inst *insn;
3665
3666 struct brw_reg src = brw_notification_reg();
3667
3668 insn = next_insn(p, BRW_OPCODE_WAIT);
3669 brw_set_dest(p, insn, src);
3670 brw_set_src0(p, insn, src);
3671 brw_set_src1(p, insn, brw_null_reg());
3672
3673 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3674 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3675 }
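/* A barrier is a pair of calls: brw_barrier() sends the gateway message
 * and brw_WAIT() stalls on the notification register until the gateway
 * signals that all threads in the group have arrived. Sketch, with a
 * hypothetical header register holding the barrier ID:
 *
 *    brw_barrier(p, brw_vec8_grf(0, 0));
 *    brw_WAIT(p);
 */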