i965/eu: Take into account the target cache argument in brw_set_dp_read_message.
src/mesa/drivers/dri/i965/brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct brw_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
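
/* A minimal usage sketch (hypothetical register choices, not taken from a
 * real generator): before emitting a SEND whose payload currently lives in
 * a GRF, resolve the implied move so the payload ends up in m2 on Gen6+:
 *
 *    struct brw_reg payload = brw_vec8_grf(4, 0);
 *    gen6_resolve_implied_move(p, &payload, 2);
 *    // payload now refers to the MRF (m2) on Gen6+; earlier gens are
 *    // left unchanged and rely on the hardware's implicit move.
 */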

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct brw_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}

/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
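
/* For illustration (values read off the tables above, not a normative
 * reference): a float maps to the same hardware encoding whether or not it
 * is an immediate, while byte types are only representable for
 * non-immediates:
 *
 *    brw_reg_type_to_hw_type(devinfo, BRW_REGISTER_TYPE_F,
 *                            BRW_IMMEDIATE_VALUE);        // BRW_HW_REG_TYPE_F
 *    brw_reg_type_to_hw_type(devinfo, BRW_REGISTER_TYPE_UB,
 *                            BRW_GENERAL_REGISTER_FILE);  // non-imm UB encoding
 *    // BRW_REGISTER_TYPE_UB with BRW_IMMEDIATE_VALUE would assert (-1 entry).
 */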

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though the hstride is ignored in da16 mode, it still needs
          * to be programmed as '01'.
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}

extern int reg_type_size[];

static void
validate_reg(const struct brw_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}

static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is.  */
   imm &= ~0xfff;

   /* We get one bit replicated through the top 20 bits.  */
   return imm == 0 || imm == 0xfffff000;
}
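
/* Worked example (follows directly from the masking above): the compacted
 * encoding stores bits 11:0 verbatim plus one bit that must cover all of
 * bits 31:12, so those upper 20 bits must be uniformly 0 or 1:
 *
 *    is_compactable_immediate(0x00000123);  // true:  upper 20 bits all 0
 *    is_compactable_immediate(0xfffff800);  // true:  upper 20 bits all 1
 *    is_compactable_immediate(0x00001000);  // false: bit 12 set, rest clear
 */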

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate, src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule.  E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these causes warnings from the simulator when used,
       * compacted or otherwise.  In fact, all compaction mappings that have
       * an immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction.  We'll assume it
       * was lifted on SNB.  (FINISHME: decode the GM45 tables and check.)
       *
       * Don't do any of this for 64-bit immediates, since the src1 fields
       * overlap with the immediate and setting them would overwrite the
       * immediate we set.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         if (devinfo->gen < 6) {
            brw_inst_set_src1_reg_type(devinfo, inst,
                                       brw_inst_src0_reg_type(devinfo, inst));
         } else {
            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         }
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values.  Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of using the same register descriptions
          * for align_16 as for align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);
   /* Only src1 can be an immediate in two-argument instructions. */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of using the same register descriptions
          * for align_16 as for align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead; setting them here
    * would clobber the conditional-mod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
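
/* A minimal sketch of the intended call order (hypothetical message
 * parameters, not from a specific message type): set the descriptor first,
 * then fill in the message-specific function control bits.
 *
 *    brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dest);
 *    brw_set_src0(p, send, payload);
 *    brw_set_message_descriptor(p, send, BRW_SFID_URB,
 *                               2, 1, true, false);  // mlen, rlen, header, eot
 *    // ...message-specific setters may now write the function control bits.
 */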

static void brw_set_math_message(struct brw_codegen *p,
                                 brw_inst *inst,
                                 unsigned function,
                                 unsigned integer_type,
                                 bool low_precision,
                                 unsigned dataType)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}

static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message(struct brw_codegen *p,
                                brw_inst *insn,
                                enum brw_urb_write_flags flags,
                                unsigned msg_length,
                                unsigned response_length,
                                unsigned offset,
                                unsigned swizzle_control)
{
   const struct brw_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (devinfo->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}
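
/* Summary of the resulting SFID selection (derived directly from the
 * branches above, shown only for quick reference):
 *
 *    gen >= 7: render-target writes -> GEN6_SFID_DATAPORT_RENDER_CACHE,
 *              everything else      -> GEN7_SFID_DATAPORT_DATA_CACHE
 *    gen == 6: all writes           -> GEN6_SFID_DATAPORT_RENDER_CACHE
 *    gen <  6: all writes           -> BRW_SFID_DATAPORT_WRITE
 */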

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE)
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
      else
         unreachable("Invalid target cache");

   } else if (devinfo->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}
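
/* An illustrative caller (hypothetical parameter values, not lifted from a
 * real generator): a 2-OWORD block read whose target_cache picks the data
 * cache on Gen7+, per the mapping above:
 *
 *    brw_set_dp_read_message(p, insn,
 *                            0,                                  // surface
 *                            BRW_DATAPORT_OWORD_BLOCK_2_OWORDS,  // msg_control
 *                            BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
 *                            BRW_DATAPORT_READ_TARGET_DATA_CACHE,
 *                            1, true, 1);  // mlen, header_present, rlen
 */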

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
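
/* Worked example of the block-size encoding above (the values follow
 * directly from the expression): for num_regs == 4, Gen8+ encodes
 * log2(4) == 2 while Gen7 encodes num_regs - 1 == 3.  The Gen8+ log2 form
 * is what makes room for the extra num_regs == 8 case permitted by the
 * assert.
 */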

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
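
/* For example (illustrative): a float source at byte offset 8 within its
 * register (reg.subnr == 8) encodes as component 8 / 4 == 2 in the 3-src
 * form.
 */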

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                          \
brw_inst *brw_##OP(struct brw_codegen *p,                 \
                   struct brw_reg dest,                   \
                   struct brw_reg src0)                   \
{                                                         \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);       \
}

#define ALU2(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                      \
                   struct brw_reg dest,                        \
                   struct brw_reg src0,                        \
                   struct brw_reg src1)                        \
{                                                              \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);      \
}

#define ALU3(OP)                                                     \
brw_inst *brw_##OP(struct brw_codegen *p,                            \
                   struct brw_reg dest,                              \
                   struct brw_reg src0,                              \
                   struct brw_reg src1,                              \
                   struct brw_reg src2)                              \
{                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);      \
}

#define ALU3F(OP)                                                    \
brw_inst *brw_##OP(struct brw_codegen *p,                            \
                   struct brw_reg dest,                              \
                   struct brw_reg src0,                              \
                   struct brw_reg src1,                              \
                   struct brw_reg src2)                              \
{                                                                    \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                        \
          dest.type == BRW_REGISTER_TYPE_DF);                        \
   if (dest.type == BRW_REGISTER_TYPE_F) {                           \
      assert(src0.type == BRW_REGISTER_TYPE_F);                      \
      assert(src1.type == BRW_REGISTER_TYPE_F);                      \
      assert(src2.type == BRW_REGISTER_TYPE_F);                      \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                   \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                     \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                     \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                     \
   }                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);      \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                     \
void brw_##OP(struct brw_codegen *p,                                  \
              struct brw_reg dest,                                    \
              struct brw_reg src)                                     \
{                                                                     \
   const struct brw_device_info *devinfo = p->devinfo;                \
   brw_inst *rnd, *add;                                               \
   rnd = next_insn(p, BRW_OPCODE_##OP);                               \
   brw_set_dest(p, rnd, dest);                                        \
   brw_set_src0(p, rnd, src);                                         \
                                                                      \
   if (devinfo->gen < 6) {                                            \
      /* turn on round-increments */                                  \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);    \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                  \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);  \
   }                                                                  \
}
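
/* To illustrate the expansion (a sketch of what the macro generates, in
 * loose assembly-like notation, not additional emitted code): on pre-Gen6,
 * brw_RNDZ(p, dst, src) produces roughly
 *
 *    rndz.r  dst, src           // also sets the per-channel increment bit
 *    (+f0)   add dst, dst, 1.0F
 *
 * while on Gen6+ only the first instruction is emitted.
 */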


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)


brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
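
/* How the zero-fill path above works (a description of the existing code,
 * for orientation): the UD destination is viewed as a W region with a
 * stride of 2, so the conversion writes the low word of each dword and the
 * second MOV writes 0 to the adjacent high word (one W element over).  The
 * no_dd_clear/no_dd_check pair marks the two writes as touching disjoint
 * halves of the same dwords so dependency checking doesn't serialize them.
 */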

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}


/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
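
/* Typical usage from a generator (a sketch; the condition is established
 * by a preceding CMP that writes the flag register):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
 *    brw_IF(p, BRW_EXECUTE_8);
 *    // ...then block...
 *    brw_ELSE(p);
 *    // ...else block...
 *    brw_ENDIF(p);   // patches the IF/ELSE jump targets recorded above
 */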

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
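
/* Worked example of the jump arithmetic (illustrative instruction indices;
 * br == brw_jump_scale(devinfo) converts instruction counts into the
 * hardware's jump units): with IF at index 10, ELSE at 20 and ENDIF at 30
 * on Gen7, the IF gets JIP = br * 11 (just past the ELSE) and
 * UIP = br * 20 (the ENDIF), while the ELSE gets JIP = br * 10.
 */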

void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1610
1611 void
1612 brw_ENDIF(struct brw_codegen *p)
1613 {
1614 const struct brw_device_info *devinfo = p->devinfo;
1615 brw_inst *insn = NULL;
1616 brw_inst *else_inst = NULL;
1617 brw_inst *if_inst = NULL;
1618 brw_inst *tmp;
1619 bool emit_endif = true;
1620
1621 /* In single program flow mode, we can express IF and ELSE instructions
1622 * equivalently as ADD instructions that operate on IP. On platforms prior
1623 * to Gen6, flow control instructions cause an implied thread switch, so
1624 * this is a significant savings.
1625 *
1626 * However, on Gen6, writing to IP doesn't work in single program flow mode
1627 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1628 * not be updated by non-flow control instructions."). And on later
1629 * platforms, there is no significant benefit to converting control flow
1630 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1631 * Gen5.
1632 */
1633 if (devinfo->gen < 6 && p->single_program_flow)
1634 emit_endif = false;
1635
1636    /*
1637     * A single next_insn() may change the base address of the instruction
1638     * store memory (p->store), so call it first, before any pointer into
1639     * the store is computed from an index.
1640     */
1641 if (emit_endif)
1642 insn = next_insn(p, BRW_OPCODE_ENDIF);
1643
1644 /* Pop the IF and (optional) ELSE instructions from the stack */
1645 p->if_depth_in_loop[p->loop_stack_depth]--;
1646 tmp = pop_if_stack(p);
1647 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1648 else_inst = tmp;
1649 tmp = pop_if_stack(p);
1650 }
1651 if_inst = tmp;
1652
1653 if (!emit_endif) {
1654 /* ENDIF is useless; don't bother emitting it. */
1655 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1656 return;
1657 }
1658
1659 if (devinfo->gen < 6) {
1660 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1661 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1662 brw_set_src1(p, insn, brw_imm_d(0x0));
1663 } else if (devinfo->gen == 6) {
1664 brw_set_dest(p, insn, brw_imm_w(0));
1665 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1666 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1667 } else if (devinfo->gen == 7) {
1668 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1669 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1670 brw_set_src1(p, insn, brw_imm_w(0));
1671 } else {
1672 brw_set_src0(p, insn, brw_imm_d(0));
1673 }
1674
1675 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1676 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1677 if (devinfo->gen < 6)
1678 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1679
1680 /* Also pop item off the stack in the endif instruction: */
1681 if (devinfo->gen < 6) {
1682 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1683 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1684 } else if (devinfo->gen == 6) {
1685 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1686 } else {
1687 brw_inst_set_jip(devinfo, insn, 2);
1688 }
1689 patch_IF_ELSE(p, if_inst, else_inst, insn);
1690 }
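
/* Illustrative usage (a hypothetical sketch; dst, a and b name assumed
 * registers): the three calls below cooperate through the if-stack, and the
 * final brw_ENDIF() is what triggers patch_IF_ELSE() above.
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    brw_MOV(p, dst, a);
 *    brw_ELSE(p);
 *    brw_MOV(p, dst, b);
 *    brw_ENDIF(p);
 */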
1691
1692 brw_inst *
1693 brw_BREAK(struct brw_codegen *p)
1694 {
1695 const struct brw_device_info *devinfo = p->devinfo;
1696 brw_inst *insn;
1697
1698 insn = next_insn(p, BRW_OPCODE_BREAK);
1699 if (devinfo->gen >= 8) {
1700 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1701 brw_set_src0(p, insn, brw_imm_d(0x0));
1702 } else if (devinfo->gen >= 6) {
1703 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1704 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1705 brw_set_src1(p, insn, brw_imm_d(0x0));
1706 } else {
1707 brw_set_dest(p, insn, brw_ip_reg());
1708 brw_set_src0(p, insn, brw_ip_reg());
1709 brw_set_src1(p, insn, brw_imm_d(0x0));
1710 brw_inst_set_gen4_pop_count(devinfo, insn,
1711 p->if_depth_in_loop[p->loop_stack_depth]);
1712 }
1713 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1714 brw_inst_set_exec_size(devinfo, insn,
1715 brw_inst_exec_size(devinfo, p->current));
1716
1717 return insn;
1718 }
1719
1720 brw_inst *
1721 brw_CONT(struct brw_codegen *p)
1722 {
1723 const struct brw_device_info *devinfo = p->devinfo;
1724 brw_inst *insn;
1725
1726 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1727 brw_set_dest(p, insn, brw_ip_reg());
1728 if (devinfo->gen >= 8) {
1729 brw_set_src0(p, insn, brw_imm_d(0x0));
1730 } else {
1731 brw_set_src0(p, insn, brw_ip_reg());
1732 brw_set_src1(p, insn, brw_imm_d(0x0));
1733 }
1734
1735 if (devinfo->gen < 6) {
1736 brw_inst_set_gen4_pop_count(devinfo, insn,
1737 p->if_depth_in_loop[p->loop_stack_depth]);
1738 }
1739 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1740 brw_inst_set_exec_size(devinfo, insn,
1741 brw_inst_exec_size(devinfo, p->current));
1742 return insn;
1743 }
1744
1745 brw_inst *
1746 gen6_HALT(struct brw_codegen *p)
1747 {
1748 const struct brw_device_info *devinfo = p->devinfo;
1749 brw_inst *insn;
1750
1751 insn = next_insn(p, BRW_OPCODE_HALT);
1752 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1753 if (devinfo->gen >= 8) {
1754 brw_set_src0(p, insn, brw_imm_d(0x0));
1755 } else {
1756 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1757 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1758 }
1759
1760 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1761 brw_inst_set_exec_size(devinfo, insn,
1762 brw_inst_exec_size(devinfo, p->current));
1763 return insn;
1764 }
1765
1766 /* DO/WHILE loop:
1767 *
1768  * The DO/WHILE is just an unterminated loop -- break or continue are
1769  * used for control within the loop.  There are a few ways they can be
1770  * implemented.
1771 *
1772 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1773 * jip and no DO instruction.
1774 *
1775 * For non-uniform control flow pre-gen6, there's a DO instruction to
1776 * push the mask, and a WHILE to jump back, and BREAK to get out and
1777 * pop the mask.
1778 *
1779 * For gen6, there's no more mask stack, so no need for DO. WHILE
1780 * just points back to the first instruction of the loop.
1781 */
1782 brw_inst *
1783 brw_DO(struct brw_codegen *p, unsigned execute_size)
1784 {
1785 const struct brw_device_info *devinfo = p->devinfo;
1786
1787 if (devinfo->gen >= 6 || p->single_program_flow) {
1788 push_loop_stack(p, &p->store[p->nr_insn]);
1789 return &p->store[p->nr_insn];
1790 } else {
1791 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1792
1793 push_loop_stack(p, insn);
1794
1795 /* Override the defaults for this instruction:
1796 */
1797 brw_set_dest(p, insn, brw_null_reg());
1798 brw_set_src0(p, insn, brw_null_reg());
1799 brw_set_src1(p, insn, brw_null_reg());
1800
1801 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1802 brw_inst_set_exec_size(devinfo, insn, execute_size);
1803 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1804
1805 return insn;
1806 }
1807 }
1808
1809 /**
1810 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1811 * instruction here.
1812 *
1813 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1814 * nesting, since it can always just point to the end of the block/current loop.
1815 */
1816 static void
1817 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1818 {
1819 const struct brw_device_info *devinfo = p->devinfo;
1820 brw_inst *do_inst = get_inner_do_insn(p);
1821 brw_inst *inst;
1822 unsigned br = brw_jump_scale(devinfo);
1823
1824 assert(devinfo->gen < 6);
1825
1826 for (inst = while_inst - 1; inst != do_inst; inst--) {
1827       /* If the jump count is nonzero, this instruction has already been
1828        * patched because it's part of a loop inside the one we're
1829        * patching.
1830        */
1831 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1832 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1833             brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst + 1));
1834 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1835 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1836 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1837 }
1838 }
1839 }
1840
1841 brw_inst *
1842 brw_WHILE(struct brw_codegen *p)
1843 {
1844 const struct brw_device_info *devinfo = p->devinfo;
1845 brw_inst *insn, *do_insn;
1846 unsigned br = brw_jump_scale(devinfo);
1847
1848 if (devinfo->gen >= 6) {
1849 insn = next_insn(p, BRW_OPCODE_WHILE);
1850 do_insn = get_inner_do_insn(p);
1851
1852 if (devinfo->gen >= 8) {
1853 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1854 brw_set_src0(p, insn, brw_imm_d(0));
1855 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1856 } else if (devinfo->gen == 7) {
1857 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1858 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1859 brw_set_src1(p, insn, brw_imm_w(0));
1860 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1861 } else {
1862 brw_set_dest(p, insn, brw_imm_w(0));
1863 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1864 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1865 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1866 }
1867
1868 brw_inst_set_exec_size(devinfo, insn,
1869 brw_inst_exec_size(devinfo, p->current));
1870
1871 } else {
1872 if (p->single_program_flow) {
1873 insn = next_insn(p, BRW_OPCODE_ADD);
1874 do_insn = get_inner_do_insn(p);
1875
1876 brw_set_dest(p, insn, brw_ip_reg());
1877 brw_set_src0(p, insn, brw_ip_reg());
1878 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1879 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1880 } else {
1881 insn = next_insn(p, BRW_OPCODE_WHILE);
1882 do_insn = get_inner_do_insn(p);
1883
1884 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1885
1886 brw_set_dest(p, insn, brw_ip_reg());
1887 brw_set_src0(p, insn, brw_ip_reg());
1888 brw_set_src1(p, insn, brw_imm_d(0));
1889
1890 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1891 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1892 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1893
1894 brw_patch_break_cont(p, insn);
1895 }
1896 }
1897 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1898
1899 p->loop_stack_depth--;
1900
1901 return insn;
1902 }
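
/* Illustrative usage (a hypothetical sketch; predication of the BREAK is
 * assumed to be set up by the caller).  The BREAK's jump targets are filled
 * in later, by brw_patch_break_cont() pre-gen6 or brw_set_uip_jip() on gen6+:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ...loop body...
 *    brw_BREAK(p);
 *    brw_WHILE(p);
 */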
1903
1904 /* FORWARD JUMPS:
1905 */
1906 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1907 {
1908 const struct brw_device_info *devinfo = p->devinfo;
1909 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1910 unsigned jmpi = 1;
1911
1912 if (devinfo->gen >= 5)
1913 jmpi = 2;
1914
1915 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1916 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1917
1918 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1919 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1920 }
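
/* Illustrative usage (a hypothetical sketch): remember the index of a JMPI
 * with an immediate src1, emit the instructions to be skipped, then land the
 * jump at the current end of the program:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...instructions to be skipped...
 *    brw_land_fwd_jump(p, jmp);
 */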
1921
1922 /* To integrate with the above, it makes sense that the comparison
1923 * instruction should populate the flag register. It might be simpler
1924 * just to use the flag reg for most WM tasks?
1925 */
1926 void brw_CMP(struct brw_codegen *p,
1927 struct brw_reg dest,
1928 unsigned conditional,
1929 struct brw_reg src0,
1930 struct brw_reg src1)
1931 {
1932 const struct brw_device_info *devinfo = p->devinfo;
1933 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1934
1935 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1936 brw_set_dest(p, insn, dest);
1937 brw_set_src0(p, insn, src0);
1938 brw_set_src1(p, insn, src1);
1939
1940 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1941 * page says:
1942 * "Any CMP instruction with a null destination must use a {switch}."
1943 *
1944 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1945 * mentioned on their work-arounds pages.
1946 */
1947 if (devinfo->gen == 7) {
1948 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1949 dest.nr == BRW_ARF_NULL) {
1950 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1951 }
1952 }
1953 }
1954
1955 /***********************************************************************
1956 * Helpers for the various SEND message types:
1957 */
1958
1959 /** Extended math function, float[8].
1960 */
1961 void gen4_math(struct brw_codegen *p,
1962 struct brw_reg dest,
1963 unsigned function,
1964 unsigned msg_reg_nr,
1965 struct brw_reg src,
1966                unsigned precision)
1967 {
1968 const struct brw_device_info *devinfo = p->devinfo;
1969 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1970 unsigned data_type;
1971 if (has_scalar_region(src)) {
1972 data_type = BRW_MATH_DATA_SCALAR;
1973 } else {
1974 data_type = BRW_MATH_DATA_VECTOR;
1975 }
1976
1977 assert(devinfo->gen < 6);
1978
1979 /* Example code doesn't set predicate_control for send
1980 * instructions.
1981 */
1982 brw_inst_set_pred_control(devinfo, insn, 0);
1983 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1984
1985 brw_set_dest(p, insn, dest);
1986 brw_set_src0(p, insn, src);
1987 brw_set_math_message(p,
1988 insn,
1989 function,
1990 src.type == BRW_REGISTER_TYPE_D,
1991 precision,
1992 data_type);
1993 }
1994
1995 void gen6_math(struct brw_codegen *p,
1996 struct brw_reg dest,
1997 unsigned function,
1998 struct brw_reg src0,
1999 struct brw_reg src1)
2000 {
2001 const struct brw_device_info *devinfo = p->devinfo;
2002 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2003
2004 assert(devinfo->gen >= 6);
2005
2006 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2007 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2008 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
2009 (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
2010
2011 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2012 if (devinfo->gen == 6) {
2013 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2014 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2015 }
2016
2017 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2018 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2019 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2020 assert(src0.type != BRW_REGISTER_TYPE_F);
2021 assert(src1.type != BRW_REGISTER_TYPE_F);
2022 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2023 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2024 } else {
2025 assert(src0.type == BRW_REGISTER_TYPE_F);
2026 assert(src1.type == BRW_REGISTER_TYPE_F);
2027 if (function == BRW_MATH_FUNCTION_POW) {
2028 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2029 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2030 } else {
2031 assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2032 src1.nr == BRW_ARF_NULL);
2033 }
2034 }
2035
2036 /* Source modifiers are ignored for extended math instructions on Gen6. */
2037 if (devinfo->gen == 6) {
2038 assert(!src0.negate);
2039 assert(!src0.abs);
2040 assert(!src1.negate);
2041 assert(!src1.abs);
2042 }
2043
2044 brw_inst_set_math_function(devinfo, insn, function);
2045
2046 brw_set_dest(p, insn, dest);
2047 brw_set_src0(p, insn, src0);
2048 brw_set_src1(p, insn, src1);
2049 }
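
/* Illustrative usage (a hypothetical sketch; dst and src are assumed to be
 * float-typed GRFs): a reciprocal, with src1 tied to the null register as
 * the asserts above require for single-source functions:
 *
 *    gen6_math(p, dst, BRW_MATH_FUNCTION_INV, src, brw_null_reg());
 */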
2050
2051 /**
2052 * Return the right surface index to access the thread scratch space using
2053 * stateless dataport messages.
2054 */
2055 unsigned
2056 brw_scratch_surface_idx(const struct brw_codegen *p)
2057 {
2058 /* The scratch space is thread-local so IA coherency is unnecessary. */
2059 if (p->devinfo->gen >= 8)
2060 return GEN8_BTI_STATELESS_NON_COHERENT;
2061 else
2062 return BRW_BTI_STATELESS;
2063 }
2064
2065 /**
2066  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2067 * using a constant offset per channel.
2068 *
2069 * The offset must be aligned to oword size (16 bytes). Used for
2070 * register spilling.
2071 */
2072 void brw_oword_block_write_scratch(struct brw_codegen *p,
2073 struct brw_reg mrf,
2074 int num_regs,
2075 unsigned offset)
2076 {
2077 const struct brw_device_info *devinfo = p->devinfo;
2078 uint32_t msg_type;
2079
2080 if (devinfo->gen >= 6)
2081 offset /= 16;
2082
2083 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2084
2085 const unsigned mlen = 1 + num_regs;
2086 const unsigned msg_control =
2087 (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
2088 num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
2089 num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
2090 assert(msg_control);
2091
2092 /* Set up the message header. This is g0, with g0.2 filled with
2093 * the offset. We don't want to leave our offset around in g0 or
2094 * it'll screw up texture samples, so set it up inside the message
2095 * reg.
2096 */
2097 {
2098 brw_push_insn_state(p);
2099 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2100 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2101 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2102
2103 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2104
2105 /* set message header global offset field (reg 0, element 2) */
2106 brw_MOV(p,
2107 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2108 mrf.nr,
2109 2), BRW_REGISTER_TYPE_UD),
2110 brw_imm_ud(offset));
2111
2112 brw_pop_insn_state(p);
2113 }
2114
2115 {
2116 struct brw_reg dest;
2117 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2118 int send_commit_msg;
2119 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2120 BRW_REGISTER_TYPE_UW);
2121
2122 brw_inst_set_compression(devinfo, insn, false);
2123
2124       if (brw_inst_exec_size(devinfo, insn) >= BRW_EXECUTE_16)
2125 src_header = vec16(src_header);
2126
2127 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2128 if (devinfo->gen < 6)
2129 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2130
2131 /* Until gen6, writes followed by reads from the same location
2132 * are not guaranteed to be ordered unless write_commit is set.
2133 * If set, then a no-op write is issued to the destination
2134 * register to set a dependency, and a read from the destination
2135 * can be used to ensure the ordering.
2136 *
2137 * For gen6, only writes between different threads need ordering
2138 * protection. Our use of DP writes is all about register
2139 * spilling within a thread.
2140 */
2141 if (devinfo->gen >= 6) {
2142 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2143 send_commit_msg = 0;
2144 } else {
2145 dest = src_header;
2146 send_commit_msg = 1;
2147 }
2148
2149 brw_set_dest(p, insn, dest);
2150 if (devinfo->gen >= 6) {
2151 brw_set_src0(p, insn, mrf);
2152 } else {
2153 brw_set_src0(p, insn, brw_null_reg());
2154 }
2155
2156 if (devinfo->gen >= 6)
2157 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2158 else
2159 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2160
2161 brw_set_dp_write_message(p,
2162 insn,
2163 brw_scratch_surface_idx(p),
2164 msg_control,
2165 msg_type,
2166 mlen,
2167 true, /* header_present */
2168 0, /* not a render target */
2169 send_commit_msg, /* response_length */
2170 0, /* eot */
2171 send_commit_msg);
2172 }
2173 }
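
/* Illustrative usage (a hypothetical sketch; spill_offset is an assumed
 * 16-byte-aligned byte offset): spill two GRFs of data through m1:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, spill_offset);
 */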
2174
2175
2176 /**
2177 * Read a block of owords (half a GRF each) from the scratch buffer
2178 * using a constant index per channel.
2179 *
2180 * Offset must be aligned to oword size (16 bytes). Used for register
2181 * spilling.
2182 */
2183 void
2184 brw_oword_block_read_scratch(struct brw_codegen *p,
2185 struct brw_reg dest,
2186 struct brw_reg mrf,
2187 int num_regs,
2188 unsigned offset)
2189 {
2190 const struct brw_device_info *devinfo = p->devinfo;
2191
2192 if (devinfo->gen >= 6)
2193 offset /= 16;
2194
2195 if (p->devinfo->gen >= 7) {
2196 /* On gen 7 and above, we no longer have message registers and we can
2197 * send from any register we want. By using the destination register
2198 * for the message, we guarantee that the implied message write won't
2199 * accidentally overwrite anything. This has been a problem because
2200 * the MRF registers and source for the final FB write are both fixed
2201 * and may overlap.
2202 */
2203 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2204 } else {
2205 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2206 }
2207 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2208
2209 const unsigned rlen = num_regs;
2210 const unsigned msg_control =
2211 (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
2212 num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
2213 num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
2214 assert(msg_control);
2215 const unsigned target_cache = devinfo->gen >= 7 ?
2216 BRW_DATAPORT_READ_TARGET_DATA_CACHE :
2217 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
2218
2219 {
2220 brw_push_insn_state(p);
2221 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2222 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2223 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2224
2225 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2226
2227 /* set message header global offset field (reg 0, element 2) */
2228 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2229
2230 brw_pop_insn_state(p);
2231 }
2232
2233 {
2234 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2235
2236       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2237 brw_inst_set_compression(devinfo, insn, false);
2238
2239       brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
2240 if (devinfo->gen >= 6) {
2241 brw_set_src0(p, insn, mrf);
2242 } else {
2243 brw_set_src0(p, insn, brw_null_reg());
2244 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2245 }
2246
2247 brw_set_dp_read_message(p,
2248 insn,
2249 brw_scratch_surface_idx(p),
2250 msg_control,
2251 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2252 target_cache,
2253 1, /* msg_length */
2254 true, /* header_present */
2255 rlen);
2256 }
2257 }
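
/* Illustrative usage (a hypothetical sketch, the counterpart of the write
 * example above): fill the two spilled registers back into dst:
 *
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 2, spill_offset);
 */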
2258
2259 void
2260 gen7_block_read_scratch(struct brw_codegen *p,
2261 struct brw_reg dest,
2262 int num_regs,
2263 unsigned offset)
2264 {
2265 const struct brw_device_info *devinfo = p->devinfo;
2266 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2267 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2268
2269 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2270
2271 /* The HW requires that the header is present; this is to get the g0.5
2272 * scratch offset.
2273 */
2274 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2275
2276 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2277 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2278 * is 32 bytes, which happens to be the size of a register.
2279 */
2280 offset /= REG_SIZE;
2281 assert(offset < (1 << 12));
2282
2283 gen7_set_dp_scratch_message(p, insn,
2284 false, /* scratch read */
2285 false, /* OWords */
2286 false, /* invalidate after read */
2287 num_regs,
2288 offset,
2289 1, /* mlen: just g0 */
2290 num_regs, /* rlen */
2291 true); /* header present */
2292 }
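
/* For example (values assumed): a byte offset of 0x400 becomes the HWord
 * offset 0x400 / REG_SIZE = 32, comfortably inside the 12-bit field asserted
 * above.
 */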
2293
2294 /**
2295 * Read a float[4] vector from the data port Data Cache (const buffer).
2296 * Location (in buffer) should be a multiple of 16.
2297 * Used for fetching shader constants.
2298 */
2299 void brw_oword_block_read(struct brw_codegen *p,
2300 struct brw_reg dest,
2301 struct brw_reg mrf,
2302 uint32_t offset,
2303 uint32_t bind_table_index)
2304 {
2305 const struct brw_device_info *devinfo = p->devinfo;
2306
2307 /* On newer hardware, offset is in units of owords. */
2308 if (devinfo->gen >= 6)
2309 offset /= 16;
2310
2311 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2312
2313 brw_push_insn_state(p);
2314 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2315 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2316 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2317 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2318
2319 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2320
2321 /* set message header global offset field (reg 0, element 2) */
2322 brw_MOV(p,
2323 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2324 mrf.nr,
2325 2), BRW_REGISTER_TYPE_UD),
2326 brw_imm_ud(offset));
2327
2328 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2329
2330 /* cast dest to a uword[8] vector */
2331 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2332
2333 brw_set_dest(p, insn, dest);
2334 if (devinfo->gen >= 6) {
2335 brw_set_src0(p, insn, mrf);
2336 } else {
2337 brw_set_src0(p, insn, brw_null_reg());
2338 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2339 }
2340
2341 brw_set_dp_read_message(p,
2342 insn,
2343 bind_table_index,
2344 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2345 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2346 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2347 1, /* msg_length */
2348 true, /* header_present */
2349 1); /* response_length (1 reg, 2 owords!) */
2350
2351 brw_pop_insn_state(p);
2352 }
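
/* Illustrative usage (a hypothetical sketch; surf_index names an assumed
 * binding table entry): fetch one float[4] constant from byte offset 32:
 *
 *    brw_oword_block_read(p, dst, brw_message_reg(1), 32, surf_index);
 */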
2353
2354
2355 void brw_fb_WRITE(struct brw_codegen *p,
2356 struct brw_reg payload,
2357 struct brw_reg implied_header,
2358 unsigned msg_control,
2359 unsigned binding_table_index,
2360 unsigned msg_length,
2361 unsigned response_length,
2362 bool eot,
2363 bool last_render_target,
2364 bool header_present)
2365 {
2366 const struct brw_device_info *devinfo = p->devinfo;
2367 brw_inst *insn;
2368 unsigned msg_type;
2369 struct brw_reg dest, src0;
2370
2371 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2372 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2373 else
2374 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2375
2376 if (devinfo->gen >= 6) {
2377 insn = next_insn(p, BRW_OPCODE_SENDC);
2378 } else {
2379 insn = next_insn(p, BRW_OPCODE_SEND);
2380 }
2381 brw_inst_set_compression(devinfo, insn, false);
2382
2383 if (devinfo->gen >= 6) {
2384 /* headerless version, just submit color payload */
2385 src0 = payload;
2386
2387 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2388 } else {
2389 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2390 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2391 src0 = implied_header;
2392
2393 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2394 }
2395
2396 brw_set_dest(p, insn, dest);
2397 brw_set_src0(p, insn, src0);
2398 brw_set_dp_write_message(p,
2399 insn,
2400 binding_table_index,
2401 msg_control,
2402 msg_type,
2403 msg_length,
2404 header_present,
2405 last_render_target,
2406 response_length,
2407 eot,
2408 0 /* send_commit_msg */);
2409 }
2410
2411
2412 /**
2413 * Texture sample instruction.
2414 * Note: the msg_type plus msg_length values determine exactly what kind
2415 * of sampling operation is performed. See volume 4, page 161 of docs.
2416 */
2417 void brw_SAMPLE(struct brw_codegen *p,
2418 struct brw_reg dest,
2419 unsigned msg_reg_nr,
2420 struct brw_reg src0,
2421 unsigned binding_table_index,
2422 unsigned sampler,
2423 unsigned msg_type,
2424 unsigned response_length,
2425 unsigned msg_length,
2426 unsigned header_present,
2427 unsigned simd_mode,
2428 unsigned return_format)
2429 {
2430 const struct brw_device_info *devinfo = p->devinfo;
2431 brw_inst *insn;
2432
2433 if (msg_reg_nr != -1)
2434 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2435
2436 insn = next_insn(p, BRW_OPCODE_SEND);
2437 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2438
2439 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2440 *
2441 * "Instruction compression is not allowed for this instruction (that
2442 * is, send). The hardware behavior is undefined if this instruction is
2443 * set as compressed. However, compress control can be set to "SecHalf"
2444 * to affect the EMask generation."
2445 *
2446 * No similar wording is found in later PRMs, but there are examples
2447 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2448 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2449 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2450 */
2451 brw_inst_set_compression(devinfo, insn, false);
2452
2453 if (devinfo->gen < 6)
2454 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2455
2456 brw_set_dest(p, insn, dest);
2457 brw_set_src0(p, insn, src0);
2458 brw_set_sampler_message(p, insn,
2459 binding_table_index,
2460 sampler,
2461 msg_type,
2462 response_length,
2463 msg_length,
2464 header_present,
2465 simd_mode,
2466 return_format);
2467 }
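
/* Illustrative usage (a hypothetical sketch; surf, mlen and the payload in
 * src0 are assumptions): a basic SIMD8 sample from sampler 0 returning four
 * float channels:
 *
 *    brw_SAMPLE(p, dest, 1, src0, surf, 0, GEN5_SAMPLER_MESSAGE_SAMPLE,
 *               4, mlen, true, BRW_SAMPLER_SIMD_MODE_SIMD8,
 *               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
 */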
2468
2469 /* Adjust the message header's sampler state pointer to
2470 * select the correct group of 16 samplers.
2471 */
2472 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2473 struct brw_reg header,
2474 struct brw_reg sampler_index)
2475 {
2476 /* The "Sampler Index" field can only store values between 0 and 15.
2477 * However, we can add an offset to the "Sampler State Pointer"
2478 * field, effectively selecting a different set of 16 samplers.
2479 *
2480 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2481 * offset, and each sampler state is only 16-bytes, so we can't
2482 * exclusively use the offset - we have to use both.
2483 */
2484
2485 const struct brw_device_info *devinfo = p->devinfo;
2486
2487 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2488 const int sampler_state_size = 16; /* 16 bytes */
2489 uint32_t sampler = sampler_index.ud;
2490
2491 if (sampler >= 16) {
2492 assert(devinfo->is_haswell || devinfo->gen >= 8);
2493 brw_ADD(p,
2494 get_element_ud(header, 3),
2495 get_element_ud(brw_vec8_grf(0, 0), 3),
2496 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2497 }
2498 } else {
2499 /* Non-const sampler array indexing case */
2500 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2501 return;
2502 }
2503
2504 struct brw_reg temp = get_element_ud(header, 3);
2505
2506 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2507 brw_SHL(p, temp, temp, brw_imm_ud(4));
2508 brw_ADD(p,
2509 get_element_ud(header, 3),
2510 get_element_ud(brw_vec8_grf(0, 0), 3),
2511 temp);
2512 }
2513 }
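
/* Worked example (values assumed): for an immediate sampler index of 20, the
 * code above adds 16 * (20 / 16) * 16 = 256 bytes to the sampler state
 * pointer in g0.3, selecting the second group of 16 states; the 4-bit
 * "Sampler Index" field of the message is then expected to carry
 * 20 % 16 = 4.
 */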
2514
2515 /* All these variables are pretty confusing - we might be better off
2516 * using bitmasks and macros for this, in the old style. Or perhaps
2517 * just having the caller instantiate the fields in dword3 itself.
2518 */
2519 void brw_urb_WRITE(struct brw_codegen *p,
2520 struct brw_reg dest,
2521 unsigned msg_reg_nr,
2522 struct brw_reg src0,
2523 enum brw_urb_write_flags flags,
2524 unsigned msg_length,
2525 unsigned response_length,
2526 unsigned offset,
2527 unsigned swizzle)
2528 {
2529 const struct brw_device_info *devinfo = p->devinfo;
2530 brw_inst *insn;
2531
2532 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2533
2534 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2535 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2536 brw_push_insn_state(p);
2537 brw_set_default_access_mode(p, BRW_ALIGN_1);
2538 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2539 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2540 BRW_REGISTER_TYPE_UD),
2541 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2542 brw_imm_ud(0xff00));
2543 brw_pop_insn_state(p);
2544 }
2545
2546 insn = next_insn(p, BRW_OPCODE_SEND);
2547
2548 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2549
2550 brw_set_dest(p, insn, dest);
2551 brw_set_src0(p, insn, src0);
2552 brw_set_src1(p, insn, brw_imm_d(0));
2553
2554 if (devinfo->gen < 6)
2555 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2556
2557 brw_set_urb_message(p,
2558 insn,
2559 flags,
2560 msg_length,
2561 response_length,
2562 offset,
2563 swizzle);
2564 }
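
/* Illustrative usage (a hypothetical sketch; mlen is an assumed message
 * length): write URB data starting at m1 and terminate the thread:
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 1, src, BRW_URB_WRITE_EOT_COMPLETE,
 *                  mlen, 0, 0, BRW_URB_SWIZZLE_NONE);
 */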
2565
2566 struct brw_inst *
2567 brw_send_indirect_message(struct brw_codegen *p,
2568 unsigned sfid,
2569 struct brw_reg dst,
2570 struct brw_reg payload,
2571 struct brw_reg desc)
2572 {
2573 const struct brw_device_info *devinfo = p->devinfo;
2574 struct brw_inst *send;
2575 int setup;
2576
2577 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2578
2579 assert(desc.type == BRW_REGISTER_TYPE_UD);
2580
2581 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2582 * in the indirect case) by its index in the instruction store. The
2583 * pointer returned by next_insn() may become invalid if emitting the SEND
2584 * in the indirect case reallocs the store.
2585 */
2586
2587 if (desc.file == BRW_IMMEDIATE_VALUE) {
2588 setup = p->nr_insn;
2589 send = next_insn(p, BRW_OPCODE_SEND);
2590 brw_set_src1(p, send, desc);
2591
2592 } else {
2593 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2594
2595 brw_push_insn_state(p);
2596 brw_set_default_access_mode(p, BRW_ALIGN_1);
2597 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2598 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2599
2600 /* Load the indirect descriptor to an address register using OR so the
2601 * caller can specify additional descriptor bits with the usual
2602 * brw_set_*_message() helper functions.
2603 */
2604 setup = p->nr_insn;
2605 brw_OR(p, addr, desc, brw_imm_ud(0));
2606
2607 brw_pop_insn_state(p);
2608
2609 send = next_insn(p, BRW_OPCODE_SEND);
2610 brw_set_src1(p, send, addr);
2611 }
2612
2613 if (dst.width < BRW_EXECUTE_8)
2614 brw_inst_set_exec_size(devinfo, send, dst.width);
2615
2616 brw_set_dest(p, send, dst);
2617 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2618 brw_inst_set_sfid(devinfo, send, sfid);
2619
2620 return &p->store[setup];
2621 }
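
/* Illustrative usage (a hypothetical sketch; desc holds a UD descriptor
 * computed at run time): the helper loads desc into a0.0 with an OR, so the
 * caller can still fold additional immediate descriptor bits in afterwards
 * through the brw_set_*_message() helpers:
 *
 *    struct brw_inst *send =
 *       brw_send_indirect_message(p, BRW_SFID_SAMPLER, dst, payload, desc);
 */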
2622
2623 static struct brw_inst *
2624 brw_send_indirect_surface_message(struct brw_codegen *p,
2625 unsigned sfid,
2626 struct brw_reg dst,
2627 struct brw_reg payload,
2628 struct brw_reg surface,
2629 unsigned message_len,
2630 unsigned response_len,
2631 bool header_present)
2632 {
2633 const struct brw_device_info *devinfo = p->devinfo;
2634 struct brw_inst *insn;
2635
2636 if (surface.file != BRW_IMMEDIATE_VALUE) {
2637 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2638
2639 brw_push_insn_state(p);
2640 brw_set_default_access_mode(p, BRW_ALIGN_1);
2641 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2642 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2643
2644 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2645 * some surface array is accessed out of bounds.
2646 */
2647 insn = brw_AND(p, addr,
2648 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2649 BRW_GET_SWZ(surface.swizzle, 0)),
2650 brw_imm_ud(0xff));
2651
2652 brw_pop_insn_state(p);
2653
2654 surface = addr;
2655 }
2656
2657 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2658 brw_inst_set_mlen(devinfo, insn, message_len);
2659 brw_inst_set_rlen(devinfo, insn, response_len);
2660 brw_inst_set_header_present(devinfo, insn, header_present);
2661
2662 return insn;
2663 }
2664
2665 static bool
2666 while_jumps_before_offset(const struct brw_device_info *devinfo,
2667 brw_inst *insn, int while_offset, int start_offset)
2668 {
2669 int scale = 16 / brw_jump_scale(devinfo);
2670 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2671 : brw_inst_jip(devinfo, insn);
2672 return while_offset + jip * scale <= start_offset;
2673 }
2674
2675
2676 static int
2677 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2678 {
2679 int offset;
2680 void *store = p->store;
2681 const struct brw_device_info *devinfo = p->devinfo;
2682
2683 int depth = 0;
2684
2685 for (offset = next_offset(devinfo, store, start_offset);
2686 offset < p->next_insn_offset;
2687 offset = next_offset(devinfo, store, offset)) {
2688 brw_inst *insn = store + offset;
2689
2690 switch (brw_inst_opcode(devinfo, insn)) {
2691 case BRW_OPCODE_IF:
2692 depth++;
2693 break;
2694 case BRW_OPCODE_ENDIF:
2695 if (depth == 0)
2696 return offset;
2697 depth--;
2698 break;
2699 case BRW_OPCODE_WHILE:
2700 /* If the while doesn't jump before our instruction, it's the end
2701 * of a sibling do...while loop. Ignore it.
2702 */
2703 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2704 continue;
2705 /* fallthrough */
2706 case BRW_OPCODE_ELSE:
2707 case BRW_OPCODE_HALT:
2708 if (depth == 0)
2709 return offset;
2710 }
2711 }
2712
2713 return 0;
2714 }
2715
2716 /* There is no DO instruction on gen6, so to find the end of the loop
2717 * we have to see if the loop is jumping back before our start
2718 * instruction.
2719 */
2720 static int
2721 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2722 {
2723 const struct brw_device_info *devinfo = p->devinfo;
2724 int offset;
2725 void *store = p->store;
2726
2727 assert(devinfo->gen >= 6);
2728
2729 /* Always start after the instruction (such as a WHILE) we're trying to fix
2730 * up.
2731 */
2732 for (offset = next_offset(devinfo, store, start_offset);
2733 offset < p->next_insn_offset;
2734 offset = next_offset(devinfo, store, offset)) {
2735 brw_inst *insn = store + offset;
2736
2737 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2738 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2739 return offset;
2740 }
2741 }
2742 assert(!"not reached");
2743 return start_offset;
2744 }
2745
2746 /* After program generation, go back and update the UIP and JIP of
2747 * BREAK, CONT, and HALT instructions to their correct locations.
2748 */
2749 void
2750 brw_set_uip_jip(struct brw_codegen *p)
2751 {
2752 const struct brw_device_info *devinfo = p->devinfo;
2753 int offset;
2754 int br = brw_jump_scale(devinfo);
2755 int scale = 16 / br;
2756 void *store = p->store;
2757
2758 if (devinfo->gen < 6)
2759 return;
2760
2761 for (offset = 0; offset < p->next_insn_offset;
2762 offset = next_offset(devinfo, store, offset)) {
2763 brw_inst *insn = store + offset;
2764
2765 if (brw_inst_cmpt_control(devinfo, insn)) {
2766 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2767 assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
2768 brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
2769 brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
2770 continue;
2771 }
2772
2773 int block_end_offset = brw_find_next_block_end(p, offset);
2774 switch (brw_inst_opcode(devinfo, insn)) {
2775 case BRW_OPCODE_BREAK:
2776 assert(block_end_offset != 0);
2777 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2778 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2779 brw_inst_set_uip(devinfo, insn,
2780 (brw_find_loop_end(p, offset) - offset +
2781 (devinfo->gen == 6 ? 16 : 0)) / scale);
2782 break;
2783 case BRW_OPCODE_CONTINUE:
2784 assert(block_end_offset != 0);
2785 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2786 brw_inst_set_uip(devinfo, insn,
2787 (brw_find_loop_end(p, offset) - offset) / scale);
2788
2789 assert(brw_inst_uip(devinfo, insn) != 0);
2790 assert(brw_inst_jip(devinfo, insn) != 0);
2791 break;
2792
2793 case BRW_OPCODE_ENDIF: {
2794 int32_t jump = (block_end_offset == 0) ?
2795 1 * br : (block_end_offset - offset) / scale;
2796 if (devinfo->gen >= 7)
2797 brw_inst_set_jip(devinfo, insn, jump);
2798 else
2799 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2800 break;
2801 }
2802
2803 case BRW_OPCODE_HALT:
2804 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2805 *
2806 * "In case of the halt instruction not inside any conditional
2807 * code block, the value of <JIP> and <UIP> should be the
2808 * same. In case of the halt instruction inside conditional code
2809 * block, the <UIP> should be the end of the program, and the
2810 * <JIP> should be end of the most inner conditional code block."
2811 *
2812 * The uip will have already been set by whoever set up the
2813 * instruction.
2814 */
2815 if (block_end_offset == 0) {
2816 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2817 } else {
2818 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2819 }
2820 assert(brw_inst_uip(devinfo, insn) != 0);
2821 assert(brw_inst_jip(devinfo, insn) != 0);
2822 break;
2823 }
2824 }
2825 }
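
/* Worked example (offsets assumed): for a BREAK at byte offset 64 whose
 * block ends at byte offset 128, Gen7 (br = 2, scale = 8) stores
 * JIP = (128 - 64) / 8 = 8 in 64-bit-chunk units, while Gen8 (br = 16,
 * scale = 1) stores the raw byte distance of 64.
 */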
2826
2827 void brw_ff_sync(struct brw_codegen *p,
2828 struct brw_reg dest,
2829 unsigned msg_reg_nr,
2830 struct brw_reg src0,
2831 bool allocate,
2832 unsigned response_length,
2833 bool eot)
2834 {
2835 const struct brw_device_info *devinfo = p->devinfo;
2836 brw_inst *insn;
2837
2838 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2839
2840 insn = next_insn(p, BRW_OPCODE_SEND);
2841 brw_set_dest(p, insn, dest);
2842 brw_set_src0(p, insn, src0);
2843 brw_set_src1(p, insn, brw_imm_d(0));
2844
2845 if (devinfo->gen < 6)
2846 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2847
2848 brw_set_ff_sync_message(p,
2849 insn,
2850 allocate,
2851 response_length,
2852 eot);
2853 }
2854
2855 /**
2856 * Emit the SEND instruction necessary to generate stream output data on Gen6
2857 * (for transform feedback).
2858 *
2859 * If send_commit_msg is true, this is the last piece of stream output data
2860 * from this thread, so send the data as a committed write. According to the
2861 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2862 *
2863 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2864 * writes are complete by sending the final write as a committed write."
2865 */
2866 void
2867 brw_svb_write(struct brw_codegen *p,
2868 struct brw_reg dest,
2869 unsigned msg_reg_nr,
2870 struct brw_reg src0,
2871 unsigned binding_table_index,
2872 bool send_commit_msg)
2873 {
2874 brw_inst *insn;
2875
2876 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2877
2878 insn = next_insn(p, BRW_OPCODE_SEND);
2879 brw_set_dest(p, insn, dest);
2880 brw_set_src0(p, insn, src0);
2881 brw_set_src1(p, insn, brw_imm_d(0));
2882 brw_set_dp_write_message(p, insn,
2883 binding_table_index,
2884 0, /* msg_control: ignored */
2885 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2886 1, /* msg_length */
2887 true, /* header_present */
2888 0, /* last_render_target: ignored */
2889 send_commit_msg, /* response_length */
2890 0, /* end_of_thread */
2891 send_commit_msg); /* send_commit_msg */
2892 }
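
/* Illustrative usage (a hypothetical sketch; surf_index names an assumed
 * binding table entry): emit the thread's final SVB write as a committed
 * write, per the PRM requirement quoted above:
 *
 *    brw_svb_write(p, dst, 1, src0, surf_index, true);
 */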
2893
2894 static unsigned
2895 brw_surface_payload_size(struct brw_codegen *p,
2896 unsigned num_channels,
2897 bool has_simd4x2,
2898 bool has_simd16)
2899 {
2900 if (has_simd4x2 &&
2901 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2902 return 1;
2903 else if (has_simd16 &&
2904 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2905 return 2 * num_channels;
2906 else
2907 return num_channels;
2908 }
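
/* For example (assuming the Align1 defaults): a SIMD16 message with four
 * channels yields a data payload of 2 * 4 = 8 registers, while a SIMD4x2
 * Align16 message packs the same four channels into a single register.
 */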
2909
2910 static void
2911 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2912 brw_inst *insn,
2913 unsigned atomic_op,
2914 bool response_expected)
2915 {
2916 const struct brw_device_info *devinfo = p->devinfo;
2917 unsigned msg_control =
2918 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2919 (response_expected ? 1 << 5 : 0); /* Return data expected */
2920
2921 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2922 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2923 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2924 msg_control |= 1 << 4; /* SIMD8 mode */
2925
2926 brw_inst_set_dp_msg_type(devinfo, insn,
2927 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2928 } else {
2929 brw_inst_set_dp_msg_type(devinfo, insn,
2930 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2931 }
2932 } else {
2933 brw_inst_set_dp_msg_type(devinfo, insn,
2934 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2935
2936 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2937 msg_control |= 1 << 4; /* SIMD8 mode */
2938 }
2939
2940 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2941 }
2942
2943 void
2944 brw_untyped_atomic(struct brw_codegen *p,
2945 struct brw_reg dst,
2946 struct brw_reg payload,
2947 struct brw_reg surface,
2948 unsigned atomic_op,
2949 unsigned msg_length,
2950 bool response_expected)
2951 {
2952 const struct brw_device_info *devinfo = p->devinfo;
2953 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2954 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2955 GEN7_SFID_DATAPORT_DATA_CACHE);
2956 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2957 /* Mask out unused components -- This is especially important in Align16
2958 * mode on generations that don't have native support for SIMD4x2 atomics,
2959 * because unused but enabled components will cause the dataport to perform
2960 * additional atomic operations on the addresses that happen to be in the
2961 * uninitialized Y, Z and W coordinates of the payload.
2962 */
2963 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2964 struct brw_inst *insn = brw_send_indirect_surface_message(
2965 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2966 brw_surface_payload_size(p, response_expected,
2967 devinfo->gen >= 8 || devinfo->is_haswell, true),
2968 align1);
2969
2970 brw_set_dp_untyped_atomic_message(
2971 p, insn, atomic_op, response_expected);
2972 }
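
/* Illustrative usage (a hypothetical sketch; payload carries the address and
 * source operands, mlen its assumed length): an atomic add that returns the
 * old value through dst:
 *
 *    brw_untyped_atomic(p, dst, payload, surface, BRW_AOP_ADD, mlen, true);
 */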
2973
2974 static void
2975 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2976 struct brw_inst *insn,
2977 unsigned num_channels)
2978 {
2979 const struct brw_device_info *devinfo = p->devinfo;
2980 /* Set mask of 32-bit channels to drop. */
2981 unsigned msg_control = 0xf & (0xf << num_channels);
2982
2983 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2984 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2985 msg_control |= 1 << 4; /* SIMD16 mode */
2986 else
2987 msg_control |= 2 << 4; /* SIMD8 mode */
2988 }
2989
2990 brw_inst_set_dp_msg_type(devinfo, insn,
2991 (devinfo->gen >= 8 || devinfo->is_haswell ?
2992 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2993 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2994 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2995 }
2996
2997 void
2998 brw_untyped_surface_read(struct brw_codegen *p,
2999 struct brw_reg dst,
3000 struct brw_reg payload,
3001 struct brw_reg surface,
3002 unsigned msg_length,
3003 unsigned num_channels)
3004 {
3005 const struct brw_device_info *devinfo = p->devinfo;
3006 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3007 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3008 GEN7_SFID_DATAPORT_DATA_CACHE);
3009 struct brw_inst *insn = brw_send_indirect_surface_message(
3010 p, sfid, dst, payload, surface, msg_length,
3011 brw_surface_payload_size(p, num_channels, true, true),
3012 false);
3013
3014 brw_set_dp_untyped_surface_read_message(
3015 p, insn, num_channels);
3016 }
3017
3018 static void
3019 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3020 struct brw_inst *insn,
3021 unsigned num_channels)
3022 {
3023 const struct brw_device_info *devinfo = p->devinfo;
3024 /* Set mask of 32-bit channels to drop. */
3025 unsigned msg_control = 0xf & (0xf << num_channels);
3026
3027 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3028 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3029 msg_control |= 1 << 4; /* SIMD16 mode */
3030 else
3031 msg_control |= 2 << 4; /* SIMD8 mode */
3032 } else {
3033 if (devinfo->gen >= 8 || devinfo->is_haswell)
3034 msg_control |= 0 << 4; /* SIMD4x2 mode */
3035 else
3036 msg_control |= 2 << 4; /* SIMD8 mode */
3037 }
3038
3039 brw_inst_set_dp_msg_type(devinfo, insn,
3040 devinfo->gen >= 8 || devinfo->is_haswell ?
3041 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3042 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3043 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3044 }
3045
3046 void
3047 brw_untyped_surface_write(struct brw_codegen *p,
3048 struct brw_reg payload,
3049 struct brw_reg surface,
3050 unsigned msg_length,
3051 unsigned num_channels)
3052 {
3053 const struct brw_device_info *devinfo = p->devinfo;
3054 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3055 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3056 GEN7_SFID_DATAPORT_DATA_CACHE);
3057 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3058 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3059 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3060 WRITEMASK_X : WRITEMASK_XYZW;
3061 struct brw_inst *insn = brw_send_indirect_surface_message(
3062 p, sfid, brw_writemask(brw_null_reg(), mask),
3063 payload, surface, msg_length, 0, align1);
3064
3065 brw_set_dp_untyped_surface_write_message(
3066 p, insn, num_channels);
3067 }
3068
3069 static void
3070 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3071 struct brw_inst *insn,
3072 unsigned atomic_op,
3073 bool response_expected)
3074 {
3075 const struct brw_device_info *devinfo = p->devinfo;
3076 unsigned msg_control =
3077 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3078 (response_expected ? 1 << 5 : 0); /* Return data expected */
3079
3080 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3081 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3082 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3083 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3084
3085 brw_inst_set_dp_msg_type(devinfo, insn,
3086 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3087 } else {
3088 brw_inst_set_dp_msg_type(devinfo, insn,
3089 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3090 }
3091
3092 } else {
3093 brw_inst_set_dp_msg_type(devinfo, insn,
3094 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3095
3096 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3097 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3098 }
3099
3100 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3101 }
3102
3103 void
3104 brw_typed_atomic(struct brw_codegen *p,
3105 struct brw_reg dst,
3106 struct brw_reg payload,
3107 struct brw_reg surface,
3108 unsigned atomic_op,
3109 unsigned msg_length,
3110 bool response_expected) {
3111 const struct brw_device_info *devinfo = p->devinfo;
3112 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3113 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3114 GEN6_SFID_DATAPORT_RENDER_CACHE);
3115 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3116 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3117 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3118 struct brw_inst *insn = brw_send_indirect_surface_message(
3119 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3120 brw_surface_payload_size(p, response_expected,
3121 devinfo->gen >= 8 || devinfo->is_haswell, false),
3122 true);
3123
3124 brw_set_dp_typed_atomic_message(
3125 p, insn, atomic_op, response_expected);
3126 }
3127
3128 static void
3129 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3130 struct brw_inst *insn,
3131 unsigned num_channels)
3132 {
3133 const struct brw_device_info *devinfo = p->devinfo;
3134 /* Set mask of unused channels. */
3135 unsigned msg_control = 0xf & (0xf << num_channels);
3136
3137 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3138 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3139 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3140 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3141 else
3142 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3143 }
3144
3145 brw_inst_set_dp_msg_type(devinfo, insn,
3146 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3147 } else {
3148 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3149 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3150 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3151 }
3152
3153 brw_inst_set_dp_msg_type(devinfo, insn,
3154 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3155 }
3156
3157 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3158 }
3159
3160 void
3161 brw_typed_surface_read(struct brw_codegen *p,
3162 struct brw_reg dst,
3163 struct brw_reg payload,
3164 struct brw_reg surface,
3165 unsigned msg_length,
3166 unsigned num_channels)
3167 {
3168 const struct brw_device_info *devinfo = p->devinfo;
3169 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3170 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3171 GEN6_SFID_DATAPORT_RENDER_CACHE);
3172 struct brw_inst *insn = brw_send_indirect_surface_message(
3173 p, sfid, dst, payload, surface, msg_length,
3174 brw_surface_payload_size(p, num_channels,
3175 devinfo->gen >= 8 || devinfo->is_haswell, false),
3176 true);
3177
3178 brw_set_dp_typed_surface_read_message(
3179 p, insn, num_channels);
3180 }
3181
3182 static void
3183 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3184 struct brw_inst *insn,
3185 unsigned num_channels)
3186 {
3187 const struct brw_device_info *devinfo = p->devinfo;
3188 /* Set mask of unused channels. */
3189 unsigned msg_control = 0xf & (0xf << num_channels);
3190
3191 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3192 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3193 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3194 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3195 else
3196 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3197 }
3198
3199 brw_inst_set_dp_msg_type(devinfo, insn,
3200 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3201
3202 } else {
3203 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3204 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3205 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3206 }
3207
3208 brw_inst_set_dp_msg_type(devinfo, insn,
3209 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3210 }
3211
3212 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3213 }
3214
3215 void
3216 brw_typed_surface_write(struct brw_codegen *p,
3217 struct brw_reg payload,
3218 struct brw_reg surface,
3219 unsigned msg_length,
3220 unsigned num_channels)
3221 {
3222 const struct brw_device_info *devinfo = p->devinfo;
3223 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3224 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3225 GEN6_SFID_DATAPORT_RENDER_CACHE);
3226 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3227 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3228 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3229 WRITEMASK_X : WRITEMASK_XYZW);
3230 struct brw_inst *insn = brw_send_indirect_surface_message(
3231 p, sfid, brw_writemask(brw_null_reg(), mask),
3232 payload, surface, msg_length, 0, true);
3233
3234 brw_set_dp_typed_surface_write_message(
3235 p, insn, num_channels);
3236 }
3237
3238 static void
3239 brw_set_memory_fence_message(struct brw_codegen *p,
3240 struct brw_inst *insn,
3241 enum brw_message_target sfid,
3242 bool commit_enable)
3243 {
3244 const struct brw_device_info *devinfo = p->devinfo;
3245
3246 brw_set_message_descriptor(p, insn, sfid,
3247 1 /* message length */,
3248 (commit_enable ? 1 : 0) /* response length */,
3249 true /* header present */,
3250 false);
3251
3252 switch (sfid) {
3253 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3254 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3255 break;
3256 case GEN7_SFID_DATAPORT_DATA_CACHE:
3257 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3258 break;
3259 default:
3260 unreachable("Not reached");
3261 }
3262
3263 if (commit_enable)
3264 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3265 }
3266
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need
       * to flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of
       * the first to trigger a pipeline stall -- this way future render and
       * data cache messages will be properly ordered with respect to past
       * data and render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}

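/**
 * Emit a GEN7_SFID_PIXEL_INTERPOLATOR SEND.  mode selects the pixel
 * interpolator message type and data carries the mode-specific argument
 * (e.g. a sample index or pixel offset), either as an immediate or in a
 * register.  The SIMD mode is derived from the default execution size.
 */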
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;
   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);

   /* brw_send_indirect_message will automatically use a direct SEND message
    * if data is actually an immediate.
    */
   insn = brw_send_indirect_message(p,
                                    GEN7_SFID_PIXEL_INTERPOLATOR,
                                    dest,
                                    mrf,
                                    vec1(data));
   brw_inst_set_mlen(devinfo, insn, msg_length);
   brw_inst_set_rlen(devinfo, insn, response_length);

   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
}

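/**
 * Set the first component of dst to the index of the lowest enabled channel
 * of the current execution mask, relative to the current quarter control.
 * In SIMD4x2 (align16) mode the result is zero if the first channel is
 * enabled and one otherwise.
 */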
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just
          * find the first bit set in the mask register.  The same register
          * exists on HSW already but it reads back as all ones when the
          * current instruction has execution masking disabled, so it's
          * kind of useless there.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value
          * of this register, so you'll get the first active channel
          * relative to the specified quarter control as a result.
          */
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking
          * and a conditional modifier enabled in order to get the full
          * execution mask in f1.0.  We could use a single 32-wide move here
          * if it weren't for the hardware bug that causes channel enables
          * to be applied incorrectly to the second half of 32-wide
          * instructions on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination first without and then with execution
          * masking in order to find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

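/**
 * Copy the vector component of src selected by idx into dst.  idx may be an
 * immediate or a variable index in a register; in the latter case indirect
 * addressing (align1) or a predicated SEL (align16) is used to pick the
 * component at run time.
 */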
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
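
         /* E.g. (hypothetical case): for a 4-byte component with a unit
          * horizontal stride (hstride encoding 1) the shift count above is
          * log2(4) + 1 - 1 = 2, so the address register ends up holding
          * idx * 4 bytes.
          */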

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
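
         /* E.g. (hypothetical case): a source at byte offset 600 would get
          * 600 - 600 % 512 = 512 added to the address register here, leaving
          * 600 % 512 = 88 for the indirect addressing immediate below.
          */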

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
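   /* Message length 2 (presumably one register holding the buffer offset
    * and one holding the value to add) and response length 0 -- the atomic
    * return value is discarded, as the null destination above indicates.
    */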
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}