Merge remote-tracking branch 'mesa-public/master' into vulkan
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "util/ralloc.h"
38
39 /**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case. This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46 void
47 gen6_resolve_implied_move(struct brw_codegen *p,
48 struct brw_reg *src,
49 unsigned msg_reg_nr)
50 {
51 const struct brw_device_info *devinfo = p->devinfo;
52 if (devinfo->gen < 6)
53 return;
54
55 if (src->file == BRW_MESSAGE_REGISTER_FILE)
56 return;
57
58 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59 brw_push_insn_state(p);
60 brw_set_default_exec_size(p, BRW_EXECUTE_8);
61 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 retype(*src, BRW_REGISTER_TYPE_UD));
65 brw_pop_insn_state(p);
66 }
67 *src = brw_message_reg(msg_reg_nr);
68 }
69
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
77 *
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
80 */
81 const struct brw_device_info *devinfo = p->devinfo;
82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83 reg->file = BRW_GENERAL_REGISTER_FILE;
84 reg->nr += GEN7_MRF_HACK_START;
85 }
86 }
87
88 /**
89 * Convert a brw_reg_type enumeration value into the hardware representation.
90 *
91 * The hardware encoding may depend on whether the value is an immediate.
92 */
93 unsigned
94 brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
95 enum brw_reg_type type, unsigned file)
96 {
97 if (file == BRW_IMMEDIATE_VALUE) {
98 const static int imm_hw_types[] = {
99 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
100 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
101 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
102 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
103 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
104 [BRW_REGISTER_TYPE_UB] = -1,
105 [BRW_REGISTER_TYPE_B] = -1,
106 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
107 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
108 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
109 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
110 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
111 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
112 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
113 };
114 assert(type < ARRAY_SIZE(imm_hw_types));
115 assert(imm_hw_types[type] != -1);
116 assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
117 return imm_hw_types[type];
118 } else {
119 /* Non-immediate registers */
120 const static int hw_types[] = {
121 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
122 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
123 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
124 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
125 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
126 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
127 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
128 [BRW_REGISTER_TYPE_UV] = -1,
129 [BRW_REGISTER_TYPE_VF] = -1,
130 [BRW_REGISTER_TYPE_V] = -1,
131 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
132 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
133 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
134 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
135 };
136 assert(type < ARRAY_SIZE(hw_types));
137 assert(hw_types[type] != -1);
138 assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
139 assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
140 return hw_types[type];
141 }
142 }
143
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles direct and register-indirect addressing in both Align1 and
 * Align16 access modes, and may shrink the instruction's execution size
 * to match a narrow destination register.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* GRF/MRF register numbers only have 7 bits in the encoding. */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7+ MRF operands are rewritten onto the high GRFs. */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* Stride 0 is not a valid destination stride; encode 1 instead. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* The indirect address-immediate fields are different sizes in
       * align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.dw1.bits.indirect_offset);
         /* Stride 0 is not a valid destination stride; encode 1 instead. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.dw1.bits.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    */
   if (dest.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
208
209 extern int reg_type_size[];
210
211 static void
212 validate_reg(const struct brw_device_info *devinfo,
213 brw_inst *inst, struct brw_reg reg)
214 {
215 const int hstride_for_reg[] = {0, 1, 2, 4};
216 const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
217 const int width_for_reg[] = {1, 2, 4, 8, 16};
218 const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
219 int width, hstride, vstride, execsize;
220
221 if (reg.file == BRW_IMMEDIATE_VALUE) {
222 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
223 * mean the destination has to be 128-bit aligned and the
224 * destination horiz stride has to be a word.
225 */
226 if (reg.type == BRW_REGISTER_TYPE_V) {
227 assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
228 reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
229 }
230
231 return;
232 }
233
234 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
235 reg.file == BRW_ARF_NULL)
236 return;
237
238 assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
239 hstride = hstride_for_reg[reg.hstride];
240
241 if (reg.vstride == 0xf) {
242 vstride = -1;
243 } else {
244 assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
245 vstride = vstride_for_reg[reg.vstride];
246 }
247
248 assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
249 width = width_for_reg[reg.width];
250
251 assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
252 brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
253 execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
254
255 /* Restrictions from 3.3.10: Register Region Restrictions. */
256 /* 3. */
257 assert(execsize >= width);
258
259 /* 4. */
260 if (execsize == width && hstride != 0) {
261 assert(vstride == -1 || vstride == width * hstride);
262 }
263
264 /* 5. */
265 if (execsize == width && hstride == 0) {
266 /* no restriction on vstride. */
267 }
268
269 /* 6. */
270 if (width == 1) {
271 assert(hstride == 0);
272 }
273
274 /* 7. */
275 if (execsize == 1 && width == 1) {
276 assert(hstride == 0);
277 assert(vstride == 0);
278 }
279
280 /* 8. */
281 if (vstride == 0 && hstride == 0) {
282 assert(width == 1);
283 }
284
285 /* 10. Check destination issues. */
286 }
287
static bool
is_compactable_immediate(unsigned imm)
{
   /* A compacted instruction stores the low 12 bits of an immediate
    * verbatim and replicates bit 12 through the upper 20 bits, so a value
    * is representable iff those upper bits are all-zero or all-one.
    */
   const unsigned upper = imm & ~0xfffu;
   return upper == 0 || upper == 0xfffff000;
}
297
/**
 * Encode \p reg as the first source operand (src0) of \p inst.
 *
 * Handles immediate, direct, and register-indirect operands in both
 * Align1 and Align16 access modes.  For immediates this also fills in
 * src1's register file/type, and may retype the operand (F -> VF, D -> UD)
 * so that the instruction becomes eligible for compaction.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* GRF/MRF register numbers only have 7 bits in the encoding. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* On Gen7+ MRF operands are rewritten onto the high GRFs. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       */
      brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen < 6) {
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      } else {
         brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregister numbers are in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
         } else {
            /* NOTE(review): this stores the indirect offset into the
             * subregister field, overwriting the reg.subnr value written
             * just above, rather than into an Align16 address-immediate
             * field.  It looks like this should be an ia16 addr-imm setter;
             * confirm against the field layout in brw_inst.h.
             */
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A width-1 region with exec size 1 is a true scalar; encode the
          * canonical <0;1,0> region for it.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
436
437
/**
 * Encode \p reg as the second source operand (src1) of \p inst.
 *
 * src1 is more restricted than src0: it may not be an MRF and may not use
 * register-indirect addressing.  Conversely, it is the only source slot
 * allowed to hold an immediate in a two-source instruction.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* GRF register numbers only have 7 bits in the encoding. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* On Gen7+ MRF operands are rewritten onto the high GRFs; afterwards no
    * MRF may remain in src1.
    */
   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* The 32-bit immediate payload occupies the src1 field storage. */
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A width-1 region with exec size 1 is a true scalar; encode the
          * canonical <0;1,0> region for it.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
508
509 /**
510 * Set the Message Descriptor and Extended Message Descriptor fields
511 * for SEND messages.
512 *
513 * \note This zeroes out the Function Control bits, so it must be called
514 * \b before filling out any message-specific data. Callers can
515 * choose not to fill in irrelevant bits; they will be zero.
516 */
517 static void
518 brw_set_message_descriptor(struct brw_codegen *p,
519 brw_inst *inst,
520 enum brw_message_target sfid,
521 unsigned msg_length,
522 unsigned response_length,
523 bool header_present,
524 bool end_of_thread)
525 {
526 const struct brw_device_info *devinfo = p->devinfo;
527
528 brw_set_src1(p, inst, brw_imm_d(0));
529
530 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
531 * itself; instead, it will be a MOV/OR into the address register.
532 *
533 * In this case, we avoid setting the extended message descriptor bits,
534 * since they go on the later SEND/SENDC instead and if set here would
535 * instead clobber the conditionalmod bits.
536 */
537 unsigned opcode = brw_inst_opcode(devinfo, inst);
538 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
539 brw_inst_set_sfid(devinfo, inst, sfid);
540 }
541
542 brw_inst_set_mlen(devinfo, inst, msg_length);
543 brw_inst_set_rlen(devinfo, inst, response_length);
544 brw_inst_set_eot(devinfo, inst, end_of_thread);
545
546 if (devinfo->gen >= 5) {
547 brw_inst_set_header_present(devinfo, inst, header_present);
548 }
549 }
550
551 static void brw_set_math_message( struct brw_codegen *p,
552 brw_inst *inst,
553 unsigned function,
554 unsigned integer_type,
555 bool low_precision,
556 unsigned dataType )
557 {
558 const struct brw_device_info *devinfo = p->devinfo;
559 unsigned msg_length;
560 unsigned response_length;
561
562 /* Infer message length from the function */
563 switch (function) {
564 case BRW_MATH_FUNCTION_POW:
565 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
566 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
567 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
568 msg_length = 2;
569 break;
570 default:
571 msg_length = 1;
572 break;
573 }
574
575 /* Infer response length from the function */
576 switch (function) {
577 case BRW_MATH_FUNCTION_SINCOS:
578 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
579 response_length = 2;
580 break;
581 default:
582 response_length = 1;
583 break;
584 }
585
586
587 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
588 msg_length, response_length, false, false);
589 brw_inst_set_math_msg_function(devinfo, inst, function);
590 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
591 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
592 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
593 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
594 brw_inst_set_saturate(devinfo, inst, 0);
595 }
596
597
598 static void brw_set_ff_sync_message(struct brw_codegen *p,
599 brw_inst *insn,
600 bool allocate,
601 unsigned response_length,
602 bool end_of_thread)
603 {
604 const struct brw_device_info *devinfo = p->devinfo;
605
606 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
607 1, response_length, true, end_of_thread);
608 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
609 brw_inst_set_urb_allocate(devinfo, insn, allocate);
610 /* The following fields are not used by FF_SYNC: */
611 brw_inst_set_urb_global_offset(devinfo, insn, 0);
612 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
613 brw_inst_set_urb_used(devinfo, insn, 0);
614 brw_inst_set_urb_complete(devinfo, insn, 0);
615 }
616
617 static void brw_set_urb_message( struct brw_codegen *p,
618 brw_inst *insn,
619 enum brw_urb_write_flags flags,
620 unsigned msg_length,
621 unsigned response_length,
622 unsigned offset,
623 unsigned swizzle_control )
624 {
625 const struct brw_device_info *devinfo = p->devinfo;
626
627 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
628 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
629 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
630
631 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
632 msg_length, response_length, true,
633 flags & BRW_URB_WRITE_EOT);
634
635 if (flags & BRW_URB_WRITE_OWORD) {
636 assert(msg_length == 2); /* header + one OWORD of data */
637 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
638 } else {
639 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
640 }
641
642 brw_inst_set_urb_global_offset(devinfo, insn, offset);
643 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
644
645 if (devinfo->gen < 8) {
646 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
647 }
648
649 if (devinfo->gen < 7) {
650 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
651 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
652 } else {
653 brw_inst_set_urb_per_slot_offset(devinfo, insn,
654 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
655 }
656 }
657
658 void
659 brw_set_dp_write_message(struct brw_codegen *p,
660 brw_inst *insn,
661 unsigned binding_table_index,
662 unsigned msg_control,
663 unsigned msg_type,
664 unsigned msg_length,
665 bool header_present,
666 unsigned last_render_target,
667 unsigned response_length,
668 unsigned end_of_thread,
669 unsigned send_commit_msg)
670 {
671 const struct brw_device_info *devinfo = p->devinfo;
672 unsigned sfid;
673
674 if (devinfo->gen >= 7) {
675 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
676 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
677 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
678 else
679 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
680 } else if (devinfo->gen == 6) {
681 /* Use the render cache for all write messages. */
682 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
683 } else {
684 sfid = BRW_SFID_DATAPORT_WRITE;
685 }
686
687 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
688 header_present, end_of_thread);
689
690 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
691 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
692 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
693 brw_inst_set_rt_last(devinfo, insn, last_render_target);
694 if (devinfo->gen < 7) {
695 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
696 }
697 }
698
699 void
700 brw_set_dp_read_message(struct brw_codegen *p,
701 brw_inst *insn,
702 unsigned binding_table_index,
703 unsigned msg_control,
704 unsigned msg_type,
705 unsigned target_cache,
706 unsigned msg_length,
707 bool header_present,
708 unsigned response_length)
709 {
710 const struct brw_device_info *devinfo = p->devinfo;
711 unsigned sfid;
712
713 if (devinfo->gen >= 7) {
714 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
715 } else if (devinfo->gen == 6) {
716 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
717 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
718 else
719 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
720 } else {
721 sfid = BRW_SFID_DATAPORT_READ;
722 }
723
724 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
725 header_present, false);
726
727 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
728 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
729 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
730 if (devinfo->gen < 6)
731 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
732 }
733
734 void
735 brw_set_sampler_message(struct brw_codegen *p,
736 brw_inst *inst,
737 unsigned binding_table_index,
738 unsigned sampler,
739 unsigned msg_type,
740 unsigned response_length,
741 unsigned msg_length,
742 unsigned header_present,
743 unsigned simd_mode,
744 unsigned return_format)
745 {
746 const struct brw_device_info *devinfo = p->devinfo;
747
748 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
749 response_length, header_present, false);
750
751 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
752 brw_inst_set_sampler(devinfo, inst, sampler);
753 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
754 if (devinfo->gen >= 5) {
755 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
756 } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
757 brw_inst_set_sampler_return_format(devinfo, inst, return_format);
758 }
759 }
760
761 static void
762 gen7_set_dp_scratch_message(struct brw_codegen *p,
763 brw_inst *inst,
764 bool write,
765 bool dword,
766 bool invalidate_after_read,
767 unsigned num_regs,
768 unsigned addr_offset,
769 unsigned mlen,
770 unsigned rlen,
771 bool header_present)
772 {
773 const struct brw_device_info *devinfo = p->devinfo;
774 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
775 (devinfo->gen >= 8 && num_regs == 8));
776 brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
777 mlen, rlen, header_present, false);
778 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
779 brw_inst_set_scratch_read_write(devinfo, inst, write);
780 brw_inst_set_scratch_type(devinfo, inst, dword);
781 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
782 brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1);
783 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
784 }
785
786 #define next_insn brw_next_insn
787 brw_inst *
788 brw_next_insn(struct brw_codegen *p, unsigned opcode)
789 {
790 const struct brw_device_info *devinfo = p->devinfo;
791 brw_inst *insn;
792
793 if (p->nr_insn + 1 > p->store_size) {
794 p->store_size <<= 1;
795 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
796 }
797
798 p->next_insn_offset += 16;
799 insn = &p->store[p->nr_insn++];
800 memcpy(insn, p->current, sizeof(*insn));
801
802 brw_inst_set_opcode(devinfo, insn, opcode);
803 return insn;
804 }
805
806 static brw_inst *
807 brw_alu1(struct brw_codegen *p, unsigned opcode,
808 struct brw_reg dest, struct brw_reg src)
809 {
810 brw_inst *insn = next_insn(p, opcode);
811 brw_set_dest(p, insn, dest);
812 brw_set_src0(p, insn, src);
813 return insn;
814 }
815
816 static brw_inst *
817 brw_alu2(struct brw_codegen *p, unsigned opcode,
818 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
819 {
820 brw_inst *insn = next_insn(p, opcode);
821 brw_set_dest(p, insn, dest);
822 brw_set_src0(p, insn, src0);
823 brw_set_src1(p, insn, src1);
824 return insn;
825 }
826
827 static int
828 get_3src_subreg_nr(struct brw_reg reg)
829 {
830 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
831 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
832 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
833 } else {
834 return reg.subnr / 4;
835 }
836 }
837
/**
 * Emit and encode a three-source instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * The 3-src encoding is more restrictive than the 2-src one: Align16 access
 * mode only, GRF sources (GRF or MRF destination), direct addressing only,
 * and a single type field shared by the destination and all three sources.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   /* On Gen7+ MRF destinations are rewritten onto the high GRFs. */
   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   /* Destination: GRF/MRF, direct addressing, F/D/UD types only. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Gen6 has a dedicated bit selecting an MRF destination. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* 3-src destination subregisters are counted in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask);

   /* Source 0: GRF only; rep_ctrl replicates a scalar source. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   /* Source 1: same constraints as source 0. */
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   /* Source 2: same constraints as source 0. */
   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
924
925
926 /***********************************************************************
927 * Convenience routines.
928 */
/* Emitter for a one-source ALU instruction: brw_<OP>(p, dest, src0). */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    \
}
936
/* Emitter for a two-source ALU instruction: brw_<OP>(p, dest, src0, src1). */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
945
/* Emitter for a three-source ALU instruction:
 * brw_<OP>(p, dest, src0, src1, src2).
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
955
/* Like ALU3, but asserts that all four operands are float-typed.  Used for
 * MAD and LRP, which brw_alu3() encodes with a single shared source type.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
969
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct brw_device_info *devinfo = p->devinfo;			      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {						      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);	      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);	      \
   }									      \
}
995
996
/* Instantiate the simple unary/binary/ternary emitters from the macros
 * above.  Opcodes with extra operand constraints (ADD, MUL, AVG, LINE,
 * PLN, ...) get hand-written emitters below instead.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1029
1030
1031 brw_inst *
1032 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1033 struct brw_reg src0, struct brw_reg src1)
1034 {
1035 /* 6.2.2: add */
1036 if (src0.type == BRW_REGISTER_TYPE_F ||
1037 (src0.file == BRW_IMMEDIATE_VALUE &&
1038 src0.type == BRW_REGISTER_TYPE_VF)) {
1039 assert(src1.type != BRW_REGISTER_TYPE_UD);
1040 assert(src1.type != BRW_REGISTER_TYPE_D);
1041 }
1042
1043 if (src1.type == BRW_REGISTER_TYPE_F ||
1044 (src1.file == BRW_IMMEDIATE_VALUE &&
1045 src1.type == BRW_REGISTER_TYPE_VF)) {
1046 assert(src0.type != BRW_REGISTER_TYPE_UD);
1047 assert(src0.type != BRW_REGISTER_TYPE_D);
1048 }
1049
1050 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1051 }
1052
1053 brw_inst *
1054 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1055 struct brw_reg src0, struct brw_reg src1)
1056 {
1057 assert(dest.type == src0.type);
1058 assert(src0.type == src1.type);
1059 switch (src0.type) {
1060 case BRW_REGISTER_TYPE_B:
1061 case BRW_REGISTER_TYPE_UB:
1062 case BRW_REGISTER_TYPE_W:
1063 case BRW_REGISTER_TYPE_UW:
1064 case BRW_REGISTER_TYPE_D:
1065 case BRW_REGISTER_TYPE_UD:
1066 break;
1067 default:
1068 unreachable("Bad type for brw_AVG");
1069 }
1070
1071 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1072 }
1073
1074 brw_inst *
1075 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1076 struct brw_reg src0, struct brw_reg src1)
1077 {
1078 /* 6.32.38: mul */
1079 if (src0.type == BRW_REGISTER_TYPE_D ||
1080 src0.type == BRW_REGISTER_TYPE_UD ||
1081 src1.type == BRW_REGISTER_TYPE_D ||
1082 src1.type == BRW_REGISTER_TYPE_UD) {
1083 assert(dest.type != BRW_REGISTER_TYPE_F);
1084 }
1085
1086 if (src0.type == BRW_REGISTER_TYPE_F ||
1087 (src0.file == BRW_IMMEDIATE_VALUE &&
1088 src0.type == BRW_REGISTER_TYPE_VF)) {
1089 assert(src1.type != BRW_REGISTER_TYPE_UD);
1090 assert(src1.type != BRW_REGISTER_TYPE_D);
1091 }
1092
1093 if (src1.type == BRW_REGISTER_TYPE_F ||
1094 (src1.file == BRW_IMMEDIATE_VALUE &&
1095 src1.type == BRW_REGISTER_TYPE_VF)) {
1096 assert(src0.type != BRW_REGISTER_TYPE_UD);
1097 assert(src0.type != BRW_REGISTER_TYPE_D);
1098 }
1099
1100 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1101 src0.nr != BRW_ARF_ACCUMULATOR);
1102 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1103 src1.nr != BRW_ARF_ACCUMULATOR);
1104
1105 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1106 }
1107
1108 brw_inst *
1109 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1110 struct brw_reg src0, struct brw_reg src1)
1111 {
1112 src0.vstride = BRW_VERTICAL_STRIDE_0;
1113 src0.width = BRW_WIDTH_1;
1114 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1115 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1116 }
1117
1118 brw_inst *
1119 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1120 struct brw_reg src0, struct brw_reg src1)
1121 {
1122 src0.vstride = BRW_VERTICAL_STRIDE_0;
1123 src0.width = BRW_WIDTH_1;
1124 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1125 src1.vstride = BRW_VERTICAL_STRIDE_8;
1126 src1.width = BRW_WIDTH_8;
1127 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1128 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1129 }
1130
/**
 * Emit a float-to-half conversion: F32TO16 on Gen7, a converting MOV to an
 * HF destination on Gen8+.
 *
 * When the destination is UD and the hardware would leave the high 16 bits
 * of each channel undefined, a second MOV zero-fills them.  The two
 * instructions are marked NoDDClr/NoDDChk so they can pair without a
 * dependency stall between the partial writes.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Write only the low word of each dword channel (stride 2), so the
       * second MOV below can fill the high word.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the high words; suboffset(dst, 1) is the odd-word half of the
       * strided region written above.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1177
1178 brw_inst *
1179 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1180 {
1181 const struct brw_device_info *devinfo = p->devinfo;
1182 bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1183
1184 if (align16) {
1185 assert(src.type == BRW_REGISTER_TYPE_UD);
1186 } else {
1187 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1188 *
1189 * Because this instruction does not have a 16-bit floating-point
1190 * type, the source data type must be Word (W). The destination type
1191 * must be F (Float).
1192 */
1193 if (src.type == BRW_REGISTER_TYPE_UD)
1194 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1195
1196 assert(src.type == BRW_REGISTER_TYPE_W ||
1197 src.type == BRW_REGISTER_TYPE_UW ||
1198 src.type == BRW_REGISTER_TYPE_HF);
1199 }
1200
1201 if (devinfo->gen >= 8) {
1202 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1203 } else {
1204 assert(devinfo->gen == 7);
1205 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1206 }
1207 }
1208
1209
1210 void brw_NOP(struct brw_codegen *p)
1211 {
1212 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1213 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1214 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1215 brw_set_src1(p, insn, brw_imm_ud(0x0));
1216 }
1217
1218
1219
1220
1221
1222 /***********************************************************************
1223 * Comparisons, if/else/endif
1224 */
1225
1226 brw_inst *
1227 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1228 unsigned predicate_control)
1229 {
1230 const struct brw_device_info *devinfo = p->devinfo;
1231 struct brw_reg ip = brw_ip_reg();
1232 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1233
1234 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
1235 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1236 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1237 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1238
1239 return inst;
1240 }
1241
/* Push an IF (or ELSE) instruction onto the if-stack so the matching
 * ELSE/ENDIF can find it later.  The entry is stored as an offset into
 * p->store, not a pointer, because next_insn() may reallocate the store.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after pushing, so there is always room for the next push. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1254
1255 static brw_inst *
1256 pop_if_stack(struct brw_codegen *p)
1257 {
1258 p->if_stack_depth--;
1259 return &p->store[p->if_stack[p->if_stack_depth]];
1260 }
1261
1262 static void
1263 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1264 {
1265 if (p->loop_stack_array_size < p->loop_stack_depth) {
1266 p->loop_stack_array_size *= 2;
1267 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1268 p->loop_stack_array_size);
1269 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1270 p->loop_stack_array_size);
1271 }
1272
1273 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1274 p->loop_stack_depth++;
1275 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1276 }
1277
1278 static brw_inst *
1279 get_inner_do_insn(struct brw_codegen *p)
1280 {
1281 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1282 }
1283
1284 /* EU takes the value from the flag register and pushes it onto some
1285 * sort of a stack (presumably merging with any flag value already on
1286 * the stack). Within an if block, the flags at the top of the stack
1287 * control execution on each channel of the unit, eg. on each of the
1288 * 16 pixel values in our wm programs.
1289 *
1290 * When the matching 'else' instruction is reached (presumably by
1291 * countdown of the instruction count patched in by our ELSE/ENDIF
1292 * functions), the relevant flags are inverted.
1293 *
1294 * When the matching 'endif' instruction is reached, the flags are
1295 * popped off. If the stack is now empty, normal execution resumes.
1296 */
/**
 * Emit an IF instruction.  The jump targets (jump count on gen4-6, JIP/UIP
 * on gen7+) are left as zero here and patched later by brw_ENDIF() via
 * patch_IF_ELSE().  The instruction is pushed onto the if-stack so the
 * matching ELSE/ENDIF can find it.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Gen4-5: IF operates on IP, with the jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in a 16-bit immediate destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields, null dest/src0, word-immediate src1. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP fields, with src0 as an immediate. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch (except in SPF mode,
    * where IF is later converted to an ADD on IP).
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1340
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 *
 * Rather than predicating on an already-computed flag value, this form
 * compares src0 and src1 directly; the jump count is patched in later by
 * brw_ENDIF(), as with brw_IF().
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The jump count (in the word-immediate dest) starts at 0 and is patched
    * by patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                       : BRW_EXECUTE_8);
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1367
1368 /**
1369 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1370 */
1371 static void
1372 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1373 brw_inst *if_inst, brw_inst *else_inst)
1374 {
1375 const struct brw_device_info *devinfo = p->devinfo;
1376
1377 /* The next instruction (where the ENDIF would be, if it existed) */
1378 brw_inst *next_inst = &p->store[p->nr_insn];
1379
1380 assert(p->single_program_flow);
1381 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1382 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1383 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1384
1385 /* Convert IF to an ADD instruction that moves the instruction pointer
1386 * to the first instruction of the ELSE block. If there is no ELSE
1387 * block, point to where ENDIF would be. Reverse the predicate.
1388 *
1389 * There's no need to execute an ENDIF since we don't need to do any
1390 * stack operations, and if we're currently executing, we just want to
1391 * continue normally.
1392 */
1393 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1394 brw_inst_set_pred_inv(devinfo, if_inst, true);
1395
1396 if (else_inst != NULL) {
1397 /* Convert ELSE to an ADD instruction that points where the ENDIF
1398 * would be.
1399 */
1400 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1401
1402 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1403 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1404 } else {
1405 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1406 }
1407 }
1408
1409 /**
1410 * Patch IF and ELSE instructions with appropriate jump targets.
1411 */
1412 static void
1413 patch_IF_ELSE(struct brw_codegen *p,
1414 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1415 {
1416 const struct brw_device_info *devinfo = p->devinfo;
1417
1418 /* We shouldn't be patching IF and ELSE instructions in single program flow
1419 * mode when gen < 6, because in single program flow mode on those
1420 * platforms, we convert flow control instructions to conditional ADDs that
1421 * operate on IP (see brw_ENDIF).
1422 *
1423 * However, on Gen6, writing to IP doesn't work in single program flow mode
1424 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1425 * not be updated by non-flow control instructions."). And on later
1426 * platforms, there is no significant benefit to converting control flow
1427 * instructions to conditional ADDs. So we do patch IF and ELSE
1428 * instructions in single program flow mode on those platforms.
1429 */
1430 if (devinfo->gen < 6)
1431 assert(!p->single_program_flow);
1432
1433 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1434 assert(endif_inst != NULL);
1435 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1436
1437 unsigned br = brw_jump_scale(devinfo);
1438
1439 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1440 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1441
1442 if (else_inst == NULL) {
1443 /* Patch IF -> ENDIF */
1444 if (devinfo->gen < 6) {
1445 /* Turn it into an IFF, which means no mask stack operations for
1446 * all-false and jumping past the ENDIF.
1447 */
1448 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1449 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1450 br * (endif_inst - if_inst + 1));
1451 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1452 } else if (devinfo->gen == 6) {
1453 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1454 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1455 } else {
1456 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1457 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1458 }
1459 } else {
1460 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1461
1462 /* Patch IF -> ELSE */
1463 if (devinfo->gen < 6) {
1464 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1465 br * (else_inst - if_inst));
1466 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1467 } else if (devinfo->gen == 6) {
1468 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1469 br * (else_inst - if_inst + 1));
1470 }
1471
1472 /* Patch ELSE -> ENDIF */
1473 if (devinfo->gen < 6) {
1474 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1475 * matching ENDIF.
1476 */
1477 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1478 br * (endif_inst - else_inst + 1));
1479 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1480 } else if (devinfo->gen == 6) {
1481 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1482 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1483 br * (endif_inst - else_inst));
1484 } else {
1485 /* The IF instruction's JIP should point just past the ELSE */
1486 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1487 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1488 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1489 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1490 if (devinfo->gen >= 8) {
1491 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1492 * should point to ENDIF.
1493 */
1494 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1495 }
1496 }
1497 }
1498 }
1499
/**
 * Emit an ELSE instruction.  Like brw_IF(), the jump targets start at zero
 * and are patched later by brw_ENDIF() via patch_IF_ELSE(); the instruction
 * is pushed onto the if-stack on top of the matching IF.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      /* Gen4-5: ELSE operates on IP with the jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the word-immediate destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields, null dest/src0, word-immediate src1. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP fields, with src0 as an immediate. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch (except in SPF mode). */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1537
/**
 * Terminate an IF/ELSE block: pop the IF (and optional ELSE) off the
 * if-stack, emit the ENDIF (unless SPF mode turns the whole construct into
 * ADDs on IP), and patch the jump targets of the IF/ELSE.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encodings, as in brw_IF()/brw_ELSE(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1618
/**
 * Emit a BREAK instruction.  The jump targets are left as zero here; on
 * gen4-5 they are filled in by brw_patch_break_cont() when the enclosing
 * WHILE is emitted, and on gen6+ they are patched later (see the comment
 * above brw_patch_break_cont()).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop the mask-stack entries of any IFs open inside this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                       : BRW_EXECUTE_8);

   return insn;
}
1646
/**
 * Emit a CONTINUE instruction.  Like brw_BREAK(), the jump targets start at
 * zero and are patched later (brw_patch_break_cont() on gen4-5).
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the mask-stack entries of any IFs open inside this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                       : BRW_EXECUTE_8);
   return insn;
}
1671
/**
 * Emit a HALT instruction (gen6+).  The UIP/JIP jump targets are emitted as
 * zero and updated later by the caller.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   if (p->compressed) {
      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16);
   } else {
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8);
   }
   return insn;
}
1695
1696 /* DO/WHILE loop:
1697 *
1698 * The DO/WHILE is just an unterminated loop -- break or continue are
1699 * used for control within the loop. We have a few ways they can be
1700 * done.
1701 *
1702 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1703 * jip and no DO instruction.
1704 *
1705 * For non-uniform control flow pre-gen6, there's a DO instruction to
1706 * push the mask, and a WHILE to jump back, and BREAK to get out and
1707 * pop the mask.
1708 *
1709 * For gen6, there's no more mask stack, so no need for DO. WHILE
1710 * just points back to the first instruction of the loop.
1711 */
/**
 * Begin a DO/WHILE loop.  On gen6+ (and in SPF mode) no DO instruction is
 * emitted; the loop start is simply recorded on the loop stack so the
 * matching brw_WHILE() can jump back to it.  On gen4-5 (non-SPF), an actual
 * DO instruction is emitted to push the execution mask.
 *
 * Returns a pointer to the loop-start slot in the instruction store; note
 * this pointer is invalidated if next_insn() reallocates p->store.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1738
1739 /**
1740 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1741 * instruction here.
1742 *
1743 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1744 * nesting, since it can always just point to the end of the block/current loop.
1745 */
1746 static void
1747 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1748 {
1749 const struct brw_device_info *devinfo = p->devinfo;
1750 brw_inst *do_inst = get_inner_do_insn(p);
1751 brw_inst *inst;
1752 unsigned br = brw_jump_scale(devinfo);
1753
1754 assert(devinfo->gen < 6);
1755
1756 for (inst = while_inst - 1; inst != do_inst; inst--) {
1757 /* If the jump count is != 0, that means that this instruction has already
1758 * been patched because it's part of a loop inside of the one we're
1759 * patching.
1760 */
1761 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1762 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1763 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1764 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1765 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1766 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1767 }
1768 }
1769 }
1770
/**
 * Close a DO/WHILE loop: emit the WHILE (or, in pre-gen6 SPF mode, an ADD
 * on IP) with a backward jump to the loop start recorded by brw_DO(), and
 * pop the loop stack.  On gen4-5, also patch any BREAK/CONT inside the loop.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Per-generation operand encodings; JIP holds the (negative) distance
       * back to the loop start, scaled by the jump unit.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                          : BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
         /* SPF mode: an unconditional backward ADD on IP replaces the WHILE.
          * The distance is in units of 16 bytes.
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point any BREAK/CONT inside the loop at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1832
1833 /* FORWARD JUMPS:
1834 */
1835 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1836 {
1837 const struct brw_device_info *devinfo = p->devinfo;
1838 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1839 unsigned jmpi = 1;
1840
1841 if (devinfo->gen >= 5)
1842 jmpi = 2;
1843
1844 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1845 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1846
1847 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1848 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1849 }
1850
1851 /* To integrate with the above, it makes sense that the comparison
1852 * instruction should populate the flag register. It might be simpler
1853 * just to use the flag reg for most WM tasks?
1854 */
1855 void brw_CMP(struct brw_codegen *p,
1856 struct brw_reg dest,
1857 unsigned conditional,
1858 struct brw_reg src0,
1859 struct brw_reg src1)
1860 {
1861 const struct brw_device_info *devinfo = p->devinfo;
1862 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1863
1864 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1865 brw_set_dest(p, insn, dest);
1866 brw_set_src0(p, insn, src0);
1867 brw_set_src1(p, insn, src1);
1868
1869 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1870 * page says:
1871 * "Any CMP instruction with a null destination must use a {switch}."
1872 *
1873 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1874 * mentioned on their work-arounds pages.
1875 */
1876 if (devinfo->gen == 7) {
1877 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1878 dest.nr == BRW_ARF_NULL) {
1879 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1880 }
1881 }
1882 }
1883
1884 /***********************************************************************
1885 * Helpers for the various SEND message types:
1886 */
1887
1888 /** Extended math function, float[8].
1889 */
1890 void gen4_math(struct brw_codegen *p,
1891 struct brw_reg dest,
1892 unsigned function,
1893 unsigned msg_reg_nr,
1894 struct brw_reg src,
1895 unsigned precision )
1896 {
1897 const struct brw_device_info *devinfo = p->devinfo;
1898 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1899 unsigned data_type;
1900 if (has_scalar_region(src)) {
1901 data_type = BRW_MATH_DATA_SCALAR;
1902 } else {
1903 data_type = BRW_MATH_DATA_VECTOR;
1904 }
1905
1906 assert(devinfo->gen < 6);
1907
1908 /* Example code doesn't set predicate_control for send
1909 * instructions.
1910 */
1911 brw_inst_set_pred_control(devinfo, insn, 0);
1912 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1913
1914 brw_set_dest(p, insn, dest);
1915 brw_set_src0(p, insn, src);
1916 brw_set_math_message(p,
1917 insn,
1918 function,
1919 src.type == BRW_REGISTER_TYPE_D,
1920 precision,
1921 data_type);
1922 }
1923
1924 void gen6_math(struct brw_codegen *p,
1925 struct brw_reg dest,
1926 unsigned function,
1927 struct brw_reg src0,
1928 struct brw_reg src1)
1929 {
1930 const struct brw_device_info *devinfo = p->devinfo;
1931 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1932
1933 assert(devinfo->gen >= 6);
1934
1935 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1936 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1937 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
1938 (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
1939
1940 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1941 if (devinfo->gen == 6) {
1942 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1943 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1944 }
1945
1946 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1947 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1948 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1949 assert(src0.type != BRW_REGISTER_TYPE_F);
1950 assert(src1.type != BRW_REGISTER_TYPE_F);
1951 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1952 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1953 } else {
1954 assert(src0.type == BRW_REGISTER_TYPE_F);
1955 assert(src1.type == BRW_REGISTER_TYPE_F);
1956 if (function == BRW_MATH_FUNCTION_POW) {
1957 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1958 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1959 } else {
1960 assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1961 src1.nr == BRW_ARF_NULL);
1962 }
1963 }
1964
1965 /* Source modifiers are ignored for extended math instructions on Gen6. */
1966 if (devinfo->gen == 6) {
1967 assert(!src0.negate);
1968 assert(!src0.abs);
1969 assert(!src1.negate);
1970 assert(!src1.abs);
1971 }
1972
1973 brw_inst_set_math_function(devinfo, insn, function);
1974
1975 brw_set_dest(p, insn, dest);
1976 brw_set_src0(p, insn, src0);
1977 brw_set_src1(p, insn, src1);
1978 }
1979
1980
1981 /**
1982 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1983 * using a constant offset per channel.
1984 *
1985 * The offset must be aligned to oword size (16 bytes). Used for
1986 * register spilling.
1987 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ takes the offset in OWord (16-byte) units rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length counts the header register plus the data registers;
    * num_regs != 1 is treated as a 2-register (4 OWord) write.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      /* Copy g0 into the message register as the header. */
      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* SENDs must not be compressed; force uncompressed and widen the
       * header region to match if needed.
       */
      if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
	 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
	 src_header = vec16(src_header);
      }
      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2091
2092
2093 /**
2094 * Read a block of owords (half a GRF each) from the scratch buffer
2095 * using a constant index per channel.
2096 *
2097 * Offset must be aligned to oword size (16 bytes). Used for register
2098 * spilling.
2099 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control;
   int rlen;

   /* Gen6+ takes the offset in OWord (16-byte) units rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* num_regs != 1 is treated as a 2-register (4 OWord) read. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 /* Pre-gen6: payload comes from the MRF via base_mrf. */
	 brw_set_src0(p, insn, brw_null_reg());
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
			      true, /* header_present */
			      rlen);
   }
}
2175
/**
 * Read a block of registers from the scratch buffer using the gen7+
 * dedicated scratch-block message (no manual header setup required:
 * the scratch offset comes from g0.5).
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2211
2212 /**
2213 * Read a float[4] vector from the data port Data Cache (const buffer).
2214 * Location (in buffer) should be a multiple of 16.
2215 * Used for fetching shader constants.
2216 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Build the header and the SEND under known-safe default state. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Header is a copy of g0 with the read offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-gen6: payload comes from the MRF via base_mrf. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
			   true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2271
2272
2273 void brw_fb_WRITE(struct brw_codegen *p,
2274 int dispatch_width,
2275 struct brw_reg payload,
2276 struct brw_reg implied_header,
2277 unsigned msg_control,
2278 unsigned binding_table_index,
2279 unsigned msg_length,
2280 unsigned response_length,
2281 bool eot,
2282 bool last_render_target,
2283 bool header_present)
2284 {
2285 const struct brw_device_info *devinfo = p->devinfo;
2286 brw_inst *insn;
2287 unsigned msg_type;
2288 struct brw_reg dest, src0;
2289
2290 if (dispatch_width == 16)
2291 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2292 else
2293 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2294
2295 if (devinfo->gen >= 6) {
2296 insn = next_insn(p, BRW_OPCODE_SENDC);
2297 } else {
2298 insn = next_insn(p, BRW_OPCODE_SEND);
2299 }
2300 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2301
2302 if (devinfo->gen >= 6) {
2303 /* headerless version, just submit color payload */
2304 src0 = payload;
2305
2306 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2307 } else {
2308 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2309 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2310 src0 = implied_header;
2311
2312 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2313 }
2314
2315 brw_set_dest(p, insn, dest);
2316 brw_set_src0(p, insn, src0);
2317 brw_set_dp_write_message(p,
2318 insn,
2319 binding_table_index,
2320 msg_control,
2321 msg_type,
2322 msg_length,
2323 header_present,
2324 last_render_target,
2325 response_length,
2326 eot,
2327 0 /* send_commit_msg */);
2328 }
2329
2330
2331 /**
2332 * Texture sample instruction.
2333 * Note: the msg_type plus msg_length values determine exactly what kind
2334 * of sampling operation is performed. See volume 4, page 161 of docs.
2335 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Callers pass -1 (wrapping to UINT_MAX here) to skip the implied
    * payload move on gen6+.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   /* Pre-gen6 SEND reads the payload from the MRF file. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2388
2389 /* Adjust the message header's sampler state pointer to
2390 * select the correct group of 16 samplers.
2391 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.dw1.ud;

      if (sampler >= 16) {
         /* Only HSW and gen8+ support more than 16 samplers. */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Offset the pointer (header element 3) past whole groups of 16:
          * 16 * (sampler / 16) * 16 bytes.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (sampler & 0xf0) << 4 == 16 * (sampler / 16) * 16 bytes — the same
       * byte offset as the immediate path above, computed at run time.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2434
2435 /* All these variables are pretty confusing - we might be better off
2436 * using bitmasks and macros for this, in the old style. Or perhaps
2437 * just having the caller instantiate the fields in dword3 itself.
2438 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Gen6+: move the payload into the message register explicitly. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header by
       * OR-ing 0xff00 into DWord 5 of the header (based on g0.5).
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 SEND reads the payload from the MRF file. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2485
/**
 * Emit a SEND with either an immediate or an indirect (register) message
 * descriptor.
 *
 * For an immediate descriptor the SEND itself carries it in src1.  For a
 * register descriptor, the value is first OR'd into address register a0,
 * and the SEND reads the descriptor from there.
 *
 * Returns the instruction whose descriptor fields the caller should
 * continue to fill in (the a0 setup OR, or the SEND itself) — not
 * necessarily the SEND.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send, *setup;

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0 write must not inherit predication/masking state. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return setup;
}
2528
/**
 * Emit a surface-access SEND whose binding-table index may itself be a
 * register (indirect).  Wraps brw_send_indirect_message() and fills in
 * the common mlen/rlen/header fields.
 *
 * Returns the instruction whose descriptor the caller should keep
 * filling in (see brw_send_indirect_message).
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0 write must not inherit predication/masking state. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2570
2571 static int
2572 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2573 {
2574 int offset;
2575 void *store = p->store;
2576 const struct brw_device_info *devinfo = p->devinfo;
2577
2578 for (offset = next_offset(devinfo, store, start_offset);
2579 offset < p->next_insn_offset;
2580 offset = next_offset(devinfo, store, offset)) {
2581 brw_inst *insn = store + offset;
2582
2583 switch (brw_inst_opcode(devinfo, insn)) {
2584 case BRW_OPCODE_ENDIF:
2585 case BRW_OPCODE_ELSE:
2586 case BRW_OPCODE_WHILE:
2587 case BRW_OPCODE_HALT:
2588 return offset;
2589 }
2590 }
2591
2592 return 0;
2593 }
2594
2595 /* There is no DO instruction on gen6, so to find the end of the loop
2596 * we have to see if the loop is jumping back before our start
2597 * instruction.
2598 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   /* Bytes of code per unit of the WHILE's jump field. */
   int scale = 16 / brw_jump_scale(devinfo);
   void *store = p->store;

   assert(devinfo->gen >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         /* Gen6 keeps the distance in its own jump count; gen7+ uses JIP. */
         int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
                                     : brw_inst_jip(devinfo, insn);
         /* A WHILE jumping back to (or before) start_offset encloses it. */
	 if (offset + jip * scale <= start_offset)
	    return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2627
2628 /* After program generation, go back and update the UIP and JIP of
2629 * BREAK, CONT, and HALT instructions to their correct locations.
2630 */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Bytes of code per unit of the jump fields. */
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 jump counts are patched as instructions are emitted
    * (see brw_patch_break_cont); nothing to do here.
    */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: jump to the end of the enclosing block. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         /* JIP: end of the enclosing block; UIP: the loop's WHILE. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just falls through to the
          * next instruction (a jump of one instruction).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      /* Other opcodes need no JIP/UIP fixup. */
      }
   }
}
2708
2709 void brw_ff_sync(struct brw_codegen *p,
2710 struct brw_reg dest,
2711 unsigned msg_reg_nr,
2712 struct brw_reg src0,
2713 bool allocate,
2714 unsigned response_length,
2715 bool eot)
2716 {
2717 const struct brw_device_info *devinfo = p->devinfo;
2718 brw_inst *insn;
2719
2720 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2721
2722 insn = next_insn(p, BRW_OPCODE_SEND);
2723 brw_set_dest(p, insn, dest);
2724 brw_set_src0(p, insn, src0);
2725 brw_set_src1(p, insn, brw_imm_d(0));
2726
2727 if (devinfo->gen < 6)
2728 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2729
2730 brw_set_ff_sync_message(p,
2731 insn,
2732 allocate,
2733 response_length,
2734 eot);
2735 }
2736
2737 /**
2738 * Emit the SEND instruction necessary to generate stream output data on Gen6
2739 * (for transform feedback).
2740 *
2741 * If send_commit_msg is true, this is the last piece of stream output data
2742 * from this thread, so send the data as a committed write. According to the
2743 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2744 *
2745 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2746 * writes are complete by sending the final write as a committed write."
2747 */
2748 void
2749 brw_svb_write(struct brw_codegen *p,
2750 struct brw_reg dest,
2751 unsigned msg_reg_nr,
2752 struct brw_reg src0,
2753 unsigned binding_table_index,
2754 bool send_commit_msg)
2755 {
2756 brw_inst *insn;
2757
2758 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2759
2760 insn = next_insn(p, BRW_OPCODE_SEND);
2761 brw_set_dest(p, insn, dest);
2762 brw_set_src0(p, insn, src0);
2763 brw_set_src1(p, insn, brw_imm_d(0));
2764 brw_set_dp_write_message(p, insn,
2765 binding_table_index,
2766 0, /* msg_control: ignored */
2767 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2768 1, /* msg_length */
2769 true, /* header_present */
2770 0, /* last_render_target: ignored */
2771 send_commit_msg, /* response_length */
2772 0, /* end_of_thread */
2773 send_commit_msg); /* send_commit_msg */
2774 }
2775
2776 static unsigned
2777 brw_surface_payload_size(struct brw_codegen *p,
2778 unsigned num_channels,
2779 bool has_simd4x2,
2780 bool has_simd16)
2781 {
2782 if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2783 return 1;
2784 else if (has_simd16 && p->compressed)
2785 return 2 * num_channels;
2786 else
2787 return num_channels;
2788 }
2789
/**
 * Fill in the data-port descriptor fields of an untyped-atomic SEND:
 * the message type for the current generation/access mode and the
 * msg_control bits (atomic op, return-data flag, SIMD mode).
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
                                  brw_inst *insn,
                                  unsigned atomic_op,
                                  bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (!p->compressed)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         /* Align16: use the dedicated SIMD4x2 message type. */
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      /* IVB/BYT: single message type; SIMD width in msg_control. */
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (!p->compressed)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
2822
2823 void
2824 brw_untyped_atomic(struct brw_codegen *p,
2825 struct brw_reg dst,
2826 struct brw_reg payload,
2827 struct brw_reg surface,
2828 unsigned atomic_op,
2829 unsigned msg_length,
2830 bool response_expected)
2831 {
2832 const struct brw_device_info *devinfo = p->devinfo;
2833 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2834 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2835 GEN7_SFID_DATAPORT_DATA_CACHE);
2836 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2837 /* Mask out unused components -- This is especially important in Align16
2838 * mode on generations that don't have native support for SIMD4x2 atomics,
2839 * because unused but enabled components will cause the dataport to perform
2840 * additional atomic operations on the addresses that happen to be in the
2841 * uninitialized Y, Z and W coordinates of the payload.
2842 */
2843 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2844 struct brw_inst *insn = brw_send_indirect_surface_message(
2845 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2846 brw_surface_payload_size(p, response_expected,
2847 devinfo->gen >= 8 || devinfo->is_haswell, true),
2848 align1);
2849
2850 brw_set_dp_untyped_atomic_message(
2851 p, insn, atomic_op, response_expected);
2852 }
2853
2854 static void
2855 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2856 struct brw_inst *insn,
2857 unsigned num_channels)
2858 {
2859 const struct brw_device_info *devinfo = p->devinfo;
2860 /* Set mask of 32-bit channels to drop. */
2861 unsigned msg_control = 0xf & (0xf << num_channels);
2862
2863 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2864 if (p->compressed)
2865 msg_control |= 1 << 4; /* SIMD16 mode */
2866 else
2867 msg_control |= 2 << 4; /* SIMD8 mode */
2868 }
2869
2870 brw_inst_set_dp_msg_type(devinfo, insn,
2871 (devinfo->gen >= 8 || devinfo->is_haswell ?
2872 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2873 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2874 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2875 }
2876
2877 void
2878 brw_untyped_surface_read(struct brw_codegen *p,
2879 struct brw_reg dst,
2880 struct brw_reg payload,
2881 struct brw_reg surface,
2882 unsigned msg_length,
2883 unsigned num_channels)
2884 {
2885 const struct brw_device_info *devinfo = p->devinfo;
2886 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2887 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2888 GEN7_SFID_DATAPORT_DATA_CACHE);
2889 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
2890 struct brw_inst *insn = brw_send_indirect_surface_message(
2891 p, sfid, dst, payload, surface, msg_length,
2892 brw_surface_payload_size(p, num_channels, true, true),
2893 align1);
2894
2895 brw_set_dp_untyped_surface_read_message(
2896 p, insn, num_channels);
2897 }
2898
2899 static void
2900 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2901 struct brw_inst *insn,
2902 unsigned num_channels)
2903 {
2904 const struct brw_device_info *devinfo = p->devinfo;
2905 /* Set mask of 32-bit channels to drop. */
2906 unsigned msg_control = 0xf & (0xf << num_channels);
2907
2908 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2909 if (p->compressed)
2910 msg_control |= 1 << 4; /* SIMD16 mode */
2911 else
2912 msg_control |= 2 << 4; /* SIMD8 mode */
2913 } else {
2914 if (devinfo->gen >= 8 || devinfo->is_haswell)
2915 msg_control |= 0 << 4; /* SIMD4x2 mode */
2916 else
2917 msg_control |= 2 << 4; /* SIMD8 mode */
2918 }
2919
2920 brw_inst_set_dp_msg_type(devinfo, insn,
2921 devinfo->gen >= 8 || devinfo->is_haswell ?
2922 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2923 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2924 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2925 }
2926
2927 void
2928 brw_untyped_surface_write(struct brw_codegen *p,
2929 struct brw_reg payload,
2930 struct brw_reg surface,
2931 unsigned msg_length,
2932 unsigned num_channels)
2933 {
2934 const struct brw_device_info *devinfo = p->devinfo;
2935 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2936 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2937 GEN7_SFID_DATAPORT_DATA_CACHE);
2938 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2939 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2940 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2941 WRITEMASK_X : WRITEMASK_XYZW;
2942 struct brw_inst *insn = brw_send_indirect_surface_message(
2943 p, sfid, brw_writemask(brw_null_reg(), mask),
2944 payload, surface, msg_length, 0, align1);
2945
2946 brw_set_dp_untyped_surface_write_message(
2947 p, insn, num_channels);
2948 }
2949
2950 static void
2951 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
2952 struct brw_inst *insn,
2953 unsigned atomic_op,
2954 bool response_expected)
2955 {
2956 const struct brw_device_info *devinfo = p->devinfo;
2957 unsigned msg_control =
2958 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2959 (response_expected ? 1 << 5 : 0); /* Return data expected */
2960
2961 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2962 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2963 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
2964 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
2965
2966 brw_inst_set_dp_msg_type(devinfo, insn,
2967 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
2968 } else {
2969 brw_inst_set_dp_msg_type(devinfo, insn,
2970 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
2971 }
2972
2973 } else {
2974 brw_inst_set_dp_msg_type(devinfo, insn,
2975 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
2976
2977 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
2978 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
2979 }
2980
2981 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2982 }
2983
2984 void
2985 brw_typed_atomic(struct brw_codegen *p,
2986 struct brw_reg dst,
2987 struct brw_reg payload,
2988 struct brw_reg surface,
2989 unsigned atomic_op,
2990 unsigned msg_length,
2991 bool response_expected) {
2992 const struct brw_device_info *devinfo = p->devinfo;
2993 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2994 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2995 GEN6_SFID_DATAPORT_RENDER_CACHE);
2996 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
2997 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2998 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2999 struct brw_inst *insn = brw_send_indirect_surface_message(
3000 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3001 brw_surface_payload_size(p, response_expected,
3002 devinfo->gen >= 8 || devinfo->is_haswell, false),
3003 true);
3004
3005 brw_set_dp_typed_atomic_message(
3006 p, insn, atomic_op, response_expected);
3007 }
3008
3009 static void
3010 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3011 struct brw_inst *insn,
3012 unsigned num_channels)
3013 {
3014 const struct brw_device_info *devinfo = p->devinfo;
3015 /* Set mask of unused channels. */
3016 unsigned msg_control = 0xf & (0xf << num_channels);
3017
3018 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3019 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3020 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3021 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3022 else
3023 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3024 }
3025
3026 brw_inst_set_dp_msg_type(devinfo, insn,
3027 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3028 } else {
3029 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3030 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3031 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3032 }
3033
3034 brw_inst_set_dp_msg_type(devinfo, insn,
3035 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3036 }
3037
3038 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3039 }
3040
3041 void
3042 brw_typed_surface_read(struct brw_codegen *p,
3043 struct brw_reg dst,
3044 struct brw_reg payload,
3045 struct brw_reg surface,
3046 unsigned msg_length,
3047 unsigned num_channels)
3048 {
3049 const struct brw_device_info *devinfo = p->devinfo;
3050 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3051 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3052 GEN6_SFID_DATAPORT_RENDER_CACHE);
3053 struct brw_inst *insn = brw_send_indirect_surface_message(
3054 p, sfid, dst, payload, surface, msg_length,
3055 brw_surface_payload_size(p, num_channels,
3056 devinfo->gen >= 8 || devinfo->is_haswell, false),
3057 true);
3058
3059 brw_set_dp_typed_surface_read_message(
3060 p, insn, num_channels);
3061 }
3062
3063 static void
3064 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3065 struct brw_inst *insn,
3066 unsigned num_channels)
3067 {
3068 const struct brw_device_info *devinfo = p->devinfo;
3069 /* Set mask of unused channels. */
3070 unsigned msg_control = 0xf & (0xf << num_channels);
3071
3072 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3073 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3074 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3075 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3076 else
3077 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3078 }
3079
3080 brw_inst_set_dp_msg_type(devinfo, insn,
3081 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3082
3083 } else {
3084 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3085 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3086 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3087 }
3088
3089 brw_inst_set_dp_msg_type(devinfo, insn,
3090 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3091 }
3092
3093 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3094 }
3095
3096 void
3097 brw_typed_surface_write(struct brw_codegen *p,
3098 struct brw_reg payload,
3099 struct brw_reg surface,
3100 unsigned msg_length,
3101 unsigned num_channels)
3102 {
3103 const struct brw_device_info *devinfo = p->devinfo;
3104 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3105 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3106 GEN6_SFID_DATAPORT_RENDER_CACHE);
3107 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3108 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3109 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3110 WRITEMASK_X : WRITEMASK_XYZW);
3111 struct brw_inst *insn = brw_send_indirect_surface_message(
3112 p, sfid, brw_writemask(brw_null_reg(), mask),
3113 payload, surface, msg_length, 0, true);
3114
3115 brw_set_dp_typed_surface_write_message(
3116 p, insn, num_channels);
3117 }
3118
3119 static void
3120 brw_set_memory_fence_message(struct brw_codegen *p,
3121 struct brw_inst *insn,
3122 enum brw_message_target sfid,
3123 bool commit_enable)
3124 {
3125 const struct brw_device_info *devinfo = p->devinfo;
3126
3127 brw_set_message_descriptor(p, insn, sfid,
3128 1 /* message length */,
3129 (commit_enable ? 1 : 0) /* response length */,
3130 true /* header present */,
3131 false);
3132
3133 switch (sfid) {
3134 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3135 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3136 break;
3137 case GEN7_SFID_DATAPORT_DATA_CACHE:
3138 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3139 break;
3140 default:
3141 unreachable("Not reached");
3142 }
3143
3144 if (commit_enable)
3145 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3146 }
3147
/**
 * Emit a memory fence flushing the data cache, and on Ivybridge also the
 * render cache, followed by a stall that orders future messages after the
 * flushes.  dst is a scratch register used for dependency tracking and, on
 * IVB, for the commit responses.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* NOTE(review): commit is requested only on IVB, apparently so the
    * dependent MOV below can stall on completion of both flushes — confirm
    * against the Gen7 PRM.
    */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, dst, offset(dst, 1));
      brw_pop_insn_state(p);
   }
}
3188
/**
 * Emit a SEND to the pixel interpolator shared function
 * (GEN7_SFID_PIXEL_INTERPOLATOR).  mode and data select the PI message
 * type and its immediate payload; the SIMD mode is derived from the
 * instruction's execution size.
 */
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             unsigned data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, mrf);
   brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
                              msg_length, response_length,
                              false /* header is never present for PI */,
                              false);

   /* SIMD16 when the instruction executes 16 channels, SIMD8 otherwise. */
   brw_inst_set_pi_simd_mode(
      devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
   brw_inst_set_pi_message_data(devinfo, insn, data);
}
3216
/**
 * Emit code that writes the index of the first active channel of the
 * current execution mask into (the first component of) dst.  The strategy
 * depends on the access mode and generation; see the inline comments.
 * Gen7+ only.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         /* Pre-Gen8 fallback: recover the execution mask in f1.0. */
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* Find-first-bit-low on the recovered mask gives the index. */
         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3288
/**
 * Emit code that copies the scalar component of src selected by idx (which
 * may be a register or an immediate) into dst.  src must be a
 * directly-addressed GRF.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         /* Dynamic index in Align1 mode: compute a byte offset into the
          * register file and fetch the component through the address
          * register a0.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         /* NOTE(review): this relation is on the encoded vstride/width/
          * hstride fields, i.e. it assumes a contiguous region — confirm
          * against the brw_reg encoding.
          */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}
3363
3364 /**
3365 * This instruction is generated as a single-channel align1 instruction by
3366 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3367 *
3368 * We can't use the typed atomic op in the FS because that has the execution
3369 * mask ANDed with the pixel mask, but we just want to write the one dword for
3370 * all the pixels.
3371 *
3372 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3373 * one u32. So we use the same untyped atomic write message as the pixel
3374 * shader.
3375 *
3376 * The untyped atomic operation requires a BUFFER surface type with RAW
3377 * format, and is only accessible through the legacy DATA_CACHE dataport
3378 * messages.
3379 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   /* Force a single-channel, unmasked Align1 SEND regardless of the
    * surrounding default state.
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   /* Two-register message payload, no response, no header. */
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   /* An untyped atomic ADD with no return value performs the accumulation. */
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
3409
3410
3411 /**
3412 * Emit the SEND message for a barrier
3413 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, brw_null_reg());
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   /* One register of payload, no response, no header bit in the descriptor. */
   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   /* Request a notification when all threads reach the barrier. */
   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   /* The barrier message must execute regardless of channel enables. */
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
}
3439
3440
3441 /**
3442 * Emit the wait instruction for a barrier
3443 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   /* WAIT operates on the notification register (n0), which is both read
    * and written by the instruction.
    */
   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   /* Scalar, unmasked: the wait is per-thread, not per-channel. */
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}