i965: Validate destination restrictions with vector immediates
[mesa.git] / src/intel/compiler/brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    *    "The send with EOT should use register space R112-R127 for <src>.
    *    This is to enable loading of a new thread into the same slot while
    *    the message with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}

/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
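 *
 * Note that the packed-vector types (UV, V, VF) exist only as immediates,
 * while the byte types (UB, B) exist only as register operands; the -1
 * entries in the tables below assert on the invalid combinations.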
 */
unsigned
brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q);
      return hw_types[type];
   }
}

/**
 * Return the element size given a hardware register type and file.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
                        unsigned type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const unsigned imm_hw_sizes[] = {
         [BRW_HW_REG_TYPE_UD]      = 4,
         [BRW_HW_REG_TYPE_D]       = 4,
         [BRW_HW_REG_TYPE_UW]      = 2,
         [BRW_HW_REG_TYPE_W]       = 2,
         [BRW_HW_REG_IMM_TYPE_UV]  = 2,
         [BRW_HW_REG_IMM_TYPE_VF]  = 4,
         [BRW_HW_REG_IMM_TYPE_V]   = 2,
         [BRW_HW_REG_TYPE_F]       = 4,
         [GEN8_HW_REG_TYPE_UQ]     = 8,
         [GEN8_HW_REG_TYPE_Q]      = 8,
         [GEN8_HW_REG_IMM_TYPE_DF] = 8,
         [GEN8_HW_REG_IMM_TYPE_HF] = 2,
      };
      assert(type < ARRAY_SIZE(imm_hw_sizes));
      assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV);
      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
      return imm_hw_sizes[type];
   } else {
      /* Non-immediate registers */
      static const unsigned hw_sizes[] = {
         [BRW_HW_REG_TYPE_UD]          = 4,
         [BRW_HW_REG_TYPE_D]           = 4,
         [BRW_HW_REG_TYPE_UW]          = 2,
         [BRW_HW_REG_TYPE_W]           = 2,
         [BRW_HW_REG_NON_IMM_TYPE_UB]  = 1,
         [BRW_HW_REG_NON_IMM_TYPE_B]   = 1,
         [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8,
         [BRW_HW_REG_TYPE_F]           = 4,
         [GEN8_HW_REG_TYPE_UQ]         = 8,
         [GEN8_HW_REG_TYPE_Q]          = 8,
         [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2,
      };
      assert(type < ARRAY_SIZE(hw_sizes));
      assert(devinfo->gen >= 7 ||
             (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F));
      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
      return hw_sizes[type];
   }
}

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though it is ignored in DA16 mode, this still needs to be
          * set to '01'.
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted, and we can't rely on this code to
    * fix it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}

static void
validate_reg(const struct gen_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE)
      return;

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

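   /* Register regions are written <vstride; width, hstride>: width elements
    * spaced hstride units apart form a row, and rows start vstride units
    * apart.  For example, with an exec size of 8, a <8;8,1> region reads
    * eight contiguous elements, while <0;1,0> replicates a single element
    * across all channels.
    */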
   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
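   /* Among the destination restrictions is the one this commit concerns:
    * per the PRMs, when an immediate vector (V, UV, or VF) is used as a
    * source, the destination must be 128-bit aligned, with a destination
    * horizontal stride equivalent to a Word for V/UV and to a DWord for VF.
    */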
}

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

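      /* 32-bit and smaller immediates live in the src1 slot of the
       * instruction word, and the hardware expects src1's register file and
       * type fields to describe them; 64-bit immediates occupy that space
       * outright, so they are left alone here.
       */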
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of using the same region descriptions for
             * registers in align_16 as in align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             *    "For Align16 access mode, only encodings of 0000 and 0011
             *    are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be an immediate in two-argument instructions. */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert(reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of using the same region descriptions for
             * registers in align_16 as in align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             *    "For Align16 access mode, only encodings of 0000 and 0011
             *    are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
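 *
 * A typical caller emits the SEND, sets its destination and payload, then
 * fills in the descriptor; roughly (an illustrative sketch, not verbatim
 * driver code):
 *
 *    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dest);
 *    brw_set_src0(p, send, payload);
 *    brw_set_message_descriptor(p, send, BRW_SFID_SAMPLER,
 *                               msg_length, response_length,
 *                               true, false);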
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they belong on the later SEND/SENDC and, if set here, would
    * clobber the conditional modifier bits instead.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

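   /* The transpose and allocate controls were dropped on Gen7, which gained
    * a per-slot-offset control instead; reject flag combinations that don't
    * exist on this generation.
    */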
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned target_cache,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_WRITE);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_READ);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);
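   /* e.g. num_regs == 4 encodes as block_size 2 (log2) on Gen8+, but as 3
    * (num_regs - 1) on Gen7.
    */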

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

#define next_insn brw_next_insn
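/**
 * Allocate the next instruction slot, growing the instruction store if
 * necessary, and initialize it from the current default instruction state.
 */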
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
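 *
 * For example (assuming the pre-Gen6 behavior described above), RNDZ of
 * -1.5 first yields the rounded-down value -2.0 with the increment bit set;
 * the predicated ADD of 1.0 then produces the expected round-toward-zero
 * result of -1.0.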
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
              struct brw_reg dest,                                   \
              struct brw_reg src)                                    \
{                                                                    \
   const struct gen_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}


ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use a <1,2,0> source region to read
    * each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
             src0.width == BRW_WIDTH_4 &&
             src0.hstride == BRW_HORIZONTAL_STRIDE_1);

      src0.vstride = BRW_VERTICAL_STRIDE_1;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

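/* Emit a JMPI instruction, which jumps by adding its source operand to IP;
 * the jump is relative to the instruction following the JMPI.  The units of
 * the offset vary by generation, so callers are expected to pre-scale it
 * appropriately.
 */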
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

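   /* Jump counts are measured in different units depending on generation
    * (whole instructions on Gen4, 64-bit chunks on Gen5 through Gen7, and
    * bytes on Gen8+); brw_jump_scale() gives the per-instruction multiplier.
    */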
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1565
1566 void
1567 brw_ELSE(struct brw_codegen *p)
1568 {
1569 const struct gen_device_info *devinfo = p->devinfo;
1570 brw_inst *insn;
1571
1572 insn = next_insn(p, BRW_OPCODE_ELSE);
1573
1574 if (devinfo->gen < 6) {
1575 brw_set_dest(p, insn, brw_ip_reg());
1576 brw_set_src0(p, insn, brw_ip_reg());
1577 brw_set_src1(p, insn, brw_imm_d(0x0));
1578 } else if (devinfo->gen == 6) {
1579 brw_set_dest(p, insn, brw_imm_w(0));
1580 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1581 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1582 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1583 } else if (devinfo->gen == 7) {
1584 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1585 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1586 brw_set_src1(p, insn, brw_imm_w(0));
1587 brw_inst_set_jip(devinfo, insn, 0);
1588 brw_inst_set_uip(devinfo, insn, 0);
1589 } else {
1590 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1591 brw_set_src0(p, insn, brw_imm_d(0));
1592 brw_inst_set_jip(devinfo, insn, 0);
1593 brw_inst_set_uip(devinfo, insn, 0);
1594 }
1595
1596 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1597 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1598 if (!p->single_program_flow && devinfo->gen < 6)
1599 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1600
1601 push_if_stack(p, insn);
1602 }
1603
1604 void
1605 brw_ENDIF(struct brw_codegen *p)
1606 {
1607 const struct gen_device_info *devinfo = p->devinfo;
1608 brw_inst *insn = NULL;
1609 brw_inst *else_inst = NULL;
1610 brw_inst *if_inst = NULL;
1611 brw_inst *tmp;
1612 bool emit_endif = true;
1613
1614 /* In single program flow mode, we can express IF and ELSE instructions
1615 * equivalently as ADD instructions that operate on IP. On platforms prior
1616 * to Gen6, flow control instructions cause an implied thread switch, so
1617 * this is a significant savings.
1618 *
1619 * However, on Gen6, writing to IP doesn't work in single program flow mode
1620 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1621 * not be updated by non-flow control instructions."). And on later
1622 * platforms, there is no significant benefit to converting control flow
1623 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1624 * Gen5.
1625 */
1626 if (devinfo->gen < 6 && p->single_program_flow)
1627 emit_endif = false;
1628
1629 /*
1630     * A single next_insn() call may change the base address of the
1631     * instruction store memory (p->store), so call it first, before taking
1632     * any pointer into the store by index.
1633 */
1634 if (emit_endif)
1635 insn = next_insn(p, BRW_OPCODE_ENDIF);
1636
1637 /* Pop the IF and (optional) ELSE instructions from the stack */
1638 p->if_depth_in_loop[p->loop_stack_depth]--;
1639 tmp = pop_if_stack(p);
1640 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1641 else_inst = tmp;
1642 tmp = pop_if_stack(p);
1643 }
1644 if_inst = tmp;
1645
1646 if (!emit_endif) {
1647 /* ENDIF is useless; don't bother emitting it. */
1648 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1649 return;
1650 }
1651
1652 if (devinfo->gen < 6) {
1653 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1654 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1655 brw_set_src1(p, insn, brw_imm_d(0x0));
1656 } else if (devinfo->gen == 6) {
1657 brw_set_dest(p, insn, brw_imm_w(0));
1658 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1659 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1660 } else if (devinfo->gen == 7) {
1661 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1662 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1663 brw_set_src1(p, insn, brw_imm_w(0));
1664 } else {
1665 brw_set_src0(p, insn, brw_imm_d(0));
1666 }
1667
1668 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1669 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1670 if (devinfo->gen < 6)
1671 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1672
1673    /* Also pop an item off the stack in the ENDIF instruction: */
1674 if (devinfo->gen < 6) {
1675 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1676 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1677 } else if (devinfo->gen == 6) {
1678 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1679 } else {
1680 brw_inst_set_jip(devinfo, insn, 2);
1681 }
1682 patch_IF_ELSE(p, if_inst, else_inst, insn);
1683 }
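
/* Usage sketch (not part of the original file): how the structured
 * control-flow helpers are meant to be combined.  brw_ENDIF() pops the
 * entries pushed by brw_IF()/brw_ELSE() and retroactively patches their
 * jump fields via patch_IF_ELSE().  The flag-producing CMP and the
 * register "x" are hypothetical.
 */
#if 0
   brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_F),
           BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
   brw_IF(p, BRW_EXECUTE_8);
   /* ... "then" block instructions ... */
   brw_ELSE(p);
   /* ... "else" block instructions ... */
   brw_ENDIF(p); /* IF/ELSE jump offsets are fixed up here */
#endif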
1684
1685 brw_inst *
1686 brw_BREAK(struct brw_codegen *p)
1687 {
1688 const struct gen_device_info *devinfo = p->devinfo;
1689 brw_inst *insn;
1690
1691 insn = next_insn(p, BRW_OPCODE_BREAK);
1692 if (devinfo->gen >= 8) {
1693 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1694 brw_set_src0(p, insn, brw_imm_d(0x0));
1695 } else if (devinfo->gen >= 6) {
1696 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1697 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1698 brw_set_src1(p, insn, brw_imm_d(0x0));
1699 } else {
1700 brw_set_dest(p, insn, brw_ip_reg());
1701 brw_set_src0(p, insn, brw_ip_reg());
1702 brw_set_src1(p, insn, brw_imm_d(0x0));
1703 brw_inst_set_gen4_pop_count(devinfo, insn,
1704 p->if_depth_in_loop[p->loop_stack_depth]);
1705 }
1706 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1707 brw_inst_set_exec_size(devinfo, insn,
1708 brw_inst_exec_size(devinfo, p->current));
1709
1710 return insn;
1711 }
1712
1713 brw_inst *
1714 brw_CONT(struct brw_codegen *p)
1715 {
1716 const struct gen_device_info *devinfo = p->devinfo;
1717 brw_inst *insn;
1718
1719 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1720 brw_set_dest(p, insn, brw_ip_reg());
1721 if (devinfo->gen >= 8) {
1722 brw_set_src0(p, insn, brw_imm_d(0x0));
1723 } else {
1724 brw_set_src0(p, insn, brw_ip_reg());
1725 brw_set_src1(p, insn, brw_imm_d(0x0));
1726 }
1727
1728 if (devinfo->gen < 6) {
1729 brw_inst_set_gen4_pop_count(devinfo, insn,
1730 p->if_depth_in_loop[p->loop_stack_depth]);
1731 }
1732 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1733 brw_inst_set_exec_size(devinfo, insn,
1734 brw_inst_exec_size(devinfo, p->current));
1735 return insn;
1736 }
1737
1738 brw_inst *
1739 gen6_HALT(struct brw_codegen *p)
1740 {
1741 const struct gen_device_info *devinfo = p->devinfo;
1742 brw_inst *insn;
1743
1744 insn = next_insn(p, BRW_OPCODE_HALT);
1745 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1746 if (devinfo->gen >= 8) {
1747 brw_set_src0(p, insn, brw_imm_d(0x0));
1748 } else {
1749 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1750 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1751 }
1752
1753 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1754 brw_inst_set_exec_size(devinfo, insn,
1755 brw_inst_exec_size(devinfo, p->current));
1756 return insn;
1757 }
1758
1759 /* DO/WHILE loop:
1760 *
1761 * The DO/WHILE is just an unterminated loop -- break or continue are
1762 * used for control within the loop. We have a few ways they can be
1763 * done.
1764 *
1765 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1766 * jip and no DO instruction.
1767 *
1768 * For non-uniform control flow pre-gen6, there's a DO instruction to
1769 * push the mask, and a WHILE to jump back, and BREAK to get out and
1770 * pop the mask.
1771 *
1772 * For gen6, there's no more mask stack, so no need for DO. WHILE
1773 * just points back to the first instruction of the loop.
1774 */
1775 brw_inst *
1776 brw_DO(struct brw_codegen *p, unsigned execute_size)
1777 {
1778 const struct gen_device_info *devinfo = p->devinfo;
1779
1780 if (devinfo->gen >= 6 || p->single_program_flow) {
1781 push_loop_stack(p, &p->store[p->nr_insn]);
1782 return &p->store[p->nr_insn];
1783 } else {
1784 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1785
1786 push_loop_stack(p, insn);
1787
1788 /* Override the defaults for this instruction:
1789 */
1790 brw_set_dest(p, insn, brw_null_reg());
1791 brw_set_src0(p, insn, brw_null_reg());
1792 brw_set_src1(p, insn, brw_null_reg());
1793
1794 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1795 brw_inst_set_exec_size(devinfo, insn, execute_size);
1796 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1797
1798 return insn;
1799 }
1800 }
1801
1802 /**
1803 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1804 * instruction here.
1805 *
1806 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1807 * nesting, since it can always just point to the end of the block/current loop.
1808 */
1809 static void
1810 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1811 {
1812 const struct gen_device_info *devinfo = p->devinfo;
1813 brw_inst *do_inst = get_inner_do_insn(p);
1814 brw_inst *inst;
1815 unsigned br = brw_jump_scale(devinfo);
1816
1817 assert(devinfo->gen < 6);
1818
1819 for (inst = while_inst - 1; inst != do_inst; inst--) {
1820       /* A non-zero jump count means this instruction has already been
1821        * patched, because it belongs to a loop nested inside the one we're
1822        * patching.
1823 */
1824 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1825 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1826          brw_inst_set_gen4_jump_count(devinfo, inst, br * ((while_inst - inst) + 1));
1827 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1828 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1829 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1830 }
1831 }
1832 }
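
/* Worked example of the patching above (gen5, br == 2): a BREAK three
 * instructions before the WHILE gets jump count 2 * (3 + 1) == 8
 * half-instruction chunks and lands one instruction past the WHILE,
 * while a CONTINUE at the same distance gets 2 * 3 == 6 and lands on
 * the WHILE itself, so the loop condition is re-evaluated.
 */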
1833
1834 brw_inst *
1835 brw_WHILE(struct brw_codegen *p)
1836 {
1837 const struct gen_device_info *devinfo = p->devinfo;
1838 brw_inst *insn, *do_insn;
1839 unsigned br = brw_jump_scale(devinfo);
1840
1841 if (devinfo->gen >= 6) {
1842 insn = next_insn(p, BRW_OPCODE_WHILE);
1843 do_insn = get_inner_do_insn(p);
1844
1845 if (devinfo->gen >= 8) {
1846 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1847 brw_set_src0(p, insn, brw_imm_d(0));
1848 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1849 } else if (devinfo->gen == 7) {
1850 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1851 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1852 brw_set_src1(p, insn, brw_imm_w(0));
1853 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1854 } else {
1855 brw_set_dest(p, insn, brw_imm_w(0));
1856 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1857 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1858 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1859 }
1860
1861 brw_inst_set_exec_size(devinfo, insn,
1862 brw_inst_exec_size(devinfo, p->current));
1863
1864 } else {
1865 if (p->single_program_flow) {
1866 insn = next_insn(p, BRW_OPCODE_ADD);
1867 do_insn = get_inner_do_insn(p);
1868
1869 brw_set_dest(p, insn, brw_ip_reg());
1870 brw_set_src0(p, insn, brw_ip_reg());
1871 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1872 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1873 } else {
1874 insn = next_insn(p, BRW_OPCODE_WHILE);
1875 do_insn = get_inner_do_insn(p);
1876
1877 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1878
1879 brw_set_dest(p, insn, brw_ip_reg());
1880 brw_set_src0(p, insn, brw_ip_reg());
1881 brw_set_src1(p, insn, brw_imm_d(0));
1882
1883 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1884 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1885 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1886
1887 brw_patch_break_cont(p, insn);
1888 }
1889 }
1890 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1891
1892 p->loop_stack_depth--;
1893
1894 return insn;
1895 }
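
/* Usage sketch (not part of the original file): a complete loop as the
 * generators emit it.  On gen6+ brw_DO() emits no instruction and merely
 * records the loop start; the loop-exit CMP and the register "counter"
 * are hypothetical.
 */
#if 0
   brw_DO(p, BRW_EXECUTE_8);
   /* ... loop body; may contain brw_BREAK(p) / brw_CONT(p) ... */
   brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_D),
           BRW_CONDITIONAL_NZ, counter, brw_imm_d(0));
   brw_inst_set_pred_control(p->devinfo, brw_WHILE(p),
                             BRW_PREDICATE_NORMAL);
#endif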
1896
1897 /* FORWARD JUMPS:
1898 */
1899 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1900 {
1901 const struct gen_device_info *devinfo = p->devinfo;
1902 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1903 unsigned jmpi = 1;
1904
1905 if (devinfo->gen >= 5)
1906 jmpi = 2;
1907
1908 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1909 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1910
1911 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1912 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1913 }
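
/* Worked example: with nr_insn == 14, a JMPI at index 10 gets a jump
 * count of jmpi * (14 - 10 - 1), i.e. it skips the three instructions
 * following the JMPI and lands at the current end of the program.
 */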
1914
1915 /* To integrate with the jump helpers above, the comparison instruction
1916  * should populate the flag register.  It might be simpler just to use the
1917  * flag register for most WM tasks.
1918 */
1919 void brw_CMP(struct brw_codegen *p,
1920 struct brw_reg dest,
1921 unsigned conditional,
1922 struct brw_reg src0,
1923 struct brw_reg src1)
1924 {
1925 const struct gen_device_info *devinfo = p->devinfo;
1926 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1927
1928 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1929 brw_set_dest(p, insn, dest);
1930 brw_set_src0(p, insn, src0);
1931 brw_set_src1(p, insn, src1);
1932
1933 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1934 * page says:
1935 * "Any CMP instruction with a null destination must use a {switch}."
1936 *
1937 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1938 * mentioned on their work-arounds pages.
1939 */
1940 if (devinfo->gen == 7) {
1941 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1942 dest.nr == BRW_ARF_NULL) {
1943 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1944 }
1945 }
1946 }
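
/* Illustration (not part of the original file) of the workaround above:
 * a flag-only compare writes the null register, so on gen7 the emitted
 * CMP automatically gets {Switch} thread control.  src0/src1 are
 * hypothetical operands.
 */
#if 0
   brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_F),
           BRW_CONDITIONAL_GE, src0, src1);
#endif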
1947
1948 /***********************************************************************
1949 * Helpers for the various SEND message types:
1950 */
1951
1952 /** Extended math function, float[8].
1953 */
1954 void gen4_math(struct brw_codegen *p,
1955 struct brw_reg dest,
1956 unsigned function,
1957 unsigned msg_reg_nr,
1958 struct brw_reg src,
1959 unsigned precision )
1960 {
1961 const struct gen_device_info *devinfo = p->devinfo;
1962 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1963 unsigned data_type;
1964 if (has_scalar_region(src)) {
1965 data_type = BRW_MATH_DATA_SCALAR;
1966 } else {
1967 data_type = BRW_MATH_DATA_VECTOR;
1968 }
1969
1970 assert(devinfo->gen < 6);
1971
1972 /* Example code doesn't set predicate_control for send
1973 * instructions.
1974 */
1975 brw_inst_set_pred_control(devinfo, insn, 0);
1976 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1977
1978 brw_set_dest(p, insn, dest);
1979 brw_set_src0(p, insn, src);
1980 brw_set_math_message(p,
1981 insn,
1982 function,
1983 src.type == BRW_REGISTER_TYPE_D,
1984 precision,
1985 data_type);
1986 }
1987
1988 void gen6_math(struct brw_codegen *p,
1989 struct brw_reg dest,
1990 unsigned function,
1991 struct brw_reg src0,
1992 struct brw_reg src1)
1993 {
1994 const struct gen_device_info *devinfo = p->devinfo;
1995 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1996
1997 assert(devinfo->gen >= 6);
1998
1999 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2000 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2001
2002 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2003 if (devinfo->gen == 6) {
2004 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2005 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2006 }
2007
2008 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2009 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2010 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2011 assert(src0.type != BRW_REGISTER_TYPE_F);
2012 assert(src1.type != BRW_REGISTER_TYPE_F);
2013 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2014 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2015 } else {
2016 assert(src0.type == BRW_REGISTER_TYPE_F);
2017 assert(src1.type == BRW_REGISTER_TYPE_F);
2018 }
2019
2020 /* Source modifiers are ignored for extended math instructions on Gen6. */
2021 if (devinfo->gen == 6) {
2022 assert(!src0.negate);
2023 assert(!src0.abs);
2024 assert(!src1.negate);
2025 assert(!src1.abs);
2026 }
2027
2028 brw_inst_set_math_function(devinfo, insn, function);
2029
2030 brw_set_dest(p, insn, dest);
2031 brw_set_src0(p, insn, src0);
2032 brw_set_src1(p, insn, src1);
2033 }
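
/* Usage sketch (not part of the original file): a SIMD8 reciprocal
 * square root.  Single-source functions pass a float-typed null second
 * source to satisfy the type assertions above; "dst" and "src" are
 * hypothetical float-typed GRF registers.
 */
#if 0
   gen6_math(p, dst, BRW_MATH_FUNCTION_RSQ, src,
             retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
#endif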
2034
2035 /**
2036 * Return the right surface index to access the thread scratch space using
2037 * stateless dataport messages.
2038 */
2039 unsigned
2040 brw_scratch_surface_idx(const struct brw_codegen *p)
2041 {
2042    /* The scratch space is thread-local, so IA coherency is unnecessary. */
2043 if (p->devinfo->gen >= 8)
2044 return GEN8_BTI_STATELESS_NON_COHERENT;
2045 else
2046 return BRW_BTI_STATELESS;
2047 }
2048
2049 /**
2050  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2051 * using a constant offset per channel.
2052 *
2053 * The offset must be aligned to oword size (16 bytes). Used for
2054 * register spilling.
2055 */
2056 void brw_oword_block_write_scratch(struct brw_codegen *p,
2057 struct brw_reg mrf,
2058 int num_regs,
2059 unsigned offset)
2060 {
2061 const struct gen_device_info *devinfo = p->devinfo;
2062 const unsigned target_cache =
2063 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2064 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2065 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2066 uint32_t msg_type;
2067
2068 if (devinfo->gen >= 6)
2069 offset /= 16;
2070
2071 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2072
2073 const unsigned mlen = 1 + num_regs;
2074
2075 /* Set up the message header. This is g0, with g0.2 filled with
2076 * the offset. We don't want to leave our offset around in g0 or
2077 * it'll screw up texture samples, so set it up inside the message
2078 * reg.
2079 */
2080 {
2081 brw_push_insn_state(p);
2082 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2083 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2084 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2085
2086 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2087
2088 /* set message header global offset field (reg 0, element 2) */
2089 brw_MOV(p,
2090 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2091 mrf.nr,
2092 2), BRW_REGISTER_TYPE_UD),
2093 brw_imm_ud(offset));
2094
2095 brw_pop_insn_state(p);
2096 }
2097
2098 {
2099 struct brw_reg dest;
2100 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2101 int send_commit_msg;
2102 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2103 BRW_REGISTER_TYPE_UW);
2104
2105 brw_inst_set_compression(devinfo, insn, false);
2106
2107 if (brw_inst_exec_size(devinfo, insn) >= 16)
2108 src_header = vec16(src_header);
2109
2110 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2111 if (devinfo->gen < 6)
2112 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2113
2114 /* Until gen6, writes followed by reads from the same location
2115 * are not guaranteed to be ordered unless write_commit is set.
2116 * If set, then a no-op write is issued to the destination
2117 * register to set a dependency, and a read from the destination
2118 * can be used to ensure the ordering.
2119 *
2120 * For gen6, only writes between different threads need ordering
2121 * protection. Our use of DP writes is all about register
2122 * spilling within a thread.
2123 */
2124 if (devinfo->gen >= 6) {
2125 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2126 send_commit_msg = 0;
2127 } else {
2128 dest = src_header;
2129 send_commit_msg = 1;
2130 }
2131
2132 brw_set_dest(p, insn, dest);
2133 if (devinfo->gen >= 6) {
2134 brw_set_src0(p, insn, mrf);
2135 } else {
2136 brw_set_src0(p, insn, brw_null_reg());
2137 }
2138
2139 if (devinfo->gen >= 6)
2140 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2141 else
2142 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2143
2144 brw_set_dp_write_message(p,
2145 insn,
2146 brw_scratch_surface_idx(p),
2147 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2148 msg_type,
2149 target_cache,
2150 mlen,
2151 true, /* header_present */
2152 0, /* not a render target */
2153 send_commit_msg, /* response_length */
2154 0, /* eot */
2155 send_commit_msg);
2156 }
2157 }
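
/* Usage sketch (not part of the original file): spill two GRFs to
 * scratch at byte offset 64 (OWORD-aligned), staging the header and
 * data through m1 -- a hypothetical free MRF.
 */
#if 0
   brw_oword_block_write_scratch(p, brw_message_reg(1), 2 /* num_regs */,
                                 64 /* offset */);
#endif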
2158
2159
2160 /**
2161 * Read a block of owords (half a GRF each) from the scratch buffer
2162  * using a constant offset per channel.
2163 *
2164 * Offset must be aligned to oword size (16 bytes). Used for register
2165 * spilling.
2166 */
2167 void
2168 brw_oword_block_read_scratch(struct brw_codegen *p,
2169 struct brw_reg dest,
2170 struct brw_reg mrf,
2171 int num_regs,
2172 unsigned offset)
2173 {
2174 const struct gen_device_info *devinfo = p->devinfo;
2175
2176 if (devinfo->gen >= 6)
2177 offset /= 16;
2178
2179 if (p->devinfo->gen >= 7) {
2180 /* On gen 7 and above, we no longer have message registers and we can
2181 * send from any register we want. By using the destination register
2182 * for the message, we guarantee that the implied message write won't
2183 * accidentally overwrite anything. This has been a problem because
2184 * the MRF registers and source for the final FB write are both fixed
2185 * and may overlap.
2186 */
2187 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2188 } else {
2189 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2190 }
2191 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2192
2193 const unsigned rlen = num_regs;
2194 const unsigned target_cache =
2195 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2196 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2197 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2198
2199 {
2200 brw_push_insn_state(p);
2201 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2202 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2203 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2204
2205 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2206
2207 /* set message header global offset field (reg 0, element 2) */
2208 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2209
2210 brw_pop_insn_state(p);
2211 }
2212
2213 {
2214 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2215
2216       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2217 brw_inst_set_compression(devinfo, insn, false);
2218
2219       brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
2220 if (devinfo->gen >= 6) {
2221 brw_set_src0(p, insn, mrf);
2222 } else {
2223 brw_set_src0(p, insn, brw_null_reg());
2224 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2225 }
2226
2227 brw_set_dp_read_message(p,
2228 insn,
2229 brw_scratch_surface_idx(p),
2230 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2231 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2232 target_cache,
2233 1, /* msg_length */
2234 true, /* header_present */
2235 rlen);
2236 }
2237 }
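
/* Usage sketch (not part of the original file): the matching fill,
 * reading the two spilled GRFs back into g4/g5.  The destination and
 * MRF choices are hypothetical.
 */
#if 0
   brw_oword_block_read_scratch(p, brw_vec8_grf(4, 0), brw_message_reg(1),
                                2 /* num_regs */, 64 /* offset */);
#endif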
2238
2239 void
2240 gen7_block_read_scratch(struct brw_codegen *p,
2241 struct brw_reg dest,
2242 int num_regs,
2243 unsigned offset)
2244 {
2245 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2246 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2247
2248 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2249
2250 /* The HW requires that the header is present; this is to get the g0.5
2251 * scratch offset.
2252 */
2253 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2254
2255    /* According to the docs, offset is "A 12-bit HWord offset into the
2256     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2257 * is 32 bytes, which happens to be the size of a register.
2258 */
2259 offset /= REG_SIZE;
2260 assert(offset < (1 << 12));
2261
2262 gen7_set_dp_scratch_message(p, insn,
2263 false, /* scratch read */
2264 false, /* OWords */
2265 false, /* invalidate after read */
2266 num_regs,
2267 offset,
2268 1, /* mlen: just g0 */
2269 num_regs, /* rlen */
2270 true); /* header present */
2271 }
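
/* Worked example: a byte offset of 4096 becomes 4096 / REG_SIZE == 128
 * HWords, comfortably inside the 12-bit (< 4096) field asserted above.
 */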
2272
2273 /**
2274 * Read float[4] vectors from the data port constant cache.
2275 * Location (in buffer) should be a multiple of 16.
2276 * Used for fetching shader constants.
2277 */
2278 void brw_oword_block_read(struct brw_codegen *p,
2279 struct brw_reg dest,
2280 struct brw_reg mrf,
2281 uint32_t offset,
2282 uint32_t bind_table_index)
2283 {
2284 const struct gen_device_info *devinfo = p->devinfo;
2285 const unsigned target_cache =
2286 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2287 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2288 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
2289
2290    /* On gen6 and newer hardware, the header offset is in units of OWORDs. */
2291 if (devinfo->gen >= 6)
2292 offset /= 16;
2293
2294 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2295
2296 brw_push_insn_state(p);
2297 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2298 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2299 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2300
2301 brw_push_insn_state(p);
2302 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2303 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2304
2305 /* set message header global offset field (reg 0, element 2) */
2306 brw_MOV(p,
2307 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2308 mrf.nr,
2309 2), BRW_REGISTER_TYPE_UD),
2310 brw_imm_ud(offset));
2311 brw_pop_insn_state(p);
2312
2313 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2314
2315 /* cast dest to a uword[8] vector */
2316 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2317
2318 brw_set_dest(p, insn, dest);
2319 if (devinfo->gen >= 6) {
2320 brw_set_src0(p, insn, mrf);
2321 } else {
2322 brw_set_src0(p, insn, brw_null_reg());
2323 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2324 }
2325
2326 brw_set_dp_read_message(p, insn, bind_table_index,
2327 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2328 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2329 target_cache,
2330 1, /* msg_length */
2331 true, /* header_present */
2332 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2333
2334 brw_pop_insn_state(p);
2335 }
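
/* Usage sketch (not part of the original file): pull one OWORD-aligned
 * float[4] constant from buffer offset 32 of the surface at a
 * hypothetical binding table index 0.
 */
#if 0
   brw_oword_block_read(p, brw_vec8_grf(2, 0), brw_message_reg(1),
                        32 /* offset */, 0 /* bind_table_index */);
#endif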
2336
2337
2338 void brw_fb_WRITE(struct brw_codegen *p,
2339 struct brw_reg payload,
2340 struct brw_reg implied_header,
2341 unsigned msg_control,
2342 unsigned binding_table_index,
2343 unsigned msg_length,
2344 unsigned response_length,
2345 bool eot,
2346 bool last_render_target,
2347 bool header_present)
2348 {
2349 const struct gen_device_info *devinfo = p->devinfo;
2350 const unsigned target_cache =
2351 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2352 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2353 brw_inst *insn;
2354 unsigned msg_type;
2355 struct brw_reg dest, src0;
2356
2357 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2358 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2359 else
2360 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2361
2362 if (devinfo->gen >= 6) {
2363 insn = next_insn(p, BRW_OPCODE_SENDC);
2364 } else {
2365 insn = next_insn(p, BRW_OPCODE_SEND);
2366 }
2367 brw_inst_set_compression(devinfo, insn, false);
2368
2369 if (devinfo->gen >= 6) {
2370 /* headerless version, just submit color payload */
2371 src0 = payload;
2372
2373 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2374 } else {
2375 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2376 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2377 src0 = implied_header;
2378
2379 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2380 }
2381
2382 brw_set_dest(p, insn, dest);
2383 brw_set_src0(p, insn, src0);
2384 brw_set_dp_write_message(p,
2385 insn,
2386 binding_table_index,
2387 msg_control,
2388 msg_type,
2389 target_cache,
2390 msg_length,
2391 header_present,
2392 last_render_target,
2393 response_length,
2394 eot,
2395 0 /* send_commit_msg */);
2396 }
2397
2398 brw_inst *
2399 gen9_fb_READ(struct brw_codegen *p,
2400 struct brw_reg dst,
2401 struct brw_reg payload,
2402 unsigned binding_table_index,
2403 unsigned msg_length,
2404 unsigned response_length,
2405 bool per_sample)
2406 {
2407 const struct gen_device_info *devinfo = p->devinfo;
2408 assert(devinfo->gen >= 9);
2409 const unsigned msg_subtype =
2410 brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
2411 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2412
2413 brw_set_dest(p, insn, dst);
2414 brw_set_src0(p, insn, payload);
2415 brw_set_dp_read_message(p, insn, binding_table_index,
2416 per_sample << 5 | msg_subtype,
2417 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2418 GEN6_SFID_DATAPORT_RENDER_CACHE,
2419 msg_length, true /* header_present */,
2420 response_length);
2421 brw_inst_set_rt_slot_group(devinfo, insn,
2422 brw_inst_qtr_control(devinfo, p->current) / 2);
2423
2424 return insn;
2425 }
2426
2427 /**
2428 * Texture sample instruction.
2429 * Note: the msg_type plus msg_length values determine exactly what kind
2430 * of sampling operation is performed. See volume 4, page 161 of docs.
2431 */
2432 void brw_SAMPLE(struct brw_codegen *p,
2433 struct brw_reg dest,
2434 unsigned msg_reg_nr,
2435 struct brw_reg src0,
2436 unsigned binding_table_index,
2437 unsigned sampler,
2438 unsigned msg_type,
2439 unsigned response_length,
2440 unsigned msg_length,
2441 unsigned header_present,
2442 unsigned simd_mode,
2443 unsigned return_format)
2444 {
2445 const struct gen_device_info *devinfo = p->devinfo;
2446 brw_inst *insn;
2447
2448 if (msg_reg_nr != -1)
2449 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2450
2451 insn = next_insn(p, BRW_OPCODE_SEND);
2452 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2453
2454 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2455 *
2456 * "Instruction compression is not allowed for this instruction (that
2457 * is, send). The hardware behavior is undefined if this instruction is
2458 * set as compressed. However, compress control can be set to "SecHalf"
2459 * to affect the EMask generation."
2460 *
2461 * No similar wording is found in later PRMs, but there are examples
2462 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2463 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2464 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2465 */
2466 brw_inst_set_compression(devinfo, insn, false);
2467
2468 if (devinfo->gen < 6)
2469 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2470
2471 brw_set_dest(p, insn, dest);
2472 brw_set_src0(p, insn, src0);
2473 brw_set_sampler_message(p, insn,
2474 binding_table_index,
2475 sampler,
2476 msg_type,
2477 response_length,
2478 msg_length,
2479 header_present,
2480 simd_mode,
2481 return_format);
2482 }
2483
2484 /* Adjust the message header's sampler state pointer to
2485 * select the correct group of 16 samplers.
2486 */
2487 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2488 struct brw_reg header,
2489 struct brw_reg sampler_index)
2490 {
2491 /* The "Sampler Index" field can only store values between 0 and 15.
2492 * However, we can add an offset to the "Sampler State Pointer"
2493 * field, effectively selecting a different set of 16 samplers.
2494 *
2495 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2496     * offset, and each sampler state is only 16 bytes, so we can't
2497 * exclusively use the offset - we have to use both.
2498 */
2499
2500 const struct gen_device_info *devinfo = p->devinfo;
2501
2502 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2503 const int sampler_state_size = 16; /* 16 bytes */
2504 uint32_t sampler = sampler_index.ud;
2505
2506 if (sampler >= 16) {
2507 assert(devinfo->is_haswell || devinfo->gen >= 8);
2508 brw_ADD(p,
2509 get_element_ud(header, 3),
2510 get_element_ud(brw_vec8_grf(0, 0), 3),
2511 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2512 }
2513 } else {
2514 /* Non-const sampler array indexing case */
2515 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2516 return;
2517 }
2518
2519 struct brw_reg temp = get_element_ud(header, 3);
2520
2521 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2522 brw_SHL(p, temp, temp, brw_imm_ud(4));
2523 brw_ADD(p,
2524 get_element_ud(header, 3),
2525 get_element_ud(brw_vec8_grf(0, 0), 3),
2526 temp);
2527 }
2528 }
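
/* Worked example: for sampler 20, both paths above add
 * 16 * (20 / 16) * 16 == 256 bytes to the sampler state pointer (the
 * non-immediate path computes the same value as (20 & 0xf0) << 4), and
 * the 4-bit "Sampler Index" set up by the caller in the message
 * descriptor then selects 20 % 16 == 4 within that group.
 */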
2529
2530 /* All these variables are pretty confusing -- we might be better off
2531 * using bitmasks and macros for this, in the old style. Or perhaps
2532 * just having the caller instantiate the fields in dword3 itself.
2533 */
2534 void brw_urb_WRITE(struct brw_codegen *p,
2535 struct brw_reg dest,
2536 unsigned msg_reg_nr,
2537 struct brw_reg src0,
2538 enum brw_urb_write_flags flags,
2539 unsigned msg_length,
2540 unsigned response_length,
2541 unsigned offset,
2542 unsigned swizzle)
2543 {
2544 const struct gen_device_info *devinfo = p->devinfo;
2545 brw_inst *insn;
2546
2547 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2548
2549 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2550 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2551 brw_push_insn_state(p);
2552 brw_set_default_access_mode(p, BRW_ALIGN_1);
2553 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2554 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2555 BRW_REGISTER_TYPE_UD),
2556 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2557 brw_imm_ud(0xff00));
2558 brw_pop_insn_state(p);
2559 }
2560
2561 insn = next_insn(p, BRW_OPCODE_SEND);
2562
2563 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2564
2565 brw_set_dest(p, insn, dest);
2566 brw_set_src0(p, insn, src0);
2567 brw_set_src1(p, insn, brw_imm_d(0));
2568
2569 if (devinfo->gen < 6)
2570 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2571
2572 brw_set_urb_message(p,
2573 insn,
2574 flags,
2575 msg_length,
2576 response_length,
2577 offset,
2578 swizzle);
2579 }
2580
2581 struct brw_inst *
2582 brw_send_indirect_message(struct brw_codegen *p,
2583 unsigned sfid,
2584 struct brw_reg dst,
2585 struct brw_reg payload,
2586 struct brw_reg desc)
2587 {
2588 const struct gen_device_info *devinfo = p->devinfo;
2589 struct brw_inst *send;
2590 int setup;
2591
2592 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2593
2594 assert(desc.type == BRW_REGISTER_TYPE_UD);
2595
2596 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2597 * in the indirect case) by its index in the instruction store. The
2598 * pointer returned by next_insn() may become invalid if emitting the SEND
2599 * in the indirect case reallocs the store.
2600 */
2601
2602 if (desc.file == BRW_IMMEDIATE_VALUE) {
2603 setup = p->nr_insn;
2604 send = next_insn(p, BRW_OPCODE_SEND);
2605 brw_set_src1(p, send, desc);
2606
2607 } else {
2608 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2609
2610 brw_push_insn_state(p);
2611 brw_set_default_access_mode(p, BRW_ALIGN_1);
2612 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2613 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2614
2615 /* Load the indirect descriptor to an address register using OR so the
2616 * caller can specify additional descriptor bits with the usual
2617 * brw_set_*_message() helper functions.
2618 */
2619 setup = p->nr_insn;
2620 brw_OR(p, addr, desc, brw_imm_ud(0));
2621
2622 brw_pop_insn_state(p);
2623
2624 send = next_insn(p, BRW_OPCODE_SEND);
2625 brw_set_src1(p, send, addr);
2626 }
2627
2628 if (dst.width < BRW_EXECUTE_8)
2629 brw_inst_set_exec_size(devinfo, send, dst.width);
2630
2631 brw_set_dest(p, send, dst);
2632 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2633 brw_inst_set_sfid(devinfo, send, sfid);
2634
2635 return &p->store[setup];
2636 }
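
/* Usage sketch (not part of the original file), mirroring the pattern of
 * brw_send_indirect_surface_message() below: because the helper returns
 * the setup instruction, descriptor fields such as mlen/rlen can be set
 * on it afterwards -- they land in the SEND directly, or in the OR's
 * immediate in the indirect case.  "desc_reg" is a hypothetical UD
 * register holding the run-time descriptor.
 */
#if 0
   struct brw_inst *send =
      brw_send_indirect_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
                                dst, payload, desc_reg);
   brw_inst_set_mlen(p->devinfo, send, 1);
   brw_inst_set_rlen(p->devinfo, send, 1);
#endif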
2637
2638 static struct brw_inst *
2639 brw_send_indirect_surface_message(struct brw_codegen *p,
2640 unsigned sfid,
2641 struct brw_reg dst,
2642 struct brw_reg payload,
2643 struct brw_reg surface,
2644 unsigned message_len,
2645 unsigned response_len,
2646 bool header_present)
2647 {
2648 const struct gen_device_info *devinfo = p->devinfo;
2649 struct brw_inst *insn;
2650
2651 if (surface.file != BRW_IMMEDIATE_VALUE) {
2652 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2653
2654 brw_push_insn_state(p);
2655 brw_set_default_access_mode(p, BRW_ALIGN_1);
2656 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2657 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2658
2659 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2660 * some surface array is accessed out of bounds.
2661 */
2662 insn = brw_AND(p, addr,
2663 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2664 BRW_GET_SWZ(surface.swizzle, 0)),
2665 brw_imm_ud(0xff));
2666
2667 brw_pop_insn_state(p);
2668
2669 surface = addr;
2670 }
2671
2672 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2673 brw_inst_set_mlen(devinfo, insn, message_len);
2674 brw_inst_set_rlen(devinfo, insn, response_len);
2675 brw_inst_set_header_present(devinfo, insn, header_present);
2676
2677 return insn;
2678 }
2679
2680 static bool
2681 while_jumps_before_offset(const struct gen_device_info *devinfo,
2682 brw_inst *insn, int while_offset, int start_offset)
2683 {
2684 int scale = 16 / brw_jump_scale(devinfo);
2685 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2686 : brw_inst_jip(devinfo, insn);
2687 assert(jip < 0);
2688 return while_offset + jip * scale <= start_offset;
2689 }
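
/* Worked example (gen7, where brw_jump_scale() == 2): scale == 8 bytes
 * per jump unit, so a WHILE with JIP == -4 jumps back 32 bytes, i.e.
 * two instructions before itself.
 */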
2690
2691
2692 static int
2693 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2694 {
2695 int offset;
2696 void *store = p->store;
2697 const struct gen_device_info *devinfo = p->devinfo;
2698
2699 int depth = 0;
2700
2701 for (offset = next_offset(devinfo, store, start_offset);
2702 offset < p->next_insn_offset;
2703 offset = next_offset(devinfo, store, offset)) {
2704 brw_inst *insn = store + offset;
2705
2706 switch (brw_inst_opcode(devinfo, insn)) {
2707 case BRW_OPCODE_IF:
2708 depth++;
2709 break;
2710 case BRW_OPCODE_ENDIF:
2711 if (depth == 0)
2712 return offset;
2713 depth--;
2714 break;
2715 case BRW_OPCODE_WHILE:
2716 /* If the while doesn't jump before our instruction, it's the end
2717 * of a sibling do...while loop. Ignore it.
2718 */
2719 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2720 continue;
2721 /* fallthrough */
2722 case BRW_OPCODE_ELSE:
2723 case BRW_OPCODE_HALT:
2724 if (depth == 0)
2725 return offset;
2726 }
2727 }
2728
2729 return 0;
2730 }
2731
2732 /* There is no DO instruction on gen6, so to find the end of the loop
2733 * we have to see if the loop is jumping back before our start
2734 * instruction.
2735 */
2736 static int
2737 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2738 {
2739 const struct gen_device_info *devinfo = p->devinfo;
2740 int offset;
2741 void *store = p->store;
2742
2743 assert(devinfo->gen >= 6);
2744
2745 /* Always start after the instruction (such as a WHILE) we're trying to fix
2746 * up.
2747 */
2748 for (offset = next_offset(devinfo, store, start_offset);
2749 offset < p->next_insn_offset;
2750 offset = next_offset(devinfo, store, offset)) {
2751 brw_inst *insn = store + offset;
2752
2753 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2754 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2755 return offset;
2756 }
2757 }
2758 assert(!"not reached");
2759 return start_offset;
2760 }
2761
2762 /* After program generation, go back and update the UIP and JIP of
2763 * BREAK, CONT, and HALT instructions to their correct locations.
2764 */
2765 void
2766 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2767 {
2768 const struct gen_device_info *devinfo = p->devinfo;
2769 int offset;
2770 int br = brw_jump_scale(devinfo);
2771 int scale = 16 / br;
2772 void *store = p->store;
2773
2774 if (devinfo->gen < 6)
2775 return;
2776
2777 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2778 brw_inst *insn = store + offset;
2779 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2780
2781 int block_end_offset = brw_find_next_block_end(p, offset);
2782 switch (brw_inst_opcode(devinfo, insn)) {
2783 case BRW_OPCODE_BREAK:
2784 assert(block_end_offset != 0);
2785 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2786 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2787 brw_inst_set_uip(devinfo, insn,
2788 (brw_find_loop_end(p, offset) - offset +
2789 (devinfo->gen == 6 ? 16 : 0)) / scale);
2790 break;
2791 case BRW_OPCODE_CONTINUE:
2792 assert(block_end_offset != 0);
2793 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2794 brw_inst_set_uip(devinfo, insn,
2795 (brw_find_loop_end(p, offset) - offset) / scale);
2796
2797 assert(brw_inst_uip(devinfo, insn) != 0);
2798 assert(brw_inst_jip(devinfo, insn) != 0);
2799 break;
2800
2801 case BRW_OPCODE_ENDIF: {
2802 int32_t jump = (block_end_offset == 0) ?
2803 1 * br : (block_end_offset - offset) / scale;
2804 if (devinfo->gen >= 7)
2805 brw_inst_set_jip(devinfo, insn, jump);
2806 else
2807 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2808 break;
2809 }
2810
2811 case BRW_OPCODE_HALT:
2812 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2813 *
2814 * "In case of the halt instruction not inside any conditional
2815 * code block, the value of <JIP> and <UIP> should be the
2816 * same. In case of the halt instruction inside conditional code
2817 * block, the <UIP> should be the end of the program, and the
2818 * <JIP> should be end of the most inner conditional code block."
2819 *
2820 * The uip will have already been set by whoever set up the
2821 * instruction.
2822 */
2823 if (block_end_offset == 0) {
2824 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2825 } else {
2826 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2827 }
2828 assert(brw_inst_uip(devinfo, insn) != 0);
2829 assert(brw_inst_jip(devinfo, insn) != 0);
2830 break;
2831 }
2832 }
2833 }
2834
2835 void brw_ff_sync(struct brw_codegen *p,
2836 struct brw_reg dest,
2837 unsigned msg_reg_nr,
2838 struct brw_reg src0,
2839 bool allocate,
2840 unsigned response_length,
2841 bool eot)
2842 {
2843 const struct gen_device_info *devinfo = p->devinfo;
2844 brw_inst *insn;
2845
2846 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2847
2848 insn = next_insn(p, BRW_OPCODE_SEND);
2849 brw_set_dest(p, insn, dest);
2850 brw_set_src0(p, insn, src0);
2851 brw_set_src1(p, insn, brw_imm_d(0));
2852
2853 if (devinfo->gen < 6)
2854 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2855
2856 brw_set_ff_sync_message(p,
2857 insn,
2858 allocate,
2859 response_length,
2860 eot);
2861 }
2862
2863 /**
2864 * Emit the SEND instruction necessary to generate stream output data on Gen6
2865 * (for transform feedback).
2866 *
2867 * If send_commit_msg is true, this is the last piece of stream output data
2868 * from this thread, so send the data as a committed write. According to the
2869 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2870 *
2871 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2872 * writes are complete by sending the final write as a committed write."
2873 */
2874 void
2875 brw_svb_write(struct brw_codegen *p,
2876 struct brw_reg dest,
2877 unsigned msg_reg_nr,
2878 struct brw_reg src0,
2879 unsigned binding_table_index,
2880 bool send_commit_msg)
2881 {
2882 const struct gen_device_info *devinfo = p->devinfo;
2883 const unsigned target_cache =
2884 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2885 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2886 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2887 brw_inst *insn;
2888
2889 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2890
2891 insn = next_insn(p, BRW_OPCODE_SEND);
2892 brw_set_dest(p, insn, dest);
2893 brw_set_src0(p, insn, src0);
2894 brw_set_src1(p, insn, brw_imm_d(0));
2895 brw_set_dp_write_message(p, insn,
2896 binding_table_index,
2897 0, /* msg_control: ignored */
2898 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2899 target_cache,
2900 1, /* msg_length */
2901 true, /* header_present */
2902 0, /* last_render_target: ignored */
2903 send_commit_msg, /* response_length */
2904 0, /* end_of_thread */
2905 send_commit_msg); /* send_commit_msg */
2906 }
2907
2908 static unsigned
2909 brw_surface_payload_size(struct brw_codegen *p,
2910 unsigned num_channels,
2911 bool has_simd4x2,
2912 bool has_simd16)
2913 {
2914 if (has_simd4x2 &&
2915 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2916 return 1;
2917 else if (has_simd16 &&
2918 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2919 return 2 * num_channels;
2920 else
2921 return num_channels;
2922 }
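
/* Worked example: a SIMD16 Align1 untyped read of 4 channels returns
 * 2 * 4 == 8 registers, while the same request in Align16 (SIMD4x2)
 * mode packs everything into a single register.
 */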
2923
2924 static void
2925 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2926 brw_inst *insn,
2927 unsigned atomic_op,
2928 bool response_expected)
2929 {
2930 const struct gen_device_info *devinfo = p->devinfo;
2931 unsigned msg_control =
2932 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2933 (response_expected ? 1 << 5 : 0); /* Return data expected */
2934
2935 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2936 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2937 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2938 msg_control |= 1 << 4; /* SIMD8 mode */
2939
2940 brw_inst_set_dp_msg_type(devinfo, insn,
2941 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2942 } else {
2943 brw_inst_set_dp_msg_type(devinfo, insn,
2944 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2945 }
2946 } else {
2947 brw_inst_set_dp_msg_type(devinfo, insn,
2948 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2949
2950 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2951 msg_control |= 1 << 4; /* SIMD8 mode */
2952 }
2953
2954 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2955 }
2956
2957 void
2958 brw_untyped_atomic(struct brw_codegen *p,
2959 struct brw_reg dst,
2960 struct brw_reg payload,
2961 struct brw_reg surface,
2962 unsigned atomic_op,
2963 unsigned msg_length,
2964 bool response_expected)
2965 {
2966 const struct gen_device_info *devinfo = p->devinfo;
2967 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2968 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2969 GEN7_SFID_DATAPORT_DATA_CACHE);
2970 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2971 /* Mask out unused components -- This is especially important in Align16
2972 * mode on generations that don't have native support for SIMD4x2 atomics,
2973 * because unused but enabled components will cause the dataport to perform
2974 * additional atomic operations on the addresses that happen to be in the
2975 * uninitialized Y, Z and W coordinates of the payload.
2976 */
2977 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2978 struct brw_inst *insn = brw_send_indirect_surface_message(
2979 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2980 brw_surface_payload_size(p, response_expected,
2981 devinfo->gen >= 8 || devinfo->is_haswell, true),
2982 align1);
2983
2984 brw_set_dp_untyped_atomic_message(
2985 p, insn, atomic_op, response_expected);
2986 }
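
/* Usage sketch (not part of the original file): a SIMD8 untyped atomic
 * add that returns the old value.  The payload layout (address plus one
 * source operand, 2 registers here) and the binding table index are
 * hypothetical.
 */
#if 0
   brw_untyped_atomic(p, dst, payload, brw_imm_ud(1 /* BTI */),
                      BRW_AOP_ADD, 2 /* msg_length */,
                      true /* response_expected */);
#endif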
2987
2988 static void
2989 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2990 struct brw_inst *insn,
2991 unsigned num_channels)
2992 {
2993 const struct gen_device_info *devinfo = p->devinfo;
2994 /* Set mask of 32-bit channels to drop. */
2995 unsigned msg_control = 0xf & (0xf << num_channels);
2996
2997 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2998 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2999 msg_control |= 1 << 4; /* SIMD16 mode */
3000 else
3001 msg_control |= 2 << 4; /* SIMD8 mode */
3002 }
3003
3004 brw_inst_set_dp_msg_type(devinfo, insn,
3005 (devinfo->gen >= 8 || devinfo->is_haswell ?
3006 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3007 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3008 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3009 }
3010
3011 void
3012 brw_untyped_surface_read(struct brw_codegen *p,
3013 struct brw_reg dst,
3014 struct brw_reg payload,
3015 struct brw_reg surface,
3016 unsigned msg_length,
3017 unsigned num_channels)
3018 {
3019 const struct gen_device_info *devinfo = p->devinfo;
3020 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3021 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3022 GEN7_SFID_DATAPORT_DATA_CACHE);
3023 struct brw_inst *insn = brw_send_indirect_surface_message(
3024 p, sfid, dst, payload, surface, msg_length,
3025 brw_surface_payload_size(p, num_channels, true, true),
3026 false);
3027
3028 brw_set_dp_untyped_surface_read_message(
3029 p, insn, num_channels);
3030 }
3031
3032 static void
3033 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3034 struct brw_inst *insn,
3035 unsigned num_channels)
3036 {
3037 const struct gen_device_info *devinfo = p->devinfo;
3038 /* Set mask of 32-bit channels to drop. */
3039 unsigned msg_control = 0xf & (0xf << num_channels);
3040
3041 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3042 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3043 msg_control |= 1 << 4; /* SIMD16 mode */
3044 else
3045 msg_control |= 2 << 4; /* SIMD8 mode */
3046 } else {
3047 if (devinfo->gen >= 8 || devinfo->is_haswell)
3048 msg_control |= 0 << 4; /* SIMD4x2 mode */
3049 else
3050 msg_control |= 2 << 4; /* SIMD8 mode */
3051 }
3052
3053 brw_inst_set_dp_msg_type(devinfo, insn,
3054 devinfo->gen >= 8 || devinfo->is_haswell ?
3055 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3056 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3057 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3058 }
3059
3060 void
3061 brw_untyped_surface_write(struct brw_codegen *p,
3062 struct brw_reg payload,
3063 struct brw_reg surface,
3064 unsigned msg_length,
3065 unsigned num_channels)
3066 {
3067 const struct gen_device_info *devinfo = p->devinfo;
3068 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3069 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3070 GEN7_SFID_DATAPORT_DATA_CACHE);
3071 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3072 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3073 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3074 WRITEMASK_X : WRITEMASK_XYZW;
3075 struct brw_inst *insn = brw_send_indirect_surface_message(
3076 p, sfid, brw_writemask(brw_null_reg(), mask),
3077 payload, surface, msg_length, 0, align1);
3078
3079 brw_set_dp_untyped_surface_write_message(
3080 p, insn, num_channels);
3081 }
3082
3083 static void
3084 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3085 struct brw_inst *insn,
3086 unsigned atomic_op,
3087 bool response_expected)
3088 {
3089 const struct gen_device_info *devinfo = p->devinfo;
3090 unsigned msg_control =
3091 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3092 (response_expected ? 1 << 5 : 0); /* Return data expected */
3093
3094 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3095 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3096 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3097 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3098
3099 brw_inst_set_dp_msg_type(devinfo, insn,
3100 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3101 } else {
3102 brw_inst_set_dp_msg_type(devinfo, insn,
3103 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3104 }
3105
3106 } else {
3107 brw_inst_set_dp_msg_type(devinfo, insn,
3108 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3109
3110 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3111 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3112 }
3113
3114 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3115 }
3116
3117 void
3118 brw_typed_atomic(struct brw_codegen *p,
3119 struct brw_reg dst,
3120 struct brw_reg payload,
3121 struct brw_reg surface,
3122 unsigned atomic_op,
3123 unsigned msg_length,
3124 bool response_expected) {
3125 const struct gen_device_info *devinfo = p->devinfo;
3126 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3127 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3128 GEN6_SFID_DATAPORT_RENDER_CACHE);
3129 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3130 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3131 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3132 struct brw_inst *insn = brw_send_indirect_surface_message(
3133 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3134 brw_surface_payload_size(p, response_expected,
3135 devinfo->gen >= 8 || devinfo->is_haswell, false),
3136 true);
3137
3138 brw_set_dp_typed_atomic_message(
3139 p, insn, atomic_op, response_expected);
3140 }
3141
3142 static void
3143 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3144 struct brw_inst *insn,
3145 unsigned num_channels)
3146 {
3147 const struct gen_device_info *devinfo = p->devinfo;
3148 /* Set mask of unused channels. */
3149 unsigned msg_control = 0xf & (0xf << num_channels);
3150
3151 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3152 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3153 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3154 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3155 else
3156 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3157 }
3158
3159 brw_inst_set_dp_msg_type(devinfo, insn,
3160 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3161 } else {
3162 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3163 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3164 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3165 }
3166
3167 brw_inst_set_dp_msg_type(devinfo, insn,
3168 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3169 }
3170
3171 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3172 }
3173
3174 void
3175 brw_typed_surface_read(struct brw_codegen *p,
3176 struct brw_reg dst,
3177 struct brw_reg payload,
3178 struct brw_reg surface,
3179 unsigned msg_length,
3180 unsigned num_channels)
3181 {
3182 const struct gen_device_info *devinfo = p->devinfo;
3183 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3184 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3185 GEN6_SFID_DATAPORT_RENDER_CACHE);
3186 struct brw_inst *insn = brw_send_indirect_surface_message(
3187 p, sfid, dst, payload, surface, msg_length,
3188 brw_surface_payload_size(p, num_channels,
3189 devinfo->gen >= 8 || devinfo->is_haswell, false),
3190 true);
3191
3192 brw_set_dp_typed_surface_read_message(
3193 p, insn, num_channels);
3194 }
3195
3196 static void
3197 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3198 struct brw_inst *insn,
3199 unsigned num_channels)
3200 {
3201 const struct gen_device_info *devinfo = p->devinfo;
3202 /* Set mask of unused channels. */
3203 unsigned msg_control = 0xf & (0xf << num_channels);
3204
3205 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3206 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3207 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3208 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3209 else
3210 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3211 }
3212
3213 brw_inst_set_dp_msg_type(devinfo, insn,
3214 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3215
3216 } else {
3217 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3218 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3219 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3220 }
3221
3222 brw_inst_set_dp_msg_type(devinfo, insn,
3223 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3224 }
3225
3226 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3227 }
3228
3229 void
3230 brw_typed_surface_write(struct brw_codegen *p,
3231 struct brw_reg payload,
3232 struct brw_reg surface,
3233 unsigned msg_length,
3234 unsigned num_channels)
3235 {
3236 const struct gen_device_info *devinfo = p->devinfo;
3237 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3238 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3239 GEN6_SFID_DATAPORT_RENDER_CACHE);
3240 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3241 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3242 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3243 WRITEMASK_X : WRITEMASK_XYZW);
3244 struct brw_inst *insn = brw_send_indirect_surface_message(
3245 p, sfid, brw_writemask(brw_null_reg(), mask),
3246 payload, surface, msg_length, 0, true);
3247
3248 brw_set_dp_typed_surface_write_message(
3249 p, insn, num_channels);
3250 }
3251
3252 static void
3253 brw_set_memory_fence_message(struct brw_codegen *p,
3254 struct brw_inst *insn,
3255 enum brw_message_target sfid,
3256 bool commit_enable)
3257 {
3258 const struct gen_device_info *devinfo = p->devinfo;
3259
3260 brw_set_message_descriptor(p, insn, sfid,
3261 1 /* message length */,
3262 (commit_enable ? 1 : 0) /* response length */,
3263 true /* header present */,
3264 false);
3265
3266 switch (sfid) {
3267 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3268 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3269 break;
3270 case GEN7_SFID_DATAPORT_DATA_CACHE:
3271 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3272 break;
3273 default:
3274 unreachable("Not reached");
3275 }
3276
3277 if (commit_enable)
3278 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3279 }
3280
3281 void
3282 brw_memory_fence(struct brw_codegen *p,
3283 struct brw_reg dst)
3284 {
3285 const struct gen_device_info *devinfo = p->devinfo;
3286 const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
3287 struct brw_inst *insn;
3288
3289 brw_push_insn_state(p);
3290 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3291 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3292 dst = vec1(dst);
3293
3294    /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3295 * message doesn't write anything back.
3296 */
3297 insn = next_insn(p, BRW_OPCODE_SEND);
3298 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3299 brw_set_dest(p, insn, dst);
3300 brw_set_src0(p, insn, dst);
3301 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3302 commit_enable);
3303
3304 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3305 /* IVB does typed surface access through the render cache, so we need to
3306 * flush it too. Use a different register so both flushes can be
3307 * pipelined by the hardware.
3308 */
3309 insn = next_insn(p, BRW_OPCODE_SEND);
3310 brw_set_dest(p, insn, offset(dst, 1));
3311 brw_set_src0(p, insn, offset(dst, 1));
3312 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3313 commit_enable);
3314
3315 /* Now write the response of the second message into the response of the
3316 * first to trigger a pipeline stall. This way, future render and data
3317 * cache messages will be properly ordered with respect to past data and
3318 * render cache messages.
3319 */
3320 brw_MOV(p, dst, offset(dst, 1));
3321 }
3322
3323 brw_pop_insn_state(p);
3324 }
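/* Usage sketch (the scratch GRF number is hypothetical): the destination
 * exists only for dependency tracking, since the fence writes nothing back
 * except the optional commit.  On IVB this expands to the two SENDs plus the
 * stalling MOV emitted above; on HSW+ it is a single SEND.
 */
static void
example_memory_fence(struct brw_codegen *p)
{
   brw_memory_fence(p, brw_vec8_grf(124, 0));
}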
3325
3326 void
3327 brw_pixel_interpolator_query(struct brw_codegen *p,
3328 struct brw_reg dest,
3329 struct brw_reg mrf,
3330 bool noperspective,
3331 unsigned mode,
3332 struct brw_reg data,
3333 unsigned msg_length,
3334 unsigned response_length)
3335 {
3336 const struct gen_device_info *devinfo = p->devinfo;
3337 struct brw_inst *insn;
3338 const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3339
3340 /* brw_send_indirect_message will automatically use a direct send message
3341 * if data is actually immediate.
3342 */
3343 insn = brw_send_indirect_message(p,
3344 GEN7_SFID_PIXEL_INTERPOLATOR,
3345 dest,
3346 mrf,
3347 vec1(data));
3348 brw_inst_set_mlen(devinfo, insn, msg_length);
3349 brw_inst_set_rlen(devinfo, insn, response_length);
3350
3351 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3352 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3353 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3354 brw_inst_set_pi_message_type(devinfo, insn, mode);
3355 }
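/* Illustrative sketch with hypothetical register numbers: querying
 * barycentrics at a dynamic sample index.  GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE
 * is the message type the FS generator uses for interpolateAtSample().
 */
static void
example_pi_query_at_sample(struct brw_codegen *p)
{
   brw_pixel_interpolator_query(p,
                                brw_vec8_grf(8, 0),  /* dest */
                                brw_vec8_grf(2, 0),  /* message payload */
                                false,               /* perspective interp */
                                GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE,
                                brw_vec8_grf(4, 0),  /* sample index */
                                1,                   /* msg_length */
                                2);                  /* response_length */
}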
3356
3357 void
3358 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3359 struct brw_reg mask)
3360 {
3361 const struct gen_device_info *devinfo = p->devinfo;
3362 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
3363 const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
3364 brw_inst *inst;
3365
3366 assert(devinfo->gen >= 7);
3367 assert(mask.type == BRW_REGISTER_TYPE_UD);
3368
3369 brw_push_insn_state(p);
3370
3371 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3372 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3373
3374 if (devinfo->gen >= 8) {
3375 /* Getting the first active channel index is easy on Gen8: Just find
3376 * the first bit set in the execution mask. The register exists on
3377 * HSW already but it reads back as all ones when the current
3378 * instruction has execution masking disabled, so it's kind of
3379 * useless.
3380 */
3381 struct brw_reg exec_mask =
3382 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3383
3384 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3385 /* Unfortunately, ce0 does not take into account the thread
3386 * dispatch mask, which may be a problem in cases where it's not
3387 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3388 * some n). Combine ce0 with the given dispatch (or vector) mask
3389 * to mask off those channels which were never dispatched by the
3390 * hardware.
3391 */
3392 brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3393 brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3394 exec_mask = vec1(dst);
3395 }
3396
3397 /* Quarter control has the effect of magically shifting the value of
3398 * ce0 so you'll get the first active channel relative to the
3399 * specified quarter control as the result.
3400 */
3401 inst = brw_FBL(p, vec1(dst), exec_mask);
3402 } else {
3403 const struct brw_reg flag = brw_flag_reg(1, 0);
3404
3405 brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3406
3407 /* Run enough instructions returning zero with execution masking and
3408 * a conditional modifier enabled in order to get the full execution
3409 * mask in f1.0. We could use a single 32-wide move here if it
3410 * weren't for the hardware bug that causes channel enables to
3411 * be applied incorrectly to the second half of 32-wide instructions
3412 * on Gen7.
3413 */
3414 const unsigned lower_size = MIN2(16, exec_size);
3415 for (unsigned i = 0; i < exec_size / lower_size; i++) {
3416 inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3417 brw_imm_uw(0));
3418 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3419 brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3420 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3421 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3422 brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3423 }
3424
3425 /* Find the first bit set in the exec_size-wide portion of the flag
3426 * register that was updated by the last sequence of MOV
3427 * instructions.
3428 */
3429 const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3430 brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3431 }
3432 } else {
3433 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3434
3435 if (devinfo->gen >= 8 &&
3436 mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3437 /* In SIMD4x2 mode the first active channel index is just the
3438 * negation of the first bit of the mask register. Note that ce0
3439 * doesn't take into account the dispatch mask, so the Gen7 path
3440 * should be used instead unless you have the guarantee that the
3441 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3442 * for some n).
3443 */
3444 inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3445 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3446 brw_imm_ud(1));
3447
3448 } else {
3449 /* Overwrite the destination first without and then with execution
3450 * masking to find out which of the channels is active.
3451 */
3452 brw_push_insn_state(p);
3453 brw_set_default_exec_size(p, BRW_EXECUTE_4);
3454 brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3455 brw_imm_ud(1));
3456
3457 inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3458 brw_imm_ud(0));
3459 brw_pop_insn_state(p);
3460 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3461 }
3462 }
3463
3464 brw_pop_insn_state(p);
3465 }
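/* A sketch of typical use (register choices hypothetical): compute the index
 * of the first live channel so that a scalar operation can be done once on
 * behalf of the whole group, then broadcast its result.
 */
static void
example_find_live_channel(struct brw_codegen *p, struct brw_reg dispatch_mask)
{
   const struct brw_reg chan_index =
      retype(brw_vec1_grf(10, 0), BRW_REGISTER_TYPE_UD);

   assert(dispatch_mask.type == BRW_REGISTER_TYPE_UD);
   brw_find_live_channel(p, chan_index, dispatch_mask);
   /* chan_index can now feed brw_broadcast() to make a value uniform. */
}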
3466
3467 void
3468 brw_broadcast(struct brw_codegen *p,
3469 struct brw_reg dst,
3470 struct brw_reg src,
3471 struct brw_reg idx)
3472 {
3473 const struct gen_device_info *devinfo = p->devinfo;
3474 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3475 brw_inst *inst;
3476
3477 brw_push_insn_state(p);
3478 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3479 brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3480
3481 assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3482 src.address_mode == BRW_ADDRESS_DIRECT);
3483
3484 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3485 idx.file == BRW_IMMEDIATE_VALUE) {
3486 /* Trivial, the source is already uniform or the index is a constant.
3487 * We will typically not get here if the optimizer is doing its job, but
3488 * asserting would be mean.
3489 */
3490 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3491 brw_MOV(p, dst,
3492 (align1 ? stride(suboffset(src, i), 0, 1, 0) :
3493 stride(suboffset(src, 4 * i), 0, 4, 1)));
3494 } else {
3495 if (align1) {
3496 const struct brw_reg addr =
3497 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3498 const unsigned offset = src.nr * REG_SIZE + src.subnr;
3499 /* Limit in bytes of the signed indirect addressing immediate. */
3500 const unsigned limit = 512;
3501
3502 brw_push_insn_state(p);
3503 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3504 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3505
3506 /* Take into account the component size and horizontal stride. */
3507 assert(src.vstride == src.hstride + src.width);
3508 brw_SHL(p, addr, vec1(idx),
3509 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
3510 src.hstride - 1));
3511
3512 /* We can only address up to limit bytes using the indirect
3513 * addressing immediate, account for the difference if the source
3514 * register is above this limit.
3515 */
3516 if (offset >= limit)
3517 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
3518
3519 brw_pop_insn_state(p);
3520
3521 /* Use indirect addressing to fetch the specified component. */
3522 brw_MOV(p, dst,
3523 retype(brw_vec1_indirect(addr.subnr, offset % limit),
3524 src.type));
3525 } else {
3526 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3527 * to all bits of a flag register,
3528 */
3529 inst = brw_MOV(p,
3530 brw_null_reg(),
3531 stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3532 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3533 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3534 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3535
3536 /* and use predicated SEL to pick the right channel. */
3537 inst = brw_SEL(p, dst,
3538 stride(suboffset(src, 4), 4, 4, 1),
3539 stride(src, 4, 4, 1));
3540 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3541 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3542 }
3543 }
3544
3545 brw_pop_insn_state(p);
3546 }
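/* Worked example of the Align1 address arithmetic above, with illustrative
 * values: for a float source (type_sz == 4) with a horizontal stride of one
 * element, the SHL computes idx << (log2(4) + 1 - 1) == idx * 4, i.e. the
 * byte offset of component idx within the source region.
 */
static void
example_broadcast(struct brw_codegen *p, struct brw_reg idx)
{
   const struct brw_reg dst = brw_vec1_grf(20, 0); /* hypothetical */
   const struct brw_reg src = brw_vec8_grf(4, 0);  /* hypothetical */

   brw_broadcast(p, dst, src, idx);
}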
3547
3548 /**
3549 * This instruction is generated as a single-channel align1 instruction by
3550 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3551 *
3552 * We can't use the typed atomic op in the FS because that has the execution
3553 * mask ANDed with the pixel mask, but we just want to write the one dword for
3554 * all the pixels.
3555 *
3556 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3557 * one u32. So we use the same untyped atomic write message as the pixel
3558 * shader.
3559 *
3560 * The untyped atomic operation requires a BUFFER surface type with RAW
3561 * format, and is only accessible through the legacy DATA_CACHE dataport
3562 * messages.
3563 */
3564 void brw_shader_time_add(struct brw_codegen *p,
3565 struct brw_reg payload,
3566 uint32_t surf_index)
3567 {
3568 const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3569 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3570 GEN7_SFID_DATAPORT_DATA_CACHE);
3571 assert(p->devinfo->gen >= 7);
3572
3573 brw_push_insn_state(p);
3574 brw_set_default_access_mode(p, BRW_ALIGN_1);
3575 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3576 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3577 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3578
3579 /* We use brw_vec1_reg and unmasked because we want to increment the given
3580 * offset only once.
3581 */
3582 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3583 BRW_ARF_NULL, 0));
3584 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3585 payload.nr, 0));
3586 brw_set_src1(p, send, brw_imm_ud(0));
3587 brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3588 brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3589 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3590
3591 brw_pop_insn_state(p);
3592 }
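/* Usage sketch (payload register and surface index hypothetical): the payload
 * carries the shader_time buffer offset and the time delta consumed by the
 * untyped atomic ADD emitted above.
 */
static void
example_shader_time_add(struct brw_codegen *p)
{
   brw_shader_time_add(p, brw_vec8_grf(100, 0), 0 /* surf_index */);
}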
3593
3594
3595 /**
3596 * Emit the SEND message for a barrier
3597 */
3598 void
3599 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3600 {
3601 const struct gen_device_info *devinfo = p->devinfo;
3602 struct brw_inst *inst;
3603
3604 assert(devinfo->gen >= 7);
3605
3606 brw_push_insn_state(p);
3607 brw_set_default_access_mode(p, BRW_ALIGN_1);
3608 inst = next_insn(p, BRW_OPCODE_SEND);
3609 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3610 brw_set_src0(p, inst, src);
3611 brw_set_src1(p, inst, brw_null_reg());
3612
3613 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3614 1 /* msg_length */,
3615 0 /* response_length */,
3616 false /* header_present */,
3617 false /* end_of_thread */);
3618
3619 brw_inst_set_gateway_notify(devinfo, inst, 1);
3620 brw_inst_set_gateway_subfuncid(devinfo, inst,
3621 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3622
3623 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3624 brw_pop_insn_state(p);
3625 }
3626
3627
3628 /**
3629 * Emit the wait instruction for a barrier
3630 */
3631 void
3632 brw_WAIT(struct brw_codegen *p)
3633 {
3634 const struct gen_device_info *devinfo = p->devinfo;
3635 struct brw_inst *insn;
3636
3637 struct brw_reg src = brw_notification_reg();
3638
3639 insn = next_insn(p, BRW_OPCODE_WAIT);
3640 brw_set_dest(p, insn, src);
3641 brw_set_src0(p, insn, src);
3642 brw_set_src1(p, insn, brw_null_reg());
3643
3644 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3645 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3646 }
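/* Sketch of the usual pairing (payload register hypothetical): each thread in
 * the group signals the gateway via brw_barrier() and then blocks in
 * brw_WAIT() until the notification register is written back.
 */
static void
example_barrier_sync(struct brw_codegen *p)
{
   /* Message header containing the barrier ID, prepared by the caller. */
   brw_barrier(p, brw_vec8_grf(1, 0));
   brw_WAIT(p);
}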