i965: Push down inclusion of brw_program.h.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_program.h"
27 #include "glsl/ir_uniform.h"
28 #include "program/sampler.h"
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
33 const src_reg &src0, const src_reg &src1,
34 const src_reg &src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->predicate = BRW_PREDICATE_NONE;
48 this->predicate_inverse = false;
49 this->target = 0;
50 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
51 this->shadow_compare = false;
52 this->ir = NULL;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_size = 0;
55 this->flag_subreg = 0;
56 this->mlen = 0;
57 this->base_mrf = 0;
58 this->offset = 0;
59 this->annotation = NULL;
60 }
61
62 vec4_instruction *
63 vec4_visitor::emit(vec4_instruction *inst)
64 {
65 inst->ir = this->base_ir;
66 inst->annotation = this->current_annotation;
67
68 this->instructions.push_tail(inst);
69
70 return inst;
71 }
72
73 vec4_instruction *
74 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
75 vec4_instruction *new_inst)
76 {
77 new_inst->ir = inst->ir;
78 new_inst->annotation = inst->annotation;
79
80 inst->insert_before(block, new_inst);
81
82 return inst;
83 }
84
85 vec4_instruction *
86 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
87 const src_reg &src1, const src_reg &src2)
88 {
89 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
90 }
91
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
95 const src_reg &src1)
96 {
97 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
102 {
103 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
108 {
109 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
110 }
111
112 vec4_instruction *
113 vec4_visitor::emit(enum opcode opcode)
114 {
115 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
116 }
117
118 #define ALU1(op) \
119 vec4_instruction * \
120 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
121 { \
122 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
123 }
124
125 #define ALU2(op) \
126 vec4_instruction * \
127 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
128 const src_reg &src1) \
129 { \
130 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
131 src0, src1); \
132 }
133
134 #define ALU2_ACC(op) \
135 vec4_instruction * \
136 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
137 const src_reg &src1) \
138 { \
139 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
140 BRW_OPCODE_##op, dst, src0, src1); \
141 inst->writes_accumulator = true; \
142 return inst; \
143 }
144
145 #define ALU3(op) \
146 vec4_instruction * \
147 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
148 const src_reg &src1, const src_reg &src2) \
149 { \
150 assert(devinfo->gen >= 6); \
151 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
152 src0, src1, src2); \
153 }
154
155 ALU1(NOT)
156 ALU1(MOV)
157 ALU1(FRC)
158 ALU1(RNDD)
159 ALU1(RNDE)
160 ALU1(RNDZ)
161 ALU1(F32TO16)
162 ALU1(F16TO32)
163 ALU2(ADD)
164 ALU2(MUL)
165 ALU2_ACC(MACH)
166 ALU2(AND)
167 ALU2(OR)
168 ALU2(XOR)
169 ALU2(DP3)
170 ALU2(DP4)
171 ALU2(DPH)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2_ACC(ADDC)
185 ALU2_ACC(SUBB)
186 ALU2(MAC)
187
188 /** Gen4 predicated IF. */
189 vec4_instruction *
190 vec4_visitor::IF(enum brw_predicate predicate)
191 {
192 vec4_instruction *inst;
193
194 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 vec4_instruction *
202 vec4_visitor::IF(src_reg src0, src_reg src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(devinfo->gen == 6);
206
207 vec4_instruction *inst;
208
209 resolve_ud_negate(&src0);
210 resolve_ud_negate(&src1);
211
212 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
213 src0, src1);
214 inst->conditional_mod = condition;
215
216 return inst;
217 }
218
219 /**
220 * CMP: Sets the low bit of the destination channels with the result
221 * of the comparison, while the upper bits are undefined, and updates
222 * the flag register with the packed 16 bits of the result.
223 */
224 vec4_instruction *
225 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
226 enum brw_conditional_mod condition)
227 {
228 vec4_instruction *inst;
229
230 /* Take the instruction:
231 *
232 * CMP null<d> src0<f> src1<f>
233 *
234 * Original gen4 does type conversion to the destination type before
235 * comparison, producing garbage results for floating point comparisons.
236 *
237 * The destination type doesn't matter on newer generations, so we set the
238 * type to match src0 so we can compact the instruction.
239 */
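/* Illustrative example (not from the PRM): with a D-typed destination,
 * float sources such as 0.7 and 0.3 would both be converted to integer 0
 * before the comparison, so the comparison could no longer tell them
 * apart.
 */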
240 dst.type = src0.type;
241
242 resolve_ud_negate(&src0);
243 resolve_ud_negate(&src1);
244
245 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
246 inst->conditional_mod = condition;
247
248 return inst;
249 }
250
251 vec4_instruction *
252 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
253 {
254 vec4_instruction *inst;
255
256 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
257 dst, index);
258 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
259 inst->mlen = 2;
260
261 return inst;
262 }
263
264 vec4_instruction *
265 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
266 const src_reg &index)
267 {
268 vec4_instruction *inst;
269
270 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
271 dst, src, index);
272 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
273 inst->mlen = 3;
274
275 return inst;
276 }
277
278 src_reg
279 vec4_visitor::fix_3src_operand(const src_reg &src)
280 {
281 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
282 * able to use vertical stride of zero to replicate the vec4 uniform, like
283 *
284 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
285 *
286 * But you can't, since vertical stride is always four in three-source
287 * instructions. Instead, insert a MOV instruction to do the replication so
288 * that the three-source instruction can consume it.
289 */
290
291 /* The MOV is only needed if the source is a uniform or immediate. */
292 if (src.file != UNIFORM && src.file != IMM)
293 return src;
294
295 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
296 return src;
297
298 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
299 expanded.type = src.type;
300 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
301 return src_reg(expanded);
302 }
303
304 src_reg
305 vec4_visitor::resolve_source_modifiers(const src_reg &src)
306 {
307 if (!src.abs && !src.negate)
308 return src;
309
310 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
311 resolved.type = src.type;
312 emit(MOV(resolved, src));
313
314 return src_reg(resolved);
315 }
316
317 src_reg
318 vec4_visitor::fix_math_operand(const src_reg &src)
319 {
320 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
321 return src;
322
323 /* The gen6 math instruction ignores the source modifiers --
324 * swizzle, abs, negate, and at least some parts of the register
325 * region description.
326 *
327 * Rather than trying to enumerate all these cases, *always* expand the
328 * operand to a temp GRF for gen6.
329 *
330 * For gen7, keep the operand as-is, except if immediate, which gen7 still
331 * can't use.
332 */
333
334 if (devinfo->gen == 7 && src.file != IMM)
335 return src;
336
337 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
338 expanded.type = src.type;
339 emit(MOV(expanded, src));
340 return src_reg(expanded);
341 }
342
343 vec4_instruction *
344 vec4_visitor::emit_math(enum opcode opcode,
345 const dst_reg &dst,
346 const src_reg &src0, const src_reg &src1)
347 {
348 vec4_instruction *math =
349 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
350
351 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
352 /* MATH on Gen6 must be align1, so we can't do writemasks. */
353 math->dst = dst_reg(this, glsl_type::vec4_type);
354 math->dst.type = dst.type;
355 math = emit(MOV(dst, src_reg(math->dst)));
356 } else if (devinfo->gen < 6) {
357 math->base_mrf = 1;
358 math->mlen = src1.file == BAD_FILE ? 1 : 2;
359 }
360
361 return math;
362 }
363
364 void
365 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
366 {
367 if (devinfo->gen < 7) {
368 unreachable("ir_unop_pack_half_2x16 should be lowered");
369 }
370
371 assert(dst.type == BRW_REGISTER_TYPE_UD);
372 assert(src0.type == BRW_REGISTER_TYPE_F);
373
374 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
375 *
376 * Because this instruction does not have a 16-bit floating-point type,
377 * the destination data type must be Word (W).
378 *
379 * The destination must be DWord-aligned and specify a horizontal stride
380 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
381 * each destination channel and the upper word is not modified.
382 *
383 * The above restriction implies that the f32to16 instruction must use
384 * align1 mode, because only in align1 mode is it possible to specify
385 * horizontal stride. We choose here to defy the hardware docs and emit
386 * align16 instructions.
387 *
388 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
389 * instructions. I was partially successful in that the code passed all
390 * tests. However, the code was dubiously correct and fragile, and the
391 * tests were not harsh enough to probe that frailty. Not trusting the
392 * code, I chose instead to remain in align16 mode in defiance of the hw
393 * docs).
394 *
395 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
396 * simulator, emitting a f32to16 in align16 mode with UD as destination
397 * data type is safe. The behavior differs from that specified in the PRM
398 * in that the upper word of each destination channel is cleared to 0.
399 */
400
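/* Worked example (illustrative only): packing vec2(1.0, -2.0).  half(1.0)
 * is 0x3c00 and half(-2.0) is 0xc000, and packHalf2x16 places the first
 * component in the 16 least-significant bits, so the expected result is
 * 0xc0003c00.  The F32TO16/SHL/OR sequence below builds exactly that
 * layout in each channel.
 */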
401 dst_reg tmp_dst(this, glsl_type::uvec2_type);
402 src_reg tmp_src(tmp_dst);
403
404 #if 0
405 /* Verify the undocumented behavior on which the following instructions
406 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
407 * then the result of the bit-or instruction below will be incorrect.
408 *
409 * You should inspect the disasm output in order to verify that the MOV is
410 * not optimized away.
411 */
412 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
413 #endif
414
415 /* Give tmp the form below, where "." means untouched.
416 *
417 * w z y x w z y x
418 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
419 *
420 * That the upper word of each write-channel be 0 is required for the
421 * following bit-shift and bit-or instructions to work. Note that this
422 * relies on the undocumented hardware behavior mentioned above.
423 */
424 tmp_dst.writemask = WRITEMASK_XY;
425 emit(F32TO16(tmp_dst, src0));
426
427 /* Give the write-channels of dst the form:
428 * 0xhhhh0000
429 */
430 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
431 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
432
433 /* Finally, give the write-channels of dst the form of packHalf2x16's
434 * output:
435 * 0xhhhhllll
436 */
437 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
438 emit(OR(dst, src_reg(dst), tmp_src));
439 }
440
441 void
442 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
443 {
444 if (devinfo->gen < 7) {
445 unreachable("ir_unop_unpack_half_2x16 should be lowered");
446 }
447
448 assert(dst.type == BRW_REGISTER_TYPE_F);
449 assert(src0.type == BRW_REGISTER_TYPE_UD);
450
451 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
452 *
453 * Because this instruction does not have a 16-bit floating-point type,
454 * the source data type must be Word (W). The destination type must be
455 * F (Float).
456 *
457 * To use W as the source data type, we must adjust horizontal strides,
458 * which is only possible in align1 mode. All my [chadv] attempts at
459 * emitting align1 instructions for unpackHalf2x16 failed to pass the
460 * Piglit tests, so I gave up.
461 *
462 * I've verified that, on gen7 hardware and the simulator, it is safe to
463 * emit f16to32 in align16 mode with UD as source data type.
464 */
465
466 dst_reg tmp_dst(this, glsl_type::uvec2_type);
467 src_reg tmp_src(tmp_dst);
468
469 tmp_dst.writemask = WRITEMASK_X;
470 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
471
472 tmp_dst.writemask = WRITEMASK_Y;
473 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
474
475 dst.writemask = WRITEMASK_XY;
476 emit(F16TO32(dst, tmp_src));
477 }
478
479 void
480 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
481 {
482 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
483 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
484 * is not suitable to generate the shift values, but we can use the packed
485 * vector float and a type-converting MOV.
486 */
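/* A note on the immediate below, assuming the usual 8-bit restricted-float
 * "VF" encoding (1 sign bit, 3 exponent bits biased by 3, 4 mantissa bits):
 * 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0, and the
 * type-converting MOV turns those into the shift counts <0, 8, 16, 24>.
 */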
487 dst_reg shift(this, glsl_type::uvec4_type);
488 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
489
490 dst_reg shifted(this, glsl_type::uvec4_type);
491 src0.swizzle = BRW_SWIZZLE_XXXX;
492 emit(SHR(shifted, src0, src_reg(shift)));
493
494 shifted.type = BRW_REGISTER_TYPE_UB;
495 dst_reg f(this, glsl_type::vec4_type);
496 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
497
498 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
499 }
500
501 void
502 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
503 {
504 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
505 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
506 * is not suitable to generate the shift values, but we can use the packed
507 * vector float and a type-converting MOV.
508 */
509 dst_reg shift(this, glsl_type::uvec4_type);
510 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
511
512 dst_reg shifted(this, glsl_type::uvec4_type);
513 src0.swizzle = BRW_SWIZZLE_XXXX;
514 emit(SHR(shifted, src0, src_reg(shift)));
515
516 shifted.type = BRW_REGISTER_TYPE_B;
517 dst_reg f(this, glsl_type::vec4_type);
518 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
519
520 dst_reg scaled(this, glsl_type::vec4_type);
521 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
522
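/* The clamp below matters for the most negative byte (illustrative
 * example): 0x80 is -128, and -128 / 127.0 is roughly -1.008, which
 * unpackSnorm4x8 is specified to clamp to -1.0.
 */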
523 dst_reg max(this, glsl_type::vec4_type);
524 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
525 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
526 }
527
528 void
529 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
530 {
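/* Worked example (illustrative only): a component of 0.5 is saturated,
 * scaled to 127.5, rounded to the even value 128 by RNDE, and packed as
 * the byte 0x80.
 */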
531 dst_reg saturated(this, glsl_type::vec4_type);
532 vec4_instruction *inst = emit(MOV(saturated, src0));
533 inst->saturate = true;
534
535 dst_reg scaled(this, glsl_type::vec4_type);
536 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
537
538 dst_reg rounded(this, glsl_type::vec4_type);
539 emit(RNDE(rounded, src_reg(scaled)));
540
541 dst_reg u(this, glsl_type::uvec4_type);
542 emit(MOV(u, src_reg(rounded)));
543
544 src_reg bytes(u);
545 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
546 }
547
548 void
549 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
550 {
551 dst_reg max(this, glsl_type::vec4_type);
552 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
553
554 dst_reg min(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
556
557 dst_reg scaled(this, glsl_type::vec4_type);
558 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
559
560 dst_reg rounded(this, glsl_type::vec4_type);
561 emit(RNDE(rounded, src_reg(scaled)));
562
563 dst_reg i(this, glsl_type::ivec4_type);
564 emit(MOV(i, src_reg(rounded)));
565
566 src_reg bytes(i);
567 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
568 }
569
570 /**
571 * Returns the minimum number of vec4 elements needed to pack a type.
572 *
573 * For simple types, it will return 1 (a single vec4); for matrices, the
574 * number of columns; for arrays and structs, the sum of the sizes of
575 * their elements; and for samplers and atomics, zero.
576 *
577 * This method is useful to calculate how much register space is needed to
578 * store a particular type.
579 */
580 extern "C" int
581 type_size_vec4(const struct glsl_type *type)
582 {
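/* Illustrative example: a struct { vec3 a; mat2 b; float c[2]; } takes
 * 1 (vec3) + 2 (mat2 columns) + 2 (float[2]) = 5 vec4 slots.
 */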
583 unsigned int i;
584 int size;
585
586 switch (type->base_type) {
587 case GLSL_TYPE_UINT:
588 case GLSL_TYPE_INT:
589 case GLSL_TYPE_FLOAT:
590 case GLSL_TYPE_BOOL:
591 if (type->is_matrix()) {
592 return type->matrix_columns;
593 } else {
594 /* Regardless of the size of the vector, it gets a vec4. This is bad
595 * packing for things like floats, but otherwise arrays become a
596 * mess. Hopefully a later pass over the code can pack scalars
597 * down if appropriate.
598 */
599 return 1;
600 }
601 case GLSL_TYPE_ARRAY:
602 assert(type->length > 0);
603 return type_size_vec4(type->fields.array) * type->length;
604 case GLSL_TYPE_STRUCT:
605 size = 0;
606 for (i = 0; i < type->length; i++) {
607 size += type_size_vec4(type->fields.structure[i].type);
608 }
609 return size;
610 case GLSL_TYPE_SUBROUTINE:
611 return 1;
612
613 case GLSL_TYPE_SAMPLER:
614 /* Samplers take up no register space, since they're baked in at
615 * link time.
616 */
617 return 0;
618 case GLSL_TYPE_ATOMIC_UINT:
619 return 0;
620 case GLSL_TYPE_IMAGE:
621 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
622 case GLSL_TYPE_VOID:
623 case GLSL_TYPE_DOUBLE:
624 case GLSL_TYPE_ERROR:
625 case GLSL_TYPE_INTERFACE:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
631
632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
633 {
634 init();
635
636 this->file = VGRF;
637 this->nr = v->alloc.allocate(type_size_vec4(type));
638
639 if (type->is_array() || type->is_record()) {
640 this->swizzle = BRW_SWIZZLE_NOOP;
641 } else {
642 this->swizzle = brw_swizzle_for_size(type->vector_elements);
643 }
644
645 this->type = brw_type_for_base_type(type);
646 }
647
648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
649 {
650 assert(size > 0);
651
652 init();
653
654 this->file = VGRF;
655 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
656
657 this->swizzle = BRW_SWIZZLE_NOOP;
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
663 {
664 init();
665
666 this->file = VGRF;
667 this->nr = v->alloc.allocate(type_size_vec4(type));
668
669 if (type->is_array() || type->is_record()) {
670 this->writemask = WRITEMASK_XYZW;
671 } else {
672 this->writemask = (1 << type->vector_elements) - 1;
673 }
674
675 this->type = brw_type_for_base_type(type);
676 }
677
678 vec4_instruction *
679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
680 src_reg src0, src_reg src1)
681 {
682 vec4_instruction *inst;
683
684 if (devinfo->gen >= 6) {
685 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
686 inst->conditional_mod = conditionalmod;
687 } else {
688 emit(CMP(dst, src0, src1, conditionalmod));
689
690 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
691 inst->predicate = BRW_PREDICATE_NORMAL;
692 }
693
694 return inst;
695 }
696
697 vec4_instruction *
698 vec4_visitor::emit_lrp(const dst_reg &dst,
699 const src_reg &x, const src_reg &y, const src_reg &a)
700 {
701 if (devinfo->gen >= 6) {
702 /* Note that the instruction's argument order is reversed from GLSL
703 * and the IR.
704 */
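/* Assuming the usual hardware LRP semantics of
 * dst = src0 * src1 + (1 - src0) * src2, passing (a, y, x) yields
 * a * y + (1 - a) * x, i.e. GLSL's mix(x, y, a).
 */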
705 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
706 fix_3src_operand(x)));
707 } else {
708 /* Earlier generations don't support three source operations, so we
709 * need to emit x*(1-a) + y*a.
710 */
711 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
712 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
713 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
714 y_times_a.writemask = dst.writemask;
715 one_minus_a.writemask = dst.writemask;
716 x_times_one_minus_a.writemask = dst.writemask;
717
718 emit(MUL(y_times_a, y, a));
719 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
720 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
721 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
722 }
723 }
724
725 /**
726 * Emits the instructions needed to perform a pull constant load. before_block
727 * and before_inst can be NULL in which case the instruction will be appended
728 * to the end of the instruction list.
729 */
730 void
731 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
732 src_reg surf_index,
733 src_reg offset_reg,
734 bblock_t *before_block,
735 vec4_instruction *before_inst)
736 {
737 assert((before_inst == NULL && before_block == NULL) ||
738 (before_inst && before_block));
739
740 vec4_instruction *pull;
741
742 if (devinfo->gen >= 9) {
743 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
744 src_reg header(this, glsl_type::uvec4_type, 2);
745
746 pull = new(mem_ctx)
747 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
748 dst_reg(header));
749
750 if (before_inst)
751 emit_before(before_block, before_inst, pull);
752 else
753 emit(pull);
754
755 dst_reg index_reg = retype(offset(dst_reg(header), 1),
756 offset_reg.type);
757 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
758
759 if (before_inst)
760 emit_before(before_block, before_inst, pull);
761 else
762 emit(pull);
763
764 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
765 dst,
766 surf_index,
767 header);
768 pull->mlen = 2;
769 pull->header_size = 1;
770 } else if (devinfo->gen >= 7) {
771 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
772
773 grf_offset.type = offset_reg.type;
774
775 pull = MOV(grf_offset, offset_reg);
776
777 if (before_inst)
778 emit_before(before_block, before_inst, pull);
779 else
780 emit(pull);
781
782 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
783 dst,
784 surf_index,
785 src_reg(grf_offset));
786 pull->mlen = 1;
787 } else {
788 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
789 dst,
790 surf_index,
791 offset_reg);
792 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
793 pull->mlen = 1;
794 }
795
796 if (before_inst)
797 emit_before(before_block, before_inst, pull);
798 else
799 emit(pull);
800 }
801
802 src_reg
803 vec4_visitor::emit_uniformize(const src_reg &src)
804 {
805 const src_reg chan_index(this, glsl_type::uint_type);
806 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
807 src.type);
808
809 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
810 ->force_writemask_all = true;
811 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
812 ->force_writemask_all = true;
813
814 return src_reg(dst);
815 }
816
817 src_reg
818 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
819 src_reg coordinate, src_reg sampler)
820 {
821 vec4_instruction *inst =
822 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
823 dst_reg(this, glsl_type::uvec4_type));
824 inst->base_mrf = 2;
825 inst->src[1] = sampler;
826
827 int param_base;
828
829 if (devinfo->gen >= 9) {
830 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
831 vec4_instruction *header_inst = new(mem_ctx)
832 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
833 dst_reg(MRF, inst->base_mrf));
834
835 emit(header_inst);
836
837 inst->mlen = 2;
838 inst->header_size = 1;
839 param_base = inst->base_mrf + 1;
840 } else {
841 inst->mlen = 1;
842 param_base = inst->base_mrf;
843 }
844
845 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
846 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
847 int zero_mask = 0xf & ~coord_mask;
848
849 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
850 coordinate));
851
852 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
853 brw_imm_d(0)));
854
855 emit(inst);
856 return src_reg(inst->dst);
857 }
858
859 bool
860 vec4_visitor::is_high_sampler(src_reg sampler)
861 {
862 if (devinfo->gen < 8 && !devinfo->is_haswell)
863 return false;
864
865 return sampler.file != IMM || sampler.ud >= 16;
866 }
867
868 void
869 vec4_visitor::emit_texture(ir_texture_opcode op,
870 dst_reg dest,
871 const glsl_type *dest_type,
872 src_reg coordinate,
873 int coord_components,
874 src_reg shadow_comparitor,
875 src_reg lod, src_reg lod2,
876 src_reg sample_index,
877 uint32_t constant_offset,
878 src_reg offset_value,
879 src_reg mcs,
880 bool is_cube_array,
881 uint32_t sampler,
882 src_reg sampler_reg)
883 {
884 /* The sampler can only meaningfully compute LOD for fragment shader
885 * messages. For all other stages, we change the opcode to TXL and hardcode
886 * the LOD to 0.
887 *
888 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
889 * valid LOD argument.
890 */
891 if (op == ir_tex || op == ir_query_levels) {
892 assert(lod.file == BAD_FILE);
893 lod = brw_imm_f(0.0f);
894 }
895
896 enum opcode opcode;
897 switch (op) {
898 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
899 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
900 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
901 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
902 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
903 SHADER_OPCODE_TXF_CMS); break;
904 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
905 case ir_tg4: opcode = offset_value.file != BAD_FILE
906 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
907 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
908 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
909 case ir_txb:
910 unreachable("TXB is not valid for vertex shaders.");
911 case ir_lod:
912 unreachable("LOD is not valid for vertex shaders.");
913 case ir_samples_identical: {
914 /* There are some challenges implementing this for vec4, and it seems
915 * unlikely to be used anyway. For now, just always return false.
916 */
917 emit(MOV(dest, brw_imm_ud(0u)));
918 return;
919 }
920 default:
921 unreachable("Unrecognized tex op");
922 }
923
924 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
925
926 inst->offset = constant_offset;
927
928 /* The message header is necessary for:
929 * - Gen4 (always)
930 * - Gen9+ for selecting SIMD4x2
931 * - Texel offsets
932 * - Gather channel selection
933 * - Sampler indices too large to fit in a 4-bit value.
934 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
935 */
936 inst->header_size =
937 (devinfo->gen < 5 || devinfo->gen >= 9 ||
938 inst->offset != 0 || op == ir_tg4 ||
939 op == ir_texture_samples ||
940 is_high_sampler(sampler_reg)) ? 1 : 0;
941 inst->base_mrf = 2;
942 inst->mlen = inst->header_size;
943 inst->dst.writemask = WRITEMASK_XYZW;
944 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
945
946 inst->src[1] = sampler_reg;
947
948 /* MRF for the first parameter */
949 int param_base = inst->base_mrf + inst->header_size;
950
951 if (op == ir_txs || op == ir_query_levels) {
952 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
953 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
954 inst->mlen++;
955 } else if (op == ir_texture_samples) {
956 inst->dst.writemask = WRITEMASK_X;
957 } else {
958 /* Load the coordinate */
959 /* FINISHME: gl_clamp_mask and saturate */
960 int coord_mask = (1 << coord_components) - 1;
961 int zero_mask = 0xf & ~coord_mask;
962
963 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
964 coordinate));
965 inst->mlen++;
966
967 if (zero_mask != 0) {
968 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
969 brw_imm_d(0)));
970 }
971 /* Load the shadow comparitor */
972 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
973 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
974 WRITEMASK_X),
975 shadow_comparitor));
976 inst->mlen++;
977 }
978
979 /* Load the LOD info */
980 if (op == ir_tex || op == ir_txl) {
981 int mrf, writemask;
982 if (devinfo->gen >= 5) {
983 mrf = param_base + 1;
984 if (shadow_comparitor.file != BAD_FILE) {
985 writemask = WRITEMASK_Y;
986 /* mlen already incremented */
987 } else {
988 writemask = WRITEMASK_X;
989 inst->mlen++;
990 }
991 } else /* devinfo->gen == 4 */ {
992 mrf = param_base;
993 writemask = WRITEMASK_W;
994 }
995 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
996 } else if (op == ir_txf) {
997 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
998 } else if (op == ir_txf_ms) {
999 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1000 sample_index));
1001 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1002 /* MCS data is stored in the first two channels of `mcs`, but we
1003 * need to get it into the .y and .z channels of the second vec4
1004 * of params.
1005 */
1006 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1007 emit(MOV(dst_reg(MRF, param_base + 1,
1008 glsl_type::uint_type, WRITEMASK_YZ),
1009 mcs));
1010 } else if (devinfo->gen >= 7) {
1011 /* MCS data is in the first channel of `mcs`, but we need to get it into
1012 * the .y channel of the second vec4 of params, so replicate .x across
1013 * the whole vec4 and then mask off everything except .y
1014 */
1015 mcs.swizzle = BRW_SWIZZLE_XXXX;
1016 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1017 mcs));
1018 }
1019 inst->mlen++;
1020 } else if (op == ir_txd) {
1021 const brw_reg_type type = lod.type;
1022
1023 if (devinfo->gen >= 5) {
1024 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1025 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1026 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1027 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1028 inst->mlen++;
1029
1030 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1031 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1032 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1033 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1034 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1035 inst->mlen++;
1036
1037 if (shadow_comparitor.file != BAD_FILE) {
1038 emit(MOV(dst_reg(MRF, param_base + 2,
1039 shadow_comparitor.type, WRITEMASK_Z),
1040 shadow_comparitor));
1041 }
1042 }
1043 } else /* devinfo->gen == 4 */ {
1044 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1045 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1046 inst->mlen += 2;
1047 }
1048 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1049 if (shadow_comparitor.file != BAD_FILE) {
1050 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1051 shadow_comparitor));
1052 }
1053
1054 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1055 offset_value));
1056 inst->mlen++;
1057 }
1058 }
1059
1060 emit(inst);
1061
1062 /* Fix up the number of layers (z) for cube arrays: the hardware returns
1063 * faces * layers, but the spec requires just layers.
1064 */
1065 if (op == ir_txs && is_cube_array) {
1066 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1067 writemask(inst->dst, WRITEMASK_Z),
1068 src_reg(inst->dst), brw_imm_d(6));
1069 }
1070
1071 if (devinfo->gen == 6 && op == ir_tg4) {
1072 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1073 }
1074
1075 if (op == ir_query_levels) {
1076 /* # levels is in .w */
1077 src_reg swizzled(dest);
1078 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1079 SWIZZLE_W, SWIZZLE_W);
1080 emit(MOV(dest, swizzled));
1081 }
1082 }
1083
1084 /**
1085 * Apply workarounds for Gen6 gather with UINT/SINT
1086 */
1087 void
1088 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1089 {
1090 if (!wa)
1091 return;
1092
1093 int width = (wa & WA_8BIT) ? 8 : 16;
1094 dst_reg dst_f = dst;
1095 dst_f.type = BRW_REGISTER_TYPE_F;
1096
1097 /* Convert from UNORM to UINT */
1098 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1099 emit(MOV(dst, src_reg(dst_f)));
1100
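/* Illustrative example, assuming the workaround exists because Gen6
 * gather4 returns UINT/SINT texels as UNORM-style floats: for an 8-bit
 * SINT format, a stored byte of 0xff comes back as 1.0, the multiply
 * above recovers 255, and the SHL/ASR pair below sign-extends that
 * to -1.
 */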
1101 if (wa & WA_SIGN) {
1102 /* Reinterpret the UINT value as a signed INT value by
1103 * shifting the sign bit into place, then shifting back
1104 * preserving sign.
1105 */
1106 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1107 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1108 }
1109 }
1110
1111 void
1112 vec4_visitor::gs_emit_vertex(int stream_id)
1113 {
1114 unreachable("not reached");
1115 }
1116
1117 void
1118 vec4_visitor::gs_end_primitive()
1119 {
1120 unreachable("not reached");
1121 }
1122
1123 void
1124 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1125 dst_reg dst, src_reg surf_offset,
1126 src_reg src0, src_reg src1)
1127 {
1128 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1129 src_reg src_payload(this, glsl_type::uint_type, mlen);
1130 dst_reg payload(src_payload);
1131 payload.writemask = WRITEMASK_X;
1132
1133 /* Set the atomic operation offset. */
1134 emit(MOV(offset(payload, 0), surf_offset));
1135 unsigned i = 1;
1136
1137 /* Set the atomic operation arguments. */
1138 if (src0.file != BAD_FILE) {
1139 emit(MOV(offset(payload, i), src0));
1140 i++;
1141 }
1142
1143 if (src1.file != BAD_FILE) {
1144 emit(MOV(offset(payload, i), src1));
1145 i++;
1146 }
1147
1148 /* Emit the instruction. Note that this maps to the normal SIMD8
1149 * untyped atomic message on Ivy Bridge, but that's OK because
1150 * unused channels will be masked out.
1151 */
1152 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1153 src_payload,
1154 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1155 inst->mlen = mlen;
1156 }
1157
1158 void
1159 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1160 src_reg surf_offset)
1161 {
1162 dst_reg offset(this, glsl_type::uint_type);
1163 offset.writemask = WRITEMASK_X;
1164
1165 /* Set the surface read offset. */
1166 emit(MOV(offset, surf_offset));
1167
1168 /* Emit the instruction. Note that this maps to the normal SIMD8
1169 * untyped surface read message, but that's OK because unused
1170 * channels will be masked out.
1171 */
1172 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1173 src_reg(offset),
1174 brw_imm_ud(surf_index), brw_imm_d(1));
1175 inst->mlen = 1;
1176 }
1177
1178 void
1179 vec4_visitor::emit_ndc_computation()
1180 {
1181 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1182 return;
1183
1184 /* Get the position */
1185 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1186
1187 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1188 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1189 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1190
1191 current_annotation = "NDC";
1192 dst_reg ndc_w = ndc;
1193 ndc_w.writemask = WRITEMASK_W;
1194 src_reg pos_w = pos;
1195 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1196 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1197
1198 dst_reg ndc_xyz = ndc;
1199 ndc_xyz.writemask = WRITEMASK_XYZ;
1200
1201 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1202 }
1203
1204 void
1205 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1206 {
1207 if (devinfo->gen < 6 &&
1208 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1209 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1210 devinfo->has_negative_rhw_bug)) {
1211 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1212 dst_reg header1_w = header1;
1213 header1_w.writemask = WRITEMASK_W;
1214
1215 emit(MOV(header1, brw_imm_ud(0u)));
1216
1217 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1218 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1219
1220 current_annotation = "Point size";
1221 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1222 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1223 }
1224
1225 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1226 current_annotation = "Clipping flags";
1227 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1228 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1229
1230 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1231 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1232 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1233
1234 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1235 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1236 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1237 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1238 }
1239
1240 /* i965 clipping workaround:
1241 * 1) Test for negative rhw
1242 * 2) If set,
1243 * set ndc = (0,0,0,0)
1244 * set ucp[6] = 1
1245 *
1246 * Later, clipping will detect ucp[6] and ensure the primitive is
1247 * clipped against all fixed planes.
1248 */
1249 if (devinfo->has_negative_rhw_bug &&
1250 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1251 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1252 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1253 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1254 vec4_instruction *inst;
1255 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1256 inst->predicate = BRW_PREDICATE_NORMAL;
1257 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1258 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1259 inst->predicate = BRW_PREDICATE_NORMAL;
1260 }
1261
1262 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1263 } else if (devinfo->gen < 6) {
1264 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1265 } else {
1266 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1267 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1268 dst_reg reg_w = reg;
1269 reg_w.writemask = WRITEMASK_W;
1270 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1271 reg_as_src.type = reg_w.type;
1272 reg_as_src.swizzle = brw_swizzle_for_size(1);
1273 emit(MOV(reg_w, reg_as_src));
1274 }
1275 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1276 dst_reg reg_y = reg;
1277 reg_y.writemask = WRITEMASK_Y;
1278 reg_y.type = BRW_REGISTER_TYPE_D;
1279 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1280 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1281 }
1282 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1283 dst_reg reg_z = reg;
1284 reg_z.writemask = WRITEMASK_Z;
1285 reg_z.type = BRW_REGISTER_TYPE_D;
1286 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1287 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1288 }
1289 }
1290 }
1291
1292 vec4_instruction *
1293 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1294 {
1295 assert(varying < VARYING_SLOT_MAX);
1296 assert(output_reg[varying].type == reg.type);
1297 current_annotation = output_reg_annotation[varying];
1298 if (output_reg[varying].file != BAD_FILE)
1299 return emit(MOV(reg, src_reg(output_reg[varying])));
1300 else
1301 return NULL;
1302 }
1303
1304 void
1305 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1306 {
1307 reg.type = BRW_REGISTER_TYPE_F;
1308 output_reg[varying].type = reg.type;
1309
1310 switch (varying) {
1311 case VARYING_SLOT_PSIZ:
1312 {
1313 /* PSIZ is always in slot 0, and is coupled with other flags. */
1314 current_annotation = "indices, point width, clip flags";
1315 emit_psiz_and_flags(reg);
1316 break;
1317 }
1318 case BRW_VARYING_SLOT_NDC:
1319 current_annotation = "NDC";
1320 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1321 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1322 break;
1323 case VARYING_SLOT_POS:
1324 current_annotation = "gl_Position";
1325 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1326 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1327 break;
1328 case VARYING_SLOT_EDGE:
1329 /* This is present when doing unfilled polygons. We're supposed to copy
1330 * the edge flag from the user-provided vertex array
1331 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1332 * of that attribute (starts as 1.0f). This is then used in clipping to
1333 * determine which edges should be drawn as wireframe.
1334 */
1335 current_annotation = "edge flag";
1336 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1337 glsl_type::float_type, WRITEMASK_XYZW))));
1338 break;
1339 case BRW_VARYING_SLOT_PAD:
1340 /* No need to write to this slot */
1341 break;
1342 default:
1343 emit_generic_urb_slot(reg, varying);
1344 break;
1345 }
1346 }
1347
1348 static int
1349 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1350 {
1351 if (devinfo->gen >= 6) {
1352 /* URB data written (does not include the message header reg) must
1353 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1354 * section 5.4.3.2.2: URB_INTERLEAVED.
1355 *
1356 * URB entries are allocated on a multiple of 1024 bits, so an
1357 * extra 128 bits written here to make the end align to 256 is
1358 * no problem.
1359 */
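/* For example, a write of one header register plus 5 data registers
 * (mlen == 6) is padded to mlen == 7, so the 6 data registers written
 * are a whole number of 256-bit units.
 */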
1360 if ((mlen % 2) != 1)
1361 mlen++;
1362 }
1363
1364 return mlen;
1365 }
1366
1367
1368 /**
1369 * Generates the VUE payload plus the necessary URB write instructions to
1370 * output it.
1371 *
1372 * The VUE layout is documented in Volume 2a.
1373 */
1374 void
1375 vec4_visitor::emit_vertex()
1376 {
1377 /* MRF 0 is reserved for the debugger, so start with message header
1378 * in MRF 1.
1379 */
1380 int base_mrf = 1;
1381 int mrf = base_mrf;
1382 /* In the process of generating our URB write message contents, we
1383 * may need to unspill a register or load from an array. Those
1384 * reads would use MRFs 14-15.
1385 */
1386 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1387
1388 /* The following assertion verifies that max_usable_mrf causes an
1389 * even number of URB write data registers, which meets gen6's
1390 * requirement for length alignment.
1391 */
1392 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1393
1394 /* First mrf is the g0-based message header containing URB handles and
1395 * such.
1396 */
1397 emit_urb_write_header(mrf++);
1398
1399 if (devinfo->gen < 6) {
1400 emit_ndc_computation();
1401 }
1402
1403 /* We may need to split this up into several URB writes, so do them in a
1404 * loop.
1405 */
1406 int slot = 0;
1407 bool complete = false;
1408 do {
1409 /* URB offset is in URB row increments, and each of our MRFs is half of
1410 * one of those, since we're doing interleaved writes.
1411 */
1412 int offset = slot / 2;
1413
1414 mrf = base_mrf + 1;
1415 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1416 emit_urb_slot(dst_reg(MRF, mrf++),
1417 prog_data->vue_map.slot_to_varying[slot]);
1418
1419 /* If this was max_usable_mrf, we can't fit anything more into this
1420 * URB WRITE. Same thing if we reached the maximum length available.
1421 */
1422 if (mrf > max_usable_mrf ||
1423 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1424 slot++;
1425 break;
1426 }
1427 }
1428
1429 complete = slot >= prog_data->vue_map.num_slots;
1430 current_annotation = "URB write";
1431 vec4_instruction *inst = emit_urb_write_opcode(complete);
1432 inst->base_mrf = base_mrf;
1433 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1434 inst->offset += offset;
1435 } while(!complete);
1436 }
1437
1438
1439 src_reg
1440 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1441 src_reg *reladdr, int reg_offset)
1442 {
1443 /* Because we store the values to scratch interleaved like our
1444 * vertex data, we need to scale the vec4 index by 2.
1445 */
1446 int message_header_scale = 2;
1447
1448 /* Pre-gen6, the message header uses byte offsets instead of vec4
1449 * (16-byte) offset units.
1450 */
1451 if (devinfo->gen < 6)
1452 message_header_scale *= 16;
1453
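/* So a vec4 at reg_offset 3, for example, becomes scratch offset 6 on
 * gen6+ (interleaved rows) or byte offset 96 (3 * 2 * 16) on older
 * generations.
 */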
1454 if (reladdr) {
1455 src_reg index = src_reg(this, glsl_type::int_type);
1456
1457 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1458 brw_imm_d(reg_offset)));
1459 emit_before(block, inst, MUL(dst_reg(index), index,
1460 brw_imm_d(message_header_scale)));
1461
1462 return index;
1463 } else {
1464 return brw_imm_d(reg_offset * message_header_scale);
1465 }
1466 }
1467
1468 src_reg
1469 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1470 src_reg *reladdr, int reg_offset)
1471 {
1472 if (reladdr) {
1473 src_reg index = src_reg(this, glsl_type::int_type);
1474
1475 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1476 brw_imm_d(reg_offset)));
1477
1478 /* Pre-gen6, the message header uses byte offsets instead of vec4
1479 * (16-byte) offset units.
1480 */
1481 if (devinfo->gen < 6) {
1482 emit_before(block, inst, MUL(dst_reg(index), index, brw_imm_d(16)));
1483 }
1484
1485 return index;
1486 } else if (devinfo->gen >= 8) {
1487 /* Store the offset in a GRF so we can send-from-GRF. */
1488 src_reg offset = src_reg(this, glsl_type::int_type);
1489 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset)));
1490 return offset;
1491 } else {
1492 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1493 return brw_imm_d(reg_offset * message_header_scale);
1494 }
1495 }
1496
1497 /**
1498 * Emits an instruction before @inst to load the value named by @orig_src
1499 * from scratch space at @base_offset to @temp.
1500 *
1501 * @base_offset is measured in 32-byte units (the size of a register).
1502 */
1503 void
1504 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1505 dst_reg temp, src_reg orig_src,
1506 int base_offset)
1507 {
1508 int reg_offset = base_offset + orig_src.reg_offset;
1509 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1510 reg_offset);
1511
1512 emit_before(block, inst, SCRATCH_READ(temp, index));
1513 }
1514
1515 /**
1516 * Emits an instruction after @inst to store the value to be written
1517 * to @orig_dst to scratch space at @base_offset, from @temp.
1518 *
1519 * @base_offset is measured in 32-byte units (the size of a register).
1520 */
1521 void
1522 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1523 int base_offset)
1524 {
1525 int reg_offset = base_offset + inst->dst.reg_offset;
1526 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1527 reg_offset);
1528
1529 /* Create a temporary register to store *inst's result in.
1530 *
1531 * We have to be careful in MOVing from our temporary result register in
1532 * the scratch write. If we swizzle from channels of the temporary that
1533 * weren't initialized, it will confuse live interval analysis, which will
1534 * make spilling fail to make progress.
1535 */
1536 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1537 inst->dst.type),
1538 brw_swizzle_for_mask(inst->dst.writemask));
1539 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1540 inst->dst.writemask));
1541 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1542 if (inst->opcode != BRW_OPCODE_SEL)
1543 write->predicate = inst->predicate;
1544 write->ir = inst->ir;
1545 write->annotation = inst->annotation;
1546 inst->insert_after(block, write);
1547
1548 inst->dst.file = temp.file;
1549 inst->dst.nr = temp.nr;
1550 inst->dst.reg_offset = temp.reg_offset;
1551 inst->dst.reladdr = NULL;
1552 }
1553
1554 /**
1555 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1556 * adds the scratch read(s) before \p inst. The function also checks for
1557 * recursive reladdr scratch accesses, issuing the corresponding scratch
1558 * loads and rewriting reladdr references accordingly.
1559 *
1560 * \return \p src if it did not require a scratch load, otherwise, the
1561 * register holding the result of the scratch load that the caller should
1562 * use to rewrite src.
1563 */
1564 src_reg
1565 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1566 vec4_instruction *inst, src_reg src)
1567 {
1568 /* Resolve recursive reladdr scratch access by calling ourselves
1569 * with src.reladdr
1570 */
1571 if (src.reladdr)
1572 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1573 *src.reladdr);
1574
1575 /* Now handle scratch access on src */
1576 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1577 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1578 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1579 src.nr = temp.nr;
1580 src.reg_offset = temp.reg_offset;
1581 src.reladdr = NULL;
1582 }
1583
1584 return src;
1585 }
1586
1587 /**
1588 * We can't generally support array access in GRF space, because a
1589 * single instruction's destination can only span 2 contiguous
1590 * registers. So, we send all GRF arrays that get variable index
1591 * access to scratch space.
1592 */
1593 void
1594 vec4_visitor::move_grf_array_access_to_scratch()
1595 {
1596 int scratch_loc[this->alloc.count];
1597 memset(scratch_loc, -1, sizeof(scratch_loc));
1598
1599 /* First, calculate the set of virtual GRFs that need to be punted
1600 * to scratch due to having any array access on them, and where in
1601 * scratch.
1602 */
1603 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1604 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1605 if (scratch_loc[inst->dst.nr] == -1) {
1606 scratch_loc[inst->dst.nr] = last_scratch;
1607 last_scratch += this->alloc.sizes[inst->dst.nr];
1608 }
1609
1610 for (src_reg *iter = inst->dst.reladdr;
1611 iter->reladdr;
1612 iter = iter->reladdr) {
1613 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1614 scratch_loc[iter->nr] = last_scratch;
1615 last_scratch += this->alloc.sizes[iter->nr];
1616 }
1617 }
1618 }
1619
1620 for (int i = 0 ; i < 3; i++) {
1621 for (src_reg *iter = &inst->src[i];
1622 iter->reladdr;
1623 iter = iter->reladdr) {
1624 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1625 scratch_loc[iter->nr] = last_scratch;
1626 last_scratch += this->alloc.sizes[iter->nr];
1627 }
1628 }
1629 }
1630 }
1631
1632 /* Now, for anything that will be accessed through scratch, rewrite
1633 * it to load/store. Note that this is a _safe list walk, because
1634 * we may generate a new scratch_write instruction after the one
1635 * we're processing.
1636 */
1637 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1638 /* Set up the annotation tracking for newly generated instructions. */
1639 base_ir = inst->ir;
1640 current_annotation = inst->annotation;
1641
1642 /* First handle scratch access on the dst. Notice we have to handle
1643 * the case where the dst's reladdr also points to scratch space.
1644 */
1645 if (inst->dst.reladdr)
1646 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1647 *inst->dst.reladdr);
1648
1649 /* Now that we have handled any (possibly recursive) reladdr scratch
1650 * accesses for dst we can safely do the scratch write for dst itself
1651 */
1652 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1653 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1654
1655 /* Now handle scratch access on any src. In this case, since inst->src[i]
1656 * already is a src_reg, we can just call emit_resolve_reladdr with
1657 * inst->src[i] and it will take care of handling scratch loads for
1658 * both src and src.reladdr (recursively).
1659 */
1660 for (int i = 0 ; i < 3; i++) {
1661 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1662 inst->src[i]);
1663 }
1664 }
1665 }
1666
1667 /**
1668 * Emits an instruction before @inst to load the value named by @orig_src
1669 * from the pull constant buffer (surface) at @base_offset to @temp.
1670 */
1671 void
1672 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1673 dst_reg temp, src_reg orig_src,
1674 int base_offset)
1675 {
1676 int reg_offset = base_offset + orig_src.reg_offset;
1677 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1678 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1679 reg_offset);
1680
1681 emit_pull_constant_load_reg(temp,
1682 brw_imm_ud(index),
1683 offset,
1684 block, inst);
1685
1686 brw_mark_surface_used(&prog_data->base, index);
1687 }
1688
1689 /**
1690 * Implements array access of uniforms by inserting a
1691 * PULL_CONSTANT_LOAD instruction.
1692 *
1693 * Unlike temporary GRF array access (where we don't support it due to
1694 * the difficulty of doing relative addressing on instruction
1695 * destinations), we could potentially do array access of uniforms
1696 * that were loaded in GRF space as push constants. In real-world
1697 * usage we've seen, though, the arrays being used are always larger
1698 * than we could load as push constants, so just always move all
1699 * uniform array access out to a pull constant buffer.
1700 */
1701 void
1702 vec4_visitor::move_uniform_array_access_to_pull_constants()
1703 {
1704 int pull_constant_loc[this->uniforms];
1705 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1706 bool nested_reladdr;
1707
1708 /* Walk through and find array access of uniforms. Put a copy of that
1709 * uniform in the pull constant buffer.
1710 *
1711 * Note that we don't move constant-indexed accesses to arrays. No
1712 * testing has been done of the performance impact of this choice.
1713 */
1714 do {
1715 nested_reladdr = false;
1716
1717 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1718 for (int i = 0 ; i < 3; i++) {
1719 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1720 continue;
1721
1722 int uniform = inst->src[i].nr;
1723
1724 if (inst->src[i].reladdr->reladdr)
1725 nested_reladdr = true; /* will need another pass */
1726
1727 /* If this array isn't already present in the pull constant buffer,
1728 * add it.
1729 */
1730 if (pull_constant_loc[uniform] == -1) {
1731 const gl_constant_value **values =
1732 &stage_prog_data->param[uniform * 4];
1733
1734 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1735
1736 assert(uniform < uniform_array_size);
1737 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1738 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1739 = values[j];
1740 }
1741 }
1742
1743 /* Set up the annotation tracking for newly generated instructions. */
1744 base_ir = inst->ir;
1745 current_annotation = inst->annotation;
1746
1747 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1748
1749 emit_pull_constant_load(block, inst, temp, inst->src[i],
1750 pull_constant_loc[uniform]);
1751
1752 inst->src[i].file = temp.file;
1753 inst->src[i].nr = temp.nr;
1754 inst->src[i].reg_offset = temp.reg_offset;
1755 inst->src[i].reladdr = NULL;
1756 }
1757 }
1758 } while (nested_reladdr);
1759
1760 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1761 * no need to track them as larger-than-vec4 objects. This will be
1762 * relied on in cutting out unused uniform vectors from push
1763 * constants.
1764 */
1765 split_uniform_registers();
1766 }
1767
1768 void
1769 vec4_visitor::resolve_ud_negate(src_reg *reg)
1770 {
1771 if (reg->type != BRW_REGISTER_TYPE_UD ||
1772 !reg->negate)
1773 return;
1774
1775 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1776 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1777 *reg = temp;
1778 }
1779
1780 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1781 void *log_data,
1782 const struct brw_sampler_prog_key_data *key_tex,
1783 struct brw_vue_prog_data *prog_data,
1784 const nir_shader *shader,
1785 void *mem_ctx,
1786 bool no_spills,
1787 int shader_time_index)
1788 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1789 key_tex(key_tex),
1790 prog_data(prog_data),
1791 fail_msg(NULL),
1792 first_non_payload_grf(0),
1793 need_all_constants_in_pull_buffer(false),
1794 no_spills(no_spills),
1795 shader_time_index(shader_time_index),
1796 last_scratch(0)
1797 {
1798 this->failed = false;
1799
1800 this->base_ir = NULL;
1801 this->current_annotation = NULL;
1802 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1803
1804 this->virtual_grf_start = NULL;
1805 this->virtual_grf_end = NULL;
1806 this->live_intervals = NULL;
1807
1808 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1809
1810 this->uniforms = 0;
1811
1812 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1813 * at least one. See setup_uniforms() in brw_vec4.cpp.
1814 */
1815 this->uniform_array_size = 1;
1816 if (prog_data) {
1817 this->uniform_array_size =
1818 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1819 }
1820
1821 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1822 }
1823
1824 vec4_visitor::~vec4_visitor()
1825 {
1826 }
1827
1828
1829 void
1830 vec4_visitor::fail(const char *format, ...)
1831 {
1832 va_list va;
1833 char *msg;
1834
1835 if (failed)
1836 return;
1837
1838 failed = true;
1839
1840 va_start(va, format);
1841 msg = ralloc_vasprintf(mem_ctx, format, va);
1842 va_end(va);
1843 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1844
1845 this->fail_msg = msg;
1846
1847 if (debug_enabled) {
1848 fprintf(stderr, "%s", msg);
1849 }
1850 }
1851
1852 } /* namespace brw */