i965: Fix execution size of scalar TCS barrier setup code.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186 ALU1(DIM)
187
188 /** Gen4 predicated IF. */
189 vec4_instruction *
190 vec4_visitor::IF(enum brw_predicate predicate)
191 {
192 vec4_instruction *inst;
193
194 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 vec4_instruction *
202 vec4_visitor::IF(src_reg src0, src_reg src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(devinfo->gen == 6);
206
207 vec4_instruction *inst;
208
209 resolve_ud_negate(&src0);
210 resolve_ud_negate(&src1);
211
212 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
213 src0, src1);
214 inst->conditional_mod = condition;
215
216 return inst;
217 }
218
219 /**
220 * CMP: Sets the low bit of the destination channels with the result
221 * of the comparison, while the upper bits are undefined, and updates
222 * the flag register with the packed 16 bits of the result.
223 */
224 vec4_instruction *
225 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
226 enum brw_conditional_mod condition)
227 {
228 vec4_instruction *inst;
229
230 /* Take the instruction:
231 *
232 * CMP null<d> src0<f> src1<f>
233 *
234 * Original gen4 does type conversion to the destination type before
235 * comparison, producing garbage results for floating point comparisons.
236 *
237 * The destination type doesn't matter on newer generations, so we set the
238 * type to match src0 so we can compact the instruction.
239 */
240 dst.type = src0.type;
241
242 resolve_ud_negate(&src0);
243 resolve_ud_negate(&src1);
244
245 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
246 inst->conditional_mod = condition;
247
248 return inst;
249 }
250
251 vec4_instruction *
252 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
253 {
254 vec4_instruction *inst;
255
256 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
257 dst, index);
258 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
259 inst->mlen = 2;
260
261 return inst;
262 }
263
264 vec4_instruction *
265 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
266 const src_reg &index)
267 {
268 vec4_instruction *inst;
269
270 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
271 dst, src, index);
272 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
273 inst->mlen = 3;
274
275 return inst;
276 }
277
278 src_reg
279 vec4_visitor::fix_3src_operand(const src_reg &src)
280 {
281 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
282 * able to use vertical stride of zero to replicate the vec4 uniform, like
283 *
284 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
285 *
286 * But you can't, since vertical stride is always four in three-source
287 * instructions. Instead, insert a MOV instruction to do the replication so
288 * that the three-source instruction can consume it.
289 */
290
291 /* The MOV is only needed if the source is a uniform or immediate. */
292 if (src.file != UNIFORM && src.file != IMM)
293 return src;
294
295 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
296 return src;
297
298 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
299 expanded.type = src.type;
300 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
301 return src_reg(expanded);
302 }
303
304 src_reg
305 vec4_visitor::resolve_source_modifiers(const src_reg &src)
306 {
307 if (!src.abs && !src.negate)
308 return src;
309
310 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
311 resolved.type = src.type;
312 emit(MOV(resolved, src));
313
314 return src_reg(resolved);
315 }
316
317 src_reg
318 vec4_visitor::fix_math_operand(const src_reg &src)
319 {
320 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
321 return src;
322
323 /* The gen6 math instruction ignores the source modifiers --
324 * swizzle, abs, negate, and at least some parts of the register
325 * region description.
326 *
327 * Rather than trying to enumerate all these cases, *always* expand the
328 * operand to a temp GRF for gen6.
329 *
330 * For gen7, keep the operand as-is, except if immediate, which gen7 still
331 * can't use.
332 */
333
334 if (devinfo->gen == 7 && src.file != IMM)
335 return src;
336
337 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
338 expanded.type = src.type;
339 emit(MOV(expanded, src));
340 return src_reg(expanded);
341 }
342
343 vec4_instruction *
344 vec4_visitor::emit_math(enum opcode opcode,
345 const dst_reg &dst,
346 const src_reg &src0, const src_reg &src1)
347 {
348 vec4_instruction *math =
349 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
350
351 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
352 /* MATH on Gen6 must be align1, so we can't do writemasks. */
353 math->dst = dst_reg(this, glsl_type::vec4_type);
354 math->dst.type = dst.type;
355 math = emit(MOV(dst, src_reg(math->dst)));
356 } else if (devinfo->gen < 6) {
357 math->base_mrf = 1;
358 math->mlen = src1.file == BAD_FILE ? 1 : 2;
359 }
360
361 return math;
362 }
363
364 void
365 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
366 {
367 if (devinfo->gen < 7) {
368 unreachable("ir_unop_pack_half_2x16 should be lowered");
369 }
370
371 assert(dst.type == BRW_REGISTER_TYPE_UD);
372 assert(src0.type == BRW_REGISTER_TYPE_F);
373
374 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
375 *
376 * Because this instruction does not have a 16-bit floating-point type,
377 * the destination data type must be Word (W).
378 *
379 * The destination must be DWord-aligned and specify a horizontal stride
380 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
381 * each destination channel and the upper word is not modified.
382 *
383 * The above restriction implies that the f32to16 instruction must use
384 * align1 mode, because only in align1 mode is it possible to specify
385 * horizontal stride. We choose here to defy the hardware docs and emit
386 * align16 instructions.
387 *
388 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
389 * instructions. I was partially successful in that the code passed all
390 * tests. However, the code was dubiously correct and fragile, and the
391 * tests were not harsh enough to probe that frailty. Not trusting the
392 * code, I chose instead to remain in align16 mode in defiance of the hw
393 * docs).
394 *
395 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
396 * simulator, emitting a f32to16 in align16 mode with UD as destination
397 * data type is safe. The behavior differs from that specified in the PRM
398 * in that the upper word of each destination channel is cleared to 0.
399 */
400
401 dst_reg tmp_dst(this, glsl_type::uvec2_type);
402 src_reg tmp_src(tmp_dst);
403
404 #if 0
405 /* Verify the undocumented behavior on which the following instructions
406 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
407 * then the result of the bit-or instruction below will be incorrect.
408 *
409 * You should inspect the disasm output in order to verify that the MOV is
410 * not optimized away.
411 */
412 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
413 #endif
414
415 /* Give tmp the form below, where "." means untouched.
416 *
417 * w z y x w z y x
418 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
419 *
420 * That the upper word of each write-channel be 0 is required for the
421 * following bit-shift and bit-or instructions to work. Note that this
422 * relies on the undocumented hardware behavior mentioned above.
423 */
424 tmp_dst.writemask = WRITEMASK_XY;
425 emit(F32TO16(tmp_dst, src0));
426
427 /* Give the write-channels of dst the form:
428 * 0xhhhh0000
429 */
430 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
431 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
432
433 /* Finally, give the write-channels of dst the form of packHalf2x16's
434 * output:
435 * 0xhhhhllll
436 */
437 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
438 emit(OR(dst, src_reg(dst), tmp_src));
439 }
440
441 void
442 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
443 {
444 if (devinfo->gen < 7) {
445 unreachable("ir_unop_unpack_half_2x16 should be lowered");
446 }
447
448 assert(dst.type == BRW_REGISTER_TYPE_F);
449 assert(src0.type == BRW_REGISTER_TYPE_UD);
450
451 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
452 *
453 * Because this instruction does not have a 16-bit floating-point type,
454 * the source data type must be Word (W). The destination type must be
455 * F (Float).
456 *
457 * To use W as the source data type, we must adjust horizontal strides,
458 * which is only possible in align1 mode. All my [chadv] attempts at
459 * emitting align1 instructions for unpackHalf2x16 failed to pass the
460 * Piglit tests, so I gave up.
461 *
462 * I've verified that, on gen7 hardware and the simulator, it is safe to
463 * emit f16to32 in align16 mode with UD as source data type.
464 */
465
466 dst_reg tmp_dst(this, glsl_type::uvec2_type);
467 src_reg tmp_src(tmp_dst);
468
469 tmp_dst.writemask = WRITEMASK_X;
470 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
471
472 tmp_dst.writemask = WRITEMASK_Y;
473 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
474
475 dst.writemask = WRITEMASK_XY;
476 emit(F16TO32(dst, tmp_src));
477 }
478
479 void
480 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
481 {
482 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
483 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
484 * is not suitable to generate the shift values, but we can use the packed
485 * vector float and a type-converting MOV.
486 */
487 dst_reg shift(this, glsl_type::uvec4_type);
488 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
489
490 dst_reg shifted(this, glsl_type::uvec4_type);
491 src0.swizzle = BRW_SWIZZLE_XXXX;
492 emit(SHR(shifted, src0, src_reg(shift)));
493
494 shifted.type = BRW_REGISTER_TYPE_UB;
495 dst_reg f(this, glsl_type::vec4_type);
496 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
497
498 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
499 }
500
501 void
502 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
503 {
504 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
505 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
506 * is not suitable to generate the shift values, but we can use the packed
507 * vector float and a type-converting MOV.
508 */
509 dst_reg shift(this, glsl_type::uvec4_type);
510 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
511
512 dst_reg shifted(this, glsl_type::uvec4_type);
513 src0.swizzle = BRW_SWIZZLE_XXXX;
514 emit(SHR(shifted, src0, src_reg(shift)));
515
516 shifted.type = BRW_REGISTER_TYPE_B;
517 dst_reg f(this, glsl_type::vec4_type);
518 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
519
520 dst_reg scaled(this, glsl_type::vec4_type);
521 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
522
523 dst_reg max(this, glsl_type::vec4_type);
524 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
525 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
526 }
527
528 void
529 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
530 {
531 dst_reg saturated(this, glsl_type::vec4_type);
532 vec4_instruction *inst = emit(MOV(saturated, src0));
533 inst->saturate = true;
534
535 dst_reg scaled(this, glsl_type::vec4_type);
536 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
537
538 dst_reg rounded(this, glsl_type::vec4_type);
539 emit(RNDE(rounded, src_reg(scaled)));
540
541 dst_reg u(this, glsl_type::uvec4_type);
542 emit(MOV(u, src_reg(rounded)));
543
544 src_reg bytes(u);
545 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
546 }
547
548 void
549 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
550 {
551 dst_reg max(this, glsl_type::vec4_type);
552 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
553
554 dst_reg min(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
556
557 dst_reg scaled(this, glsl_type::vec4_type);
558 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
559
560 dst_reg rounded(this, glsl_type::vec4_type);
561 emit(RNDE(rounded, src_reg(scaled)));
562
563 dst_reg i(this, glsl_type::ivec4_type);
564 emit(MOV(i, src_reg(rounded)));
565
566 src_reg bytes(i);
567 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
568 }
569
570 /*
571 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
572 * false) elements needed to pack a type.
573 */
574 static int
575 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
576 {
577 unsigned int i;
578 int size;
579
580 switch (type->base_type) {
581 case GLSL_TYPE_UINT:
582 case GLSL_TYPE_INT:
583 case GLSL_TYPE_FLOAT:
584 case GLSL_TYPE_BOOL:
585 case GLSL_TYPE_DOUBLE:
586 if (type->is_matrix()) {
587 const glsl_type *col_type = type->column_type();
588 unsigned col_slots =
589 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
590 return type->matrix_columns * col_slots;
591 } else {
592 /* Regardless of the size of the vector, it gets a vec4. This is bad
593 * packing for things like floats, but otherwise arrays become a
594 * mess. Hopefully a later pass over the code can pack scalars
595 * down if appropriate.
596 */
597 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
598 }
599 case GLSL_TYPE_ARRAY:
600 assert(type->length > 0);
601 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
602 case GLSL_TYPE_STRUCT:
603 size = 0;
604 for (i = 0; i < type->length; i++) {
605 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
606 }
607 return size;
608 case GLSL_TYPE_SUBROUTINE:
609 return 1;
610
611 case GLSL_TYPE_SAMPLER:
612 /* Samplers take up no register space, since they're baked in at
613 * link time.
614 */
615 return 0;
616 case GLSL_TYPE_ATOMIC_UINT:
617 return 0;
618 case GLSL_TYPE_IMAGE:
619 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
620 case GLSL_TYPE_VOID:
621 case GLSL_TYPE_ERROR:
622 case GLSL_TYPE_INTERFACE:
623 case GLSL_TYPE_FUNCTION:
624 unreachable("not reached");
625 }
626
627 return 0;
628 }
629
630 /**
631 * Returns the minimum number of vec4 elements needed to pack a type.
632 *
633 * For simple types, it will return 1 (a single vec4); for matrices, the
634 * number of columns; for array and struct, the sum of the vec4_size of
635 * each of its elements; and for sampler and atomic, zero.
636 *
637 * This method is useful to calculate how much register space is needed to
638 * store a particular type.
639 */
640 extern "C" int
641 type_size_vec4(const struct glsl_type *type)
642 {
643 return type_size_xvec4(type, true);
644 }
645
646 /**
647 * Returns the minimum number of dvec4 elements needed to pack a type.
648 *
649 * For simple types, it will return 1 (a single dvec4); for matrices, the
650 * number of columns; for array and struct, the sum of the dvec4_size of
651 * each of its elements; and for sampler and atomic, zero.
652 *
653 * This method is useful to calculate how much register space is needed to
654 * store a particular type.
655 *
656 * Measuring double-precision vertex inputs as dvec4 is required because
657 * ARB_vertex_attrib_64bit states that they use the same number of locations
658 * as the single-precision version. That is, two consecutive dvec4s would be
659 * located in location "x" and location "x+1", not "x+2".
660 *
661 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
662 * remap_vs_attrs() will take into account both the location and whether the
663 * type fits in one or two vec4 slots.
664 */
665 extern "C" int
666 type_size_dvec4(const struct glsl_type *type)
667 {
668 return type_size_xvec4(type, false);
669 }
670
671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
672 {
673 init();
674
675 this->file = VGRF;
676 this->nr = v->alloc.allocate(type_size_vec4(type));
677
678 if (type->is_array() || type->is_record()) {
679 this->swizzle = BRW_SWIZZLE_NOOP;
680 } else {
681 this->swizzle = brw_swizzle_for_size(type->vector_elements);
682 }
683
684 this->type = brw_type_for_base_type(type);
685 }
686
687 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
688 {
689 assert(size > 0);
690
691 init();
692
693 this->file = VGRF;
694 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
695
696 this->swizzle = BRW_SWIZZLE_NOOP;
697
698 this->type = brw_type_for_base_type(type);
699 }
700
701 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
702 {
703 init();
704
705 this->file = VGRF;
706 this->nr = v->alloc.allocate(type_size_vec4(type));
707
708 if (type->is_array() || type->is_record()) {
709 this->writemask = WRITEMASK_XYZW;
710 } else {
711 this->writemask = (1 << type->vector_elements) - 1;
712 }
713
714 this->type = brw_type_for_base_type(type);
715 }
716
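/**
 * Emit a SEL instruction with the given conditional mod, which implements
 * MIN (BRW_CONDITIONAL_L) or MAX (BRW_CONDITIONAL_GE) of the two sources.
 */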
717 vec4_instruction *
718 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
719 src_reg src0, src_reg src1)
720 {
721 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
722 inst->conditional_mod = conditionalmod;
723 return inst;
724 }
725
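/**
 * Emit code for lrp(x, y, a), i.e. x*(1-a) + y*a, using the LRP instruction
 * on Gen6+ or an equivalent MUL/ADD sequence on older hardware.
 */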
726 vec4_instruction *
727 vec4_visitor::emit_lrp(const dst_reg &dst,
728 const src_reg &x, const src_reg &y, const src_reg &a)
729 {
730 if (devinfo->gen >= 6) {
731 /* Note that the instruction's argument order is reversed from GLSL
732 * and the IR.
733 */
734 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
735 fix_3src_operand(x)));
736 } else {
737 /* Earlier generations don't support three source operations, so we
738 * need to emit x*(1-a) + y*a.
739 */
740 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
741 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
742 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
743 y_times_a.writemask = dst.writemask;
744 one_minus_a.writemask = dst.writemask;
745 x_times_one_minus_a.writemask = dst.writemask;
746
747 emit(MUL(y_times_a, y, a));
748 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
749 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
750 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
751 }
752 }
753
754 /**
755 * Emits the instructions needed to perform a pull constant load. before_block
756 * and before_inst can be NULL, in which case the instruction will be appended
757 * to the end of the instruction list.
758 */
759 void
760 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
761 src_reg surf_index,
762 src_reg offset_reg,
763 bblock_t *before_block,
764 vec4_instruction *before_inst)
765 {
766 assert((before_inst == NULL && before_block == NULL) ||
767 (before_inst && before_block));
768
769 vec4_instruction *pull;
770
771 if (devinfo->gen >= 9) {
772 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
773 src_reg header(this, glsl_type::uvec4_type, 2);
774
775 pull = new(mem_ctx)
776 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
777 dst_reg(header));
778
779 if (before_inst)
780 emit_before(before_block, before_inst, pull);
781 else
782 emit(pull);
783
784 dst_reg index_reg = retype(offset(dst_reg(header), 1),
785 offset_reg.type);
786 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
787
788 if (before_inst)
789 emit_before(before_block, before_inst, pull);
790 else
791 emit(pull);
792
793 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
794 dst,
795 surf_index,
796 header);
797 pull->mlen = 2;
798 pull->header_size = 1;
799 } else if (devinfo->gen >= 7) {
800 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
801
802 grf_offset.type = offset_reg.type;
803
804 pull = MOV(grf_offset, offset_reg);
805
806 if (before_inst)
807 emit_before(before_block, before_inst, pull);
808 else
809 emit(pull);
810
811 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
812 dst,
813 surf_index,
814 src_reg(grf_offset));
815 pull->mlen = 1;
816 } else {
817 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
818 dst,
819 surf_index,
820 offset_reg);
821 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
822 pull->mlen = 1;
823 }
824
825 if (before_inst)
826 emit_before(before_block, before_inst, pull);
827 else
828 emit(pull);
829 }
830
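/**
 * Emit FIND_LIVE_CHANNEL and BROADCAST to copy the value of the first live
 * channel of \p src into all channels of the result, producing a value that
 * can be treated as uniform by later code.
 */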
831 src_reg
832 vec4_visitor::emit_uniformize(const src_reg &src)
833 {
834 const src_reg chan_index(this, glsl_type::uint_type);
835 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
836 src.type);
837
838 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
839 ->force_writemask_all = true;
840 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
841 ->force_writemask_all = true;
842
843 return src_reg(dst);
844 }
845
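/**
 * Emit a TXF_MCS message to fetch the multisample control surface (MCS)
 * data for the given coordinate, for use by a subsequent compressed
 * multisample texel fetch.
 */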
846 src_reg
847 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
848 src_reg coordinate, src_reg surface)
849 {
850 vec4_instruction *inst =
851 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
852 dst_reg(this, glsl_type::uvec4_type));
853 inst->base_mrf = 2;
854 inst->src[1] = surface;
855 inst->src[2] = surface;
856
857 int param_base;
858
859 if (devinfo->gen >= 9) {
860 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
861 vec4_instruction *header_inst = new(mem_ctx)
862 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
863 dst_reg(MRF, inst->base_mrf));
864
865 emit(header_inst);
866
867 inst->mlen = 2;
868 inst->header_size = 1;
869 param_base = inst->base_mrf + 1;
870 } else {
871 inst->mlen = 1;
872 param_base = inst->base_mrf;
873 }
874
875 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
876 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
877 int zero_mask = 0xf & ~coord_mask;
878
879 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
880 coordinate));
881
882 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
883 brw_imm_d(0)));
884
885 emit(inst);
886 return src_reg(inst->dst);
887 }
888
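/**
 * Check whether the sampler index is non-constant or too large to fit in
 * the 4-bit sampler field of the message descriptor, in which case a
 * message header is required.
 */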
889 bool
890 vec4_visitor::is_high_sampler(src_reg sampler)
891 {
892 if (devinfo->gen < 8 && !devinfo->is_haswell)
893 return false;
894
895 return sampler.file != IMM || sampler.ud >= 16;
896 }
897
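/**
 * Emit a vec4 texturing message for the given ir_texture_opcode, loading
 * the required parameters (coordinate, LOD, shadow comparitor, etc.) into
 * MRFs starting at base_mrf.
 */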
898 void
899 vec4_visitor::emit_texture(ir_texture_opcode op,
900 dst_reg dest,
901 const glsl_type *dest_type,
902 src_reg coordinate,
903 int coord_components,
904 src_reg shadow_comparitor,
905 src_reg lod, src_reg lod2,
906 src_reg sample_index,
907 uint32_t constant_offset,
908 src_reg offset_value,
909 src_reg mcs,
910 uint32_t surface,
911 src_reg surface_reg,
912 uint32_t sampler,
913 src_reg sampler_reg)
914 {
915 /* The sampler can only meaningfully compute LOD for fragment shader
916 * messages. For all other stages, we change the opcode to TXL and hardcode
917 * the LOD to 0.
918 *
919 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
920 * valid LOD argument.
921 */
922 if (op == ir_tex || op == ir_query_levels) {
923 assert(lod.file == BAD_FILE);
924 lod = brw_imm_f(0.0f);
925 }
926
927 enum opcode opcode;
928 switch (op) {
929 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
930 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
932 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
933 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
934 SHADER_OPCODE_TXF_CMS); break;
935 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
936 case ir_tg4: opcode = offset_value.file != BAD_FILE
937 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
938 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
939 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
940 case ir_txb:
941 unreachable("TXB is not valid for vertex shaders.");
942 case ir_lod:
943 unreachable("LOD is not valid for vertex shaders.");
944 case ir_samples_identical: {
945 /* There are some challenges implementing this for vec4, and it seems
946 * unlikely to be used anyway. For now, just always return false.
947 */
948 emit(MOV(dest, brw_imm_ud(0u)));
949 return;
950 }
951 default:
952 unreachable("Unrecognized tex op");
953 }
954
955 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
956
957 inst->offset = constant_offset;
958
959 /* The message header is necessary for:
960 * - Gen4 (always)
961 * - Gen9+ for selecting SIMD4x2
962 * - Texel offsets
963 * - Gather channel selection
964 * - Sampler indices too large to fit in a 4-bit value.
965 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
966 */
967 inst->header_size =
968 (devinfo->gen < 5 || devinfo->gen >= 9 ||
969 inst->offset != 0 || op == ir_tg4 ||
970 op == ir_texture_samples ||
971 is_high_sampler(sampler_reg)) ? 1 : 0;
972 inst->base_mrf = 2;
973 inst->mlen = inst->header_size;
974 inst->dst.writemask = WRITEMASK_XYZW;
975 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
976
977 inst->src[1] = surface_reg;
978 inst->src[2] = sampler_reg;
979
980 /* MRF for the first parameter */
981 int param_base = inst->base_mrf + inst->header_size;
982
983 if (op == ir_txs || op == ir_query_levels) {
984 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
985 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
986 inst->mlen++;
987 } else if (op == ir_texture_samples) {
988 inst->dst.writemask = WRITEMASK_X;
989 } else {
990 /* Load the coordinate */
991 /* FINISHME: gl_clamp_mask and saturate */
992 int coord_mask = (1 << coord_components) - 1;
993 int zero_mask = 0xf & ~coord_mask;
994
995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
996 coordinate));
997 inst->mlen++;
998
999 if (zero_mask != 0) {
1000 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001 brw_imm_d(0)));
1002 }
1003 /* Load the shadow comparitor */
1004 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
1006 WRITEMASK_X),
1007 shadow_comparitor));
1008 inst->mlen++;
1009 }
1010
1011 /* Load the LOD info */
1012 if (op == ir_tex || op == ir_txl) {
1013 int mrf, writemask;
1014 if (devinfo->gen >= 5) {
1015 mrf = param_base + 1;
1016 if (shadow_comparitor.file != BAD_FILE) {
1017 writemask = WRITEMASK_Y;
1018 /* mlen already incremented */
1019 } else {
1020 writemask = WRITEMASK_X;
1021 inst->mlen++;
1022 }
1023 } else /* devinfo->gen == 4 */ {
1024 mrf = param_base;
1025 writemask = WRITEMASK_W;
1026 }
1027 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028 } else if (op == ir_txf) {
1029 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030 } else if (op == ir_txf_ms) {
1031 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032 sample_index));
1033 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034 * MCS data is stored in the first two channels of 'mcs', but we
1035 * need to get it into the .y and .z channels of the second vec4
1036 * of params.
1037 */
1038 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039 emit(MOV(dst_reg(MRF, param_base + 1,
1040 glsl_type::uint_type, WRITEMASK_YZ),
1041 mcs));
1042 } else if (devinfo->gen >= 7) {
1043 /* MCS data is in the first channel of `mcs`, but we need to get it into
1044 * the .y channel of the second vec4 of params, so replicate .x across
1045 * the whole vec4 and then mask off everything except .y
1046 */
1047 mcs.swizzle = BRW_SWIZZLE_XXXX;
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049 mcs));
1050 }
1051 inst->mlen++;
1052 } else if (op == ir_txd) {
1053 const brw_reg_type type = lod.type;
1054
1055 if (devinfo->gen >= 5) {
1056 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060 inst->mlen++;
1061
1062 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1063 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067 inst->mlen++;
1068
1069 if (shadow_comparitor.file != BAD_FILE) {
1070 emit(MOV(dst_reg(MRF, param_base + 2,
1071 shadow_comparitor.type, WRITEMASK_Z),
1072 shadow_comparitor));
1073 }
1074 }
1075 } else /* devinfo->gen == 4 */ {
1076 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078 inst->mlen += 2;
1079 }
1080 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081 if (shadow_comparitor.file != BAD_FILE) {
1082 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1083 shadow_comparitor));
1084 }
1085
1086 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087 offset_value));
1088 inst->mlen++;
1089 }
1090 }
1091
1092 emit(inst);
1093
1094 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095 * spec requires layers.
1096 */
1097 if (op == ir_txs && devinfo->gen < 7) {
1098 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1099 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1100 src_reg(inst->dst), brw_imm_d(1));
1101 }
1102
1103 if (devinfo->gen == 6 && op == ir_tg4) {
1104 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1105 }
1106
1107 if (op == ir_query_levels) {
1108 /* # levels is in .w */
1109 src_reg swizzled(dest);
1110 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1111 SWIZZLE_W, SWIZZLE_W);
1112 emit(MOV(dest, swizzled));
1113 }
1114 }
1115
1116 /**
1117 * Apply workarounds for Gen6 gather with UINT/SINT
1118 */
1119 void
1120 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1121 {
1122 if (!wa)
1123 return;
1124
1125 int width = (wa & WA_8BIT) ? 8 : 16;
1126 dst_reg dst_f = dst;
1127 dst_f.type = BRW_REGISTER_TYPE_F;
1128
1129 /* Convert from UNORM to UINT */
1130 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1131 emit(MOV(dst, src_reg(dst_f)));
1132
1133 if (wa & WA_SIGN) {
1134 /* Reinterpret the UINT value as a signed INT value by
1135 * shifting the sign bit into place, then shifting back
1136 * preserving sign.
1137 */
1138 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1139 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1140 }
1141 }
1142
1143 void
1144 vec4_visitor::gs_emit_vertex(int stream_id)
1145 {
1146 unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::gs_end_primitive()
1151 {
1152 unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::emit_ndc_computation()
1157 {
1158 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1159 return;
1160
1161 /* Get the position */
1162 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1163
1164 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1165 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1166 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1167
1168 current_annotation = "NDC";
1169 dst_reg ndc_w = ndc;
1170 ndc_w.writemask = WRITEMASK_W;
1171 src_reg pos_w = pos;
1172 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1173 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1174
1175 dst_reg ndc_xyz = ndc;
1176 ndc_xyz.writemask = WRITEMASK_XYZ;
1177
1178 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1179 }
1180
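/**
 * Write the point size, clip-distance flags and (on Gen6+) layer/viewport
 * indices into the VUE header slot expected by the fixed-function pipeline.
 */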
1181 void
1182 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1183 {
1184 if (devinfo->gen < 6 &&
1185 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1186 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1187 devinfo->has_negative_rhw_bug)) {
1188 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1189 dst_reg header1_w = header1;
1190 header1_w.writemask = WRITEMASK_W;
1191
1192 emit(MOV(header1, brw_imm_ud(0u)));
1193
1194 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1195 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1196
1197 current_annotation = "Point size";
1198 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1199 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1200 }
1201
1202 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1203 current_annotation = "Clipping flags";
1204 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1205 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1206
1207 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1209 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1210
1211 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1212 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1213 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1214 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1215 }
1216
1217 /* i965 clipping workaround:
1218 * 1) Test for -ve rhw
1219 * 2) If set,
1220 * set ndc = (0,0,0,0)
1221 * set ucp[6] = 1
1222 *
1223 * Later, clipping will detect ucp[6] and ensure the primitive is
1224 * clipped against all fixed planes.
1225 */
1226 if (devinfo->has_negative_rhw_bug &&
1227 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1228 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1229 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1230 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1231 vec4_instruction *inst;
1232 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1233 inst->predicate = BRW_PREDICATE_NORMAL;
1234 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1235 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1236 inst->predicate = BRW_PREDICATE_NORMAL;
1237 }
1238
1239 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1240 } else if (devinfo->gen < 6) {
1241 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1242 } else {
1243 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1244 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1245 dst_reg reg_w = reg;
1246 reg_w.writemask = WRITEMASK_W;
1247 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1248 reg_as_src.type = reg_w.type;
1249 reg_as_src.swizzle = brw_swizzle_for_size(1);
1250 emit(MOV(reg_w, reg_as_src));
1251 }
1252 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1253 dst_reg reg_y = reg;
1254 reg_y.writemask = WRITEMASK_Y;
1255 reg_y.type = BRW_REGISTER_TYPE_D;
1256 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1257 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1258 }
1259 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1260 dst_reg reg_z = reg;
1261 reg_z.writemask = WRITEMASK_Z;
1262 reg_z.type = BRW_REGISTER_TYPE_D;
1263 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1264 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1265 }
1266 }
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1271 {
1272 assert(varying < VARYING_SLOT_MAX);
1273 assert(output_reg[varying].type == reg.type);
1274 current_annotation = output_reg_annotation[varying];
1275 if (output_reg[varying].file != BAD_FILE) {
1276 return emit(MOV(reg, src_reg(output_reg[varying])));
1277 } else
1278 return NULL;
1279 }
1280
1281 void
1282 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1283 {
1284 assert(varying < VARYING_SLOT_MAX);
1285 assert(varying >= VARYING_SLOT_VAR0);
1286 varying = varying - VARYING_SLOT_VAR0;
1287
1288 unsigned num_comps = output_generic_num_components[varying][component];
1289 if (num_comps == 0)
1290 return;
1291
1292 assert(output_generic_reg[varying][component].type == reg.type);
1293 current_annotation = output_reg_annotation[varying];
1294 if (output_generic_reg[varying][component].file != BAD_FILE) {
1295 src_reg src = src_reg(output_generic_reg[varying][component]);
1296 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1297 reg.writemask =
1298 brw_writemask_for_component_packing(num_comps, component);
1299 emit(MOV(reg, src));
1300 }
1301 }
1302
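/**
 * Emit the MOV(s) that load one VUE slot's worth of output data into the
 * MRF that will be sent in the URB write.
 */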
1303 void
1304 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1305 {
1306 reg.type = BRW_REGISTER_TYPE_F;
1307 output_reg[varying].type = reg.type;
1308
1309 switch (varying) {
1310 case VARYING_SLOT_PSIZ:
1311 {
1312 /* PSIZ is always in slot 0, and is coupled with other flags. */
1313 current_annotation = "indices, point width, clip flags";
1314 emit_psiz_and_flags(reg);
1315 break;
1316 }
1317 case BRW_VARYING_SLOT_NDC:
1318 current_annotation = "NDC";
1319 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1320 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1321 break;
1322 case VARYING_SLOT_POS:
1323 current_annotation = "gl_Position";
1324 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1325 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1326 break;
1327 case VARYING_SLOT_EDGE:
1328 /* This is present when doing unfilled polygons. We're supposed to copy
1329 * the edge flag from the user-provided vertex array
1330 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1331 * of that attribute (starts as 1.0f). This is then used in clipping to
1332 * determine which edges should be drawn as wireframe.
1333 */
1334 current_annotation = "edge flag";
1335 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1336 glsl_type::float_type, WRITEMASK_XYZW))));
1337 break;
1338 case BRW_VARYING_SLOT_PAD:
1339 /* No need to write to this slot */
1340 break;
1341 default:
1342 if (varying >= VARYING_SLOT_VAR0) {
1343 for (int i = 0; i < 4; i++) {
1344 emit_generic_urb_slot(reg, varying, i);
1345 }
1346 } else {
1347 emit_generic_urb_slot(reg, varying);
1348 }
1349 break;
1350 }
1351 }
1352
1353 static int
1354 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1355 {
1356 if (devinfo->gen >= 6) {
1357 /* URB data written (does not include the message header reg) must
1358 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1359 * section 5.4.3.2.2: URB_INTERLEAVED.
1360 *
1361 * URB entries are allocated on a multiple of 1024 bits, so an
1362 * extra 128 bits written here to make the end align to 256 is
1363 * no problem.
1364 */
1365 if ((mlen % 2) != 1)
1366 mlen++;
1367 }
1368
1369 return mlen;
1370 }
1371
1372
1373 /**
1374 * Generates the VUE payload plus the necessary URB write instructions to
1375 * output it.
1376 *
1377 * The VUE layout is documented in Volume 2a.
1378 */
1379 void
1380 vec4_visitor::emit_vertex()
1381 {
1382 /* MRF 0 is reserved for the debugger, so start with message header
1383 * in MRF 1.
1384 */
1385 int base_mrf = 1;
1386 int mrf = base_mrf;
1387 /* In the process of generating our URB write message contents, we
1388 * may need to unspill a register or load from an array. Those
1389 * reads would use MRFs 14-15.
1390 */
1391 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1392
1393 /* The following assertion verifies that max_usable_mrf causes an
1394 * even-numbered amount of URB write data, which will meet gen6's
1395 * requirements for length alignment.
1396 */
1397 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1398
1399 /* First mrf is the g0-based message header containing URB handles and
1400 * such.
1401 */
1402 emit_urb_write_header(mrf++);
1403
1404 if (devinfo->gen < 6) {
1405 emit_ndc_computation();
1406 }
1407
1408 /* We may need to split this up into several URB writes, so do them in a
1409 * loop.
1410 */
1411 int slot = 0;
1412 bool complete = false;
1413 do {
1414 /* URB offset is in URB row increments, and each of our MRFs is half of
1415 * one of those, since we're doing interleaved writes.
1416 */
1417 int offset = slot / 2;
1418
1419 mrf = base_mrf + 1;
1420 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1421 emit_urb_slot(dst_reg(MRF, mrf++),
1422 prog_data->vue_map.slot_to_varying[slot]);
1423
1424 /* If this was max_usable_mrf, we can't fit anything more into this
1425 * URB WRITE. Same thing if we reached the maximum length available.
1426 */
1427 if (mrf > max_usable_mrf ||
1428 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1429 slot++;
1430 break;
1431 }
1432 }
1433
1434 complete = slot >= prog_data->vue_map.num_slots;
1435 current_annotation = "URB write";
1436 vec4_instruction *inst = emit_urb_write_opcode(complete);
1437 inst->base_mrf = base_mrf;
1438 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1439 inst->offset += offset;
1440 } while(!complete);
1441 }
1442
1443
1444 src_reg
1445 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1446 src_reg *reladdr, int reg_offset)
1447 {
1448 /* Because we store the values to scratch interleaved like our
1449 * vertex data, we need to scale the vec4 index by 2.
1450 */
1451 int message_header_scale = 2;
1452
1453 /* Pre-gen6, the message header uses byte offsets instead of vec4
1454 * (16-byte) offset units.
1455 */
1456 if (devinfo->gen < 6)
1457 message_header_scale *= 16;
1458
1459 if (reladdr) {
1460 src_reg index = src_reg(this, glsl_type::int_type);
1461
1462 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1463 brw_imm_d(reg_offset)));
1464 emit_before(block, inst, MUL(dst_reg(index), index,
1465 brw_imm_d(message_header_scale)));
1466
1467 return index;
1468 } else {
1469 return brw_imm_d(reg_offset * message_header_scale);
1470 }
1471 }
1472
1473 /**
1474 * Emits an instruction before @inst to load the value named by @orig_src
1475 * from scratch space at @base_offset to @temp.
1476 *
1477 * @base_offset is measured in 32-byte units (the size of a register).
1478 */
1479 void
1480 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1481 dst_reg temp, src_reg orig_src,
1482 int base_offset)
1483 {
1484 int reg_offset = base_offset + orig_src.reg_offset;
1485 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1486 reg_offset);
1487
1488 emit_before(block, inst, SCRATCH_READ(temp, index));
1489 }
1490
1491 /**
1492 * Emits an instruction after @inst to store the value to be written
1493 * to @orig_dst to scratch space at @base_offset, from @temp.
1494 *
1495 * @base_offset is measured in 32-byte units (the size of a register).
1496 */
1497 void
1498 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1499 int base_offset)
1500 {
1501 int reg_offset = base_offset + inst->dst.reg_offset;
1502 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1503 reg_offset);
1504
1505 /* Create a temporary register to store *inst's result in.
1506 *
1507 * We have to be careful in MOVing from our temporary result register in
1508 * the scratch write. If we swizzle from channels of the temporary that
1509 * weren't initialized, it will confuse live interval analysis, which will
1510 * make spilling fail to make progress.
1511 */
1512 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1513 inst->dst.type),
1514 brw_swizzle_for_mask(inst->dst.writemask));
1515 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1516 inst->dst.writemask));
1517 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1518 if (inst->opcode != BRW_OPCODE_SEL)
1519 write->predicate = inst->predicate;
1520 write->ir = inst->ir;
1521 write->annotation = inst->annotation;
1522 inst->insert_after(block, write);
1523
1524 inst->dst.file = temp.file;
1525 inst->dst.nr = temp.nr;
1526 inst->dst.reg_offset = temp.reg_offset;
1527 inst->dst.reladdr = NULL;
1528 }
1529
1530 /**
1531 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1532 * adds the scratch read(s) before \p inst. The function also checks for
1533 * recursive reladdr scratch accesses, issuing the corresponding scratch
1534 * loads and rewriting reladdr references accordingly.
1535 *
1536 * \return \p src if it did not require a scratch load, otherwise, the
1537 * register holding the result of the scratch load that the caller should
1538 * use to rewrite src.
1539 */
1540 src_reg
1541 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1542 vec4_instruction *inst, src_reg src)
1543 {
1544 /* Resolve recursive reladdr scratch access by calling ourselves
1545 * with src.reladdr
1546 */
1547 if (src.reladdr)
1548 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1549 *src.reladdr);
1550
1551 /* Now handle scratch access on src */
1552 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1553 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1554 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1555 src.nr = temp.nr;
1556 src.reg_offset = temp.reg_offset;
1557 src.reladdr = NULL;
1558 }
1559
1560 return src;
1561 }
1562
1563 /**
1564 * We can't generally support array access in GRF space, because a
1565 * single instruction's destination can only span 2 contiguous
1566 * registers. So, we send all GRF arrays that get variable index
1567 * access to scratch space.
1568 */
1569 void
1570 vec4_visitor::move_grf_array_access_to_scratch()
1571 {
1572 int scratch_loc[this->alloc.count];
1573 memset(scratch_loc, -1, sizeof(scratch_loc));
1574
1575 /* First, calculate the set of virtual GRFs that need to be punted
1576 * to scratch due to having any array access on them, and where in
1577 * scratch.
1578 */
1579 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1580 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1581 if (scratch_loc[inst->dst.nr] == -1) {
1582 scratch_loc[inst->dst.nr] = last_scratch;
1583 last_scratch += this->alloc.sizes[inst->dst.nr];
1584 }
1585
1586 for (src_reg *iter = inst->dst.reladdr;
1587 iter->reladdr;
1588 iter = iter->reladdr) {
1589 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1590 scratch_loc[iter->nr] = last_scratch;
1591 last_scratch += this->alloc.sizes[iter->nr];
1592 }
1593 }
1594 }
1595
1596 for (int i = 0 ; i < 3; i++) {
1597 for (src_reg *iter = &inst->src[i];
1598 iter->reladdr;
1599 iter = iter->reladdr) {
1600 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1601 scratch_loc[iter->nr] = last_scratch;
1602 last_scratch += this->alloc.sizes[iter->nr];
1603 }
1604 }
1605 }
1606 }
1607
1608 /* Now, for anything that will be accessed through scratch, rewrite
1609 * it to load/store. Note that this is a _safe list walk, because
1610 * we may generate a new scratch_write instruction after the one
1611 * we're processing.
1612 */
1613 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1614 /* Set up the annotation tracking for new generated instructions. */
1615 base_ir = inst->ir;
1616 current_annotation = inst->annotation;
1617
1618 /* First handle scratch access on the dst. Notice we have to handle
1619 * the case where the dst's reladdr also points to scratch space.
1620 */
1621 if (inst->dst.reladdr)
1622 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1623 *inst->dst.reladdr);
1624
1625 /* Now that we have handled any (possibly recursive) reladdr scratch
1626 * accesses for dst we can safely do the scratch write for dst itself
1627 */
1628 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1629 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1630
1631 /* Now handle scratch access on any src. In this case, since inst->src[i]
1632 * already is a src_reg, we can just call emit_resolve_reladdr with
1633 * inst->src[i] and it will take care of handling scratch loads for
1634 * both src and src.reladdr (recursively).
1635 */
1636 for (int i = 0 ; i < 3; i++) {
1637 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1638 inst->src[i]);
1639 }
1640 }
1641 }
1642
1643 /**
1644 * Emits an instruction before @inst to load the value named by @orig_src
1645 * from the pull constant buffer (surface) at @base_offset to @temp.
1646 */
1647 void
1648 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1649 dst_reg temp, src_reg orig_src,
1650 int base_offset, src_reg indirect)
1651 {
1652 int reg_offset = base_offset + orig_src.reg_offset;
1653 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1654
1655 src_reg offset;
1656 if (indirect.file != BAD_FILE) {
1657 offset = src_reg(this, glsl_type::uint_type);
1658
1659 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1660 brw_imm_ud(reg_offset * 16)));
1661 } else if (devinfo->gen >= 8) {
1662 /* Store the offset in a GRF so we can send-from-GRF. */
1663 offset = src_reg(this, glsl_type::uint_type);
1664 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1665 } else {
1666 offset = brw_imm_d(reg_offset * 16);
1667 }
1668
1669 emit_pull_constant_load_reg(temp,
1670 brw_imm_ud(index),
1671 offset,
1672 block, inst);
1673
1674 brw_mark_surface_used(&prog_data->base, index);
1675 }
1676
1677 /**
1678 * Implements array access of uniforms by inserting a
1679 * PULL_CONSTANT_LOAD instruction.
1680 *
1681 * Unlike temporary GRF array access (where we don't support it due to
1682 * the difficulty of doing relative addressing on instruction
1683 * destinations), we could potentially do array access of uniforms
1684 * that were loaded in GRF space as push constants. In real-world
1685 * usage we've seen, though, the arrays being used are always larger
1686 * than we could load as push constants, so just always move all
1687 * uniform array access out to a pull constant buffer.
1688 */
1689 void
1690 vec4_visitor::move_uniform_array_access_to_pull_constants()
1691 {
692 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1693 * everything has to be pushed regardless.
1694 */
1695 if (stage_prog_data->pull_param == NULL) {
1696 split_uniform_registers();
1697 return;
1698 }
1699
1700 int pull_constant_loc[this->uniforms];
1701 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1702
1703 /* First, walk through the instructions and determine which things need to
1704 * be pulled. We mark something as needing to be pulled by setting
1705 * pull_constant_loc to 0.
1706 */
1707 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1708 /* We only care about MOV_INDIRECT of a uniform */
1709 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1710 inst->src[0].file != UNIFORM)
1711 continue;
1712
1713 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1714
1715 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1716 pull_constant_loc[uniform_nr + j] = 0;
1717 }
1718
1719 /* Next, we walk the list of uniforms and assign real pull constant
1720 * locations and set their corresponding entries in pull_param.
1721 */
1722 for (int j = 0; j < this->uniforms; j++) {
1723 if (pull_constant_loc[j] < 0)
1724 continue;
1725
1726 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1727
1728 for (int i = 0; i < 4; i++) {
1729 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1730 = stage_prog_data->param[j * 4 + i];
1731 }
1732 }
1733
1734 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1735 * instructions to actual uniform pulls.
1736 */
1737 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1738 /* We only care about MOV_INDIRECT of a uniform */
1739 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1740 inst->src[0].file != UNIFORM)
1741 continue;
1742
1743 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1744
1745 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1746
1747 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1748 pull_constant_loc[uniform_nr], inst->src[1]);
1749 inst->remove(block);
1750 }
1751
1752 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1753 * no need to track them as larger-than-vec4 objects. This will be
1754 * relied on in cutting out unused uniform vectors from push
1755 * constants.
1756 */
1757 split_uniform_registers();
1758 }
1759
1760 void
1761 vec4_visitor::resolve_ud_negate(src_reg *reg)
1762 {
1763 if (reg->type != BRW_REGISTER_TYPE_UD ||
1764 !reg->negate)
1765 return;
1766
1767 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1768 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1769 *reg = temp;
1770 }
1771
1772 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1773 void *log_data,
1774 const struct brw_sampler_prog_key_data *key_tex,
1775 struct brw_vue_prog_data *prog_data,
1776 const nir_shader *shader,
1777 void *mem_ctx,
1778 bool no_spills,
1779 int shader_time_index)
1780 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1781 key_tex(key_tex),
1782 prog_data(prog_data),
1783 fail_msg(NULL),
1784 first_non_payload_grf(0),
1785 need_all_constants_in_pull_buffer(false),
1786 no_spills(no_spills),
1787 shader_time_index(shader_time_index),
1788 last_scratch(0)
1789 {
1790 this->failed = false;
1791
1792 this->base_ir = NULL;
1793 this->current_annotation = NULL;
1794 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1795
1796 memset(this->output_generic_num_components, 0,
1797 sizeof(this->output_generic_num_components));
1798
1799 this->virtual_grf_start = NULL;
1800 this->virtual_grf_end = NULL;
1801 this->live_intervals = NULL;
1802
1803 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1804
1805 this->uniforms = 0;
1806 }
1807
1808 vec4_visitor::~vec4_visitor()
1809 {
1810 }
1811
1812
1813 void
1814 vec4_visitor::fail(const char *format, ...)
1815 {
1816 va_list va;
1817 char *msg;
1818
1819 if (failed)
1820 return;
1821
1822 failed = true;
1823
1824 va_start(va, format);
1825 msg = ralloc_vasprintf(mem_ctx, format, va);
1826 va_end(va);
1827 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1828
1829 this->fail_msg = msg;
1830
1831 if (debug_enabled) {
1832 fprintf(stderr, "%s", msg);
1833 }
1834 }
1835
1836 } /* namespace brw */