i965: Make emit_minmax return an instruction*.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_VEC4_BUILDER_H
26 #define BRW_VEC4_BUILDER_H
27
28 #include "brw_ir_vec4.h"
29 #include "brw_ir_allocator.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble a VEC4 IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::fs_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class vec4_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef brw::src_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef brw::dst_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef vec4_instruction instruction;
51
52 /**
53 * Construct a vec4_builder that inserts instructions into \p shader.
54 */
55 vec4_builder(backend_shader *shader) :
56 shader(shader), block(NULL), cursor(NULL),
57 force_writemask_all(false),
58 annotation()
59 {
60 }
61
62 /**
63 * Construct a vec4_builder that inserts instructions into \p shader
64 * before instruction \p inst in basic block \p block. The default
65 * execution controls and debug annotation are initialized from the
66 * instruction passed as argument.
67 */
68 vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
69 shader(shader), block(block), cursor(inst),
70 force_writemask_all(inst->force_writemask_all)
71 {
72 annotation.str = inst->annotation;
73 annotation.ir = inst->ir;
74 }
75
76 /**
77 * Construct a vec4_builder that inserts instructions before \p cursor
78 * in basic block \p block, inheriting other code generation parameters
79 * from this.
80 */
81 vec4_builder
82 at(bblock_t *block, exec_node *cursor) const
83 {
84 vec4_builder bld = *this;
85 bld.block = block;
86 bld.cursor = cursor;
87 return bld;
88 }
89
90 /**
91 * Construct a vec4_builder appending instructions at the end of the
92 * instruction list of the shader, inheriting other code generation
93 * parameters from this.
94 */
95 vec4_builder
96 at_end() const
97 {
98 return at(NULL, (exec_node *)&shader->instructions.tail);
99 }
100
101 /**
102 * Construct a builder with per-channel control flow execution masking
103 * disabled if \p b is true. If control flow execution masking is
104 * already disabled this has no effect.
105 */
106 vec4_builder
107 exec_all(bool b = true) const
108 {
109 vec4_builder bld = *this;
110 if (b)
111 bld.force_writemask_all = true;
112 return bld;
113 }
114
115 /**
116 * Construct a builder with the given debug annotation info.
117 */
118 vec4_builder
119 annotate(const char *str, const void *ir = NULL) const
120 {
121 vec4_builder bld = *this;
122 bld.annotation.str = str;
123 bld.annotation.ir = ir;
124 return bld;
125 }
126
127 /**
128 * Get the SIMD width in use.
129 */
130 unsigned
131 dispatch_width() const
132 {
133 return 8;
134 }
135
136 /**
137 * Allocate a virtual register of natural vector size (four for this IR)
138 * and SIMD width. \p n gives the amount of space to allocate in
139 * dispatch_width units (which is just enough space for four logical
140 * components in this IR).
141 */
142 dst_reg
143 vgrf(enum brw_reg_type type, unsigned n = 1) const
144 {
145 assert(dispatch_width() <= 32);
146
147 if (n > 0)
148 return retype(dst_reg(VGRF, shader->alloc.allocate(
149 n * DIV_ROUND_UP(type_sz(type), 4))),
150 type);
151 else
152 return retype(null_reg_ud(), type);
153 }
154
155 /**
156 * Create a null register of floating type.
157 */
158 dst_reg
159 null_reg_f() const
160 {
161 return dst_reg(retype(brw_null_vec(dispatch_width()),
162 BRW_REGISTER_TYPE_F));
163 }
164
165 /**
166 * Create a null register of signed integer type.
167 */
168 dst_reg
169 null_reg_d() const
170 {
171 return dst_reg(retype(brw_null_vec(dispatch_width()),
172 BRW_REGISTER_TYPE_D));
173 }
174
175 /**
176 * Create a null register of unsigned integer type.
177 */
178 dst_reg
179 null_reg_ud() const
180 {
181 return dst_reg(retype(brw_null_vec(dispatch_width()),
182 BRW_REGISTER_TYPE_UD));
183 }
184
185 /**
186 * Insert an instruction into the program.
187 */
188 instruction *
189 emit(const instruction &inst) const
190 {
191 return emit(new(shader->mem_ctx) instruction(inst));
192 }
193
194 /**
195 * Create and insert a nullary control instruction into the program.
196 */
197 instruction *
198 emit(enum opcode opcode) const
199 {
200 return emit(instruction(opcode));
201 }
202
203 /**
204 * Create and insert a nullary instruction into the program.
205 */
206 instruction *
207 emit(enum opcode opcode, const dst_reg &dst) const
208 {
209 return emit(instruction(opcode, dst));
210 }
211
212 /**
213 * Create and insert a unary instruction into the program.
214 */
215 instruction *
216 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
217 {
218 switch (opcode) {
219 case SHADER_OPCODE_RCP:
220 case SHADER_OPCODE_RSQ:
221 case SHADER_OPCODE_SQRT:
222 case SHADER_OPCODE_EXP2:
223 case SHADER_OPCODE_LOG2:
224 case SHADER_OPCODE_SIN:
225 case SHADER_OPCODE_COS:
226 return fix_math_instruction(
227 emit(instruction(opcode, dst,
228 fix_math_operand(src0))));
229
230 default:
231 return emit(instruction(opcode, dst, src0));
232 }
233 }
234
235 /**
236 * Create and insert a binary instruction into the program.
237 */
238 instruction *
239 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
240 const src_reg &src1) const
241 {
242 switch (opcode) {
243 case SHADER_OPCODE_POW:
244 case SHADER_OPCODE_INT_QUOTIENT:
245 case SHADER_OPCODE_INT_REMAINDER:
246 return fix_math_instruction(
247 emit(instruction(opcode, dst,
248 fix_math_operand(src0),
249 fix_math_operand(src1))));
250
251 default:
252 return emit(instruction(opcode, dst, src0, src1));
253 }
254 }
255
256 /**
257 * Create and insert a ternary instruction into the program.
258 */
259 instruction *
260 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
261 const src_reg &src1, const src_reg &src2) const
262 {
263 switch (opcode) {
264 case BRW_OPCODE_BFE:
265 case BRW_OPCODE_BFI2:
266 case BRW_OPCODE_MAD:
267 case BRW_OPCODE_LRP:
268 return emit(instruction(opcode, dst,
269 fix_3src_operand(src0),
270 fix_3src_operand(src1),
271 fix_3src_operand(src2)));
272
273 default:
274 return emit(instruction(opcode, dst, src0, src1, src2));
275 }
276 }
277
278 /**
279 * Insert a preallocated instruction into the program.
280 */
281 instruction *
282 emit(instruction *inst) const
283 {
284 inst->force_writemask_all = force_writemask_all;
285 inst->annotation = annotation.str;
286 inst->ir = annotation.ir;
287
288 if (block)
289 static_cast<instruction *>(cursor)->insert_before(block, inst);
290 else
291 cursor->insert_before(inst);
292
293 return inst;
294 }
295
296 /**
297 * Select \p src0 if the comparison of both sources with the given
298 * conditional mod evaluates to true, otherwise select \p src1.
299 *
300 * Generally useful to get the minimum or maximum of two values.
301 */
302 instruction *
303 emit_minmax(const dst_reg &dst, const src_reg &src0,
304 const src_reg &src1, brw_conditional_mod mod) const
305 {
306 assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
307
308 return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
309 fix_unsigned_negate(src1)));
310 }
311
312 /**
313 * Copy any live channel from \p src to the first channel of the result.
314 */
315 src_reg
316 emit_uniformize(const src_reg &src) const
317 {
318 const vec4_builder ubld = exec_all();
319 const dst_reg chan_index =
320 writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
321 const dst_reg dst = vgrf(src.type);
322
323 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
324 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
325
326 return src_reg(dst);
327 }
328
329 /**
330 * Assorted arithmetic ops.
331 * @{
332 */
333 #define ALU1(op) \
334 instruction * \
335 op(const dst_reg &dst, const src_reg &src0) const \
336 { \
337 return emit(BRW_OPCODE_##op, dst, src0); \
338 }
339
340 #define ALU2(op) \
341 instruction * \
342 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
343 { \
344 return emit(BRW_OPCODE_##op, dst, src0, src1); \
345 }
346
347 #define ALU2_ACC(op) \
348 instruction * \
349 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
350 { \
351 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
352 inst->writes_accumulator = true; \
353 return inst; \
354 }
355
356 #define ALU3(op) \
357 instruction * \
358 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
359 const src_reg &src2) const \
360 { \
361 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
362 }
363
364 ALU2(ADD)
365 ALU2_ACC(ADDC)
366 ALU2(AND)
367 ALU2(ASR)
368 ALU2(AVG)
369 ALU3(BFE)
370 ALU2(BFI1)
371 ALU3(BFI2)
372 ALU1(BFREV)
373 ALU1(CBIT)
374 ALU2(CMPN)
375 ALU3(CSEL)
376 ALU2(DP2)
377 ALU2(DP3)
378 ALU2(DP4)
379 ALU2(DPH)
380 ALU1(F16TO32)
381 ALU1(F32TO16)
382 ALU1(FBH)
383 ALU1(FBL)
384 ALU1(FRC)
385 ALU2(LINE)
386 ALU1(LZD)
387 ALU2(MAC)
388 ALU2_ACC(MACH)
389 ALU3(MAD)
390 ALU1(MOV)
391 ALU2(MUL)
392 ALU1(NOT)
393 ALU2(OR)
394 ALU2(PLN)
395 ALU1(RNDD)
396 ALU1(RNDE)
397 ALU1(RNDU)
398 ALU1(RNDZ)
399 ALU2(SAD2)
400 ALU2_ACC(SADA2)
401 ALU2(SEL)
402 ALU2(SHL)
403 ALU2(SHR)
404 ALU2_ACC(SUBB)
405 ALU2(XOR)
406
407 #undef ALU3
408 #undef ALU2_ACC
409 #undef ALU2
410 #undef ALU1
411 /** @} */
412
413 /**
414 * CMP: Sets the low bit of the destination channels with the result
415 * of the comparison, while the upper bits are undefined, and updates
416 * the flag register with the packed 16 bits of the result.
417 */
418 instruction *
419 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
420 brw_conditional_mod condition) const
421 {
422 /* Take the instruction:
423 *
424 * CMP null<d> src0<f> src1<f>
425 *
426 * Original gen4 does type conversion to the destination type
427 * before comparison, producing garbage results for floating
428 * point comparisons.
429 *
430 * The destination type doesn't matter on newer generations,
431 * so we set the type to match src0 so we can compact the
432 * instruction.
433 */
434 return set_condmod(condition,
435 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
436 fix_unsigned_negate(src0),
437 fix_unsigned_negate(src1)));
438 }
439
440 /**
441 * Gen4 predicated IF.
442 */
443 instruction *
444 IF(brw_predicate predicate) const
445 {
446 return set_predicate(predicate, emit(BRW_OPCODE_IF));
447 }
448
449 /**
450 * Gen6 IF with embedded comparison.
451 */
452 instruction *
453 IF(const src_reg &src0, const src_reg &src1,
454 brw_conditional_mod condition) const
455 {
456 assert(shader->devinfo->gen == 6);
457 return set_condmod(condition,
458 emit(BRW_OPCODE_IF,
459 null_reg_d(),
460 fix_unsigned_negate(src0),
461 fix_unsigned_negate(src1)));
462 }
463
464 /**
465 * Emit a linear interpolation instruction.
466 */
467 instruction *
468 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
469 const src_reg &a) const
470 {
471 if (shader->devinfo->gen >= 6) {
472 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
473 * we need to reorder the operands.
474 */
475 return emit(BRW_OPCODE_LRP, dst, a, y, x);
476
477 } else {
478 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
479 const dst_reg y_times_a = vgrf(dst.type);
480 const dst_reg one_minus_a = vgrf(dst.type);
481 const dst_reg x_times_one_minus_a = vgrf(dst.type);
482
483 MUL(y_times_a, y, a);
484 ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
485 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
486 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
487 }
488 }
489
490 backend_shader *shader;
491
492 protected:
493 /**
494 * Workaround for negation of UD registers. See comment in
495 * fs_generator::generate_code() for the details.
496 */
497 src_reg
498 fix_unsigned_negate(const src_reg &src) const
499 {
500 if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
501 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
502 MOV(temp, src);
503 return src_reg(temp);
504 } else {
505 return src;
506 }
507 }
508
509 /**
510 * Workaround for register access modes not supported by the ternary
511 * instruction encoding.
512 */
513 src_reg
514 fix_3src_operand(const src_reg &src) const
515 {
516 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
517 * able to use vertical stride of zero to replicate the vec4 uniform, like
518 *
519 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
520 *
521 * But you can't, since vertical stride is always four in three-source
522 * instructions. Instead, insert a MOV instruction to do the replication so
523 * that the three-source instruction can consume it.
524 */
525
526 /* The MOV is only needed if the source is a uniform or immediate. */
527 if (src.file != UNIFORM && src.file != IMM)
528 return src;
529
530 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
531 return src;
532
533 const dst_reg expanded = vgrf(src.type);
534 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
535 return src_reg(expanded);
536 }
537
538 /**
539 * Workaround for register access modes not supported by the math
540 * instruction.
541 */
542 src_reg
543 fix_math_operand(const src_reg &src) const
544 {
545 /* The gen6 math instruction ignores the source modifiers --
546 * swizzle, abs, negate, and at least some parts of the register
547 * region description.
548 *
549 * Rather than trying to enumerate all these cases, *always* expand the
550 * operand to a temp GRF for gen6.
551 *
552 * For gen7, keep the operand as-is, except if immediate, which gen7 still
553 * can't use.
554 */
555 if (shader->devinfo->gen == 6 ||
556 (shader->devinfo->gen == 7 && src.file == IMM)) {
557 const dst_reg tmp = vgrf(src.type);
558 MOV(tmp, src);
559 return src_reg(tmp);
560 } else {
561 return src;
562 }
563 }
564
565 /**
566 * Workaround other weirdness of the math instruction.
567 */
568 instruction *
569 fix_math_instruction(instruction *inst) const
570 {
571 if (shader->devinfo->gen == 6 &&
572 inst->dst.writemask != WRITEMASK_XYZW) {
573 const dst_reg tmp = vgrf(inst->dst.type);
574 MOV(inst->dst, src_reg(tmp));
575 inst->dst = tmp;
576
577 } else if (shader->devinfo->gen < 6) {
578 const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
579 inst->base_mrf = 1;
580 inst->mlen = sources;
581 }
582
583 return inst;
584 }
585
586 bblock_t *block;
587 exec_node *cursor;
588
589 bool force_writemask_all;
590
591 /** Debug annotation info. */
592 struct {
593 const char *str;
594 const void *ir;
595 } annotation;
596 };
597 }
598
599 #endif