/* i965: Enable EGL_KHR_gl_texture_3D_image
 * [mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_builder.h
 */
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       *
       * With a NULL block and cursor, instructions are appended at the end
       * of the shader's instruction list (see emit(instruction *)).
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         /* A NULL block with the tail sentinel as cursor means "append to
          * the flat instruction list" in emit(instruction *).
          */
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         /* The group must fit within the parent's dispatch width unless
          * execution masking is disabled, in which case channel enables are
          * irrelevant.
          */
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       *
       * With \p n equal to zero no storage is allocated and a null register
       * of the requested type is returned instead.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of double-float type.
       */
      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            /* Non-FS stages have no discard; all channels are enabled. */
            return brw_imm_d(0xffffffff);
         } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
            /* The discard state is tracked in flag register f0.1. */
            return brw_flag_reg(0, 1);
         } else {
            /* Without discard the dispatch mask delivered in g1.7 is
             * authoritative.
             */
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       *
       * Math opcodes get their operand legalized for the hardware math
       * unit's source restrictions (see fix_math_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       *
       * Math opcodes get both operands legalized for the hardware math
       * unit's source restrictions (see fix_math_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       *
       * Three-source opcodes get their operands legalized for the more
       * restrictive 3-src encoding (see fix_3src_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * All other emit() overloads funnel through here.  The builder's
       * current execution controls and debug annotation are stamped onto
       * the instruction before insertion.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         /* With a known basic block insert via the CFG so block bookkeeping
          * stays consistent, otherwise splice into the flat exec_list.
          */
         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagration to move result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
      /* Helper generating a builder method for a unary ALU opcode. */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

      /* Helper generating a builder method for a binary ALU opcode. */
#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

      /* Like ALU2, but for opcodes that implicitly write the accumulator. */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

      /* Helper generating a builder method for a ternary ALU opcode. */
#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       *
       * The first \p header_size sources are treated as a message header
       * occupying one physical register each; the remaining sources each
       * occupy one logical (dispatch_width-sized) component.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      /** Shader the new instructions are inserted into. */
      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            /* Copy unsupported sources into a temporary VGRF. */
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /** Basic block the cursor belongs to, or NULL for the flat list. */
      bblock_t *block;
      /** New instructions are inserted immediately before this node. */
      exec_node *cursor;

      /** Default SIMD width for new instructions. */
      unsigned _dispatch_width;
      /** Default channel group offset for new instructions. */
      unsigned _group;
      /** Whether per-channel execution masking is disabled by default. */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}
662
663 #endif