/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_IR_FS_H
#define BRW_IR_FS_H

#include "brw_shader.h"

class fs_inst;

class fs_reg : public backend_reg {
public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)

   void init();

   fs_reg();
   fs_reg(struct ::brw_reg reg);
   fs_reg(enum brw_reg_file file, int nr);
   fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);

   bool equals(const fs_reg &r) const;
   bool negative_equals(const fs_reg &r) const;
   bool is_contiguous() const;

   /**
    * Return the size in bytes of a single logical component of the
    * register assuming the given execution width.
    */
   unsigned component_size(unsigned width) const;

   /** Register region horizontal stride */
   uint8_t stride;
};

static inline fs_reg
negate(fs_reg reg)
{
   assert(reg.file != IMM);
   reg.negate = !reg.negate;
   return reg;
}

static inline fs_reg
retype(fs_reg reg, enum brw_reg_type type)
{
   reg.type = type;
   return reg;
}

static inline fs_reg
byte_offset(fs_reg reg, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
      break;
   case VGRF:
   case ATTR:
   case UNIFORM:
      reg.offset += delta;
      break;
   case MRF: {
      const unsigned suboffset = reg.offset + delta;
      reg.nr += suboffset / REG_SIZE;
      reg.offset = suboffset % REG_SIZE;
      break;
   }
   case ARF:
   case FIXED_GRF: {
      const unsigned suboffset = reg.subnr + delta;
      reg.nr += suboffset / REG_SIZE;
      reg.subnr = suboffset % REG_SIZE;
      break;
   }
   case IMM:
   default:
      assert(delta == 0);
   }
   return reg;
}

static inline fs_reg
horiz_offset(const fs_reg &reg, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      /* These only have a single component that is implicitly splatted. A
       * horizontal offset should be a harmless no-op.
       * XXX - Handle vector immediates correctly.
       */
      return reg;
   case VGRF:
   case MRF:
   case ATTR:
      return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
   case ARF:
   case FIXED_GRF:
      if (reg.is_null()) {
         return reg;
      } else {
         const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
         return byte_offset(reg, delta * stride * type_sz(reg.type));
      }
   }
   unreachable("Invalid register file");
}

static inline fs_reg
offset(fs_reg reg, unsigned width, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
      break;
   case ARF:
   case FIXED_GRF:
   case MRF:
   case VGRF:
   case ATTR:
   case UNIFORM:
      return byte_offset(reg, delta * reg.component_size(width));
   case IMM:
      assert(delta == 0);
   }
   return reg;
}
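
/* A minimal sketch of how the three offset helpers above differ, assuming a
 * hypothetical stride-1 32-bit float VGRF "reg" and the usual definition of
 * component_size() as width * stride * type_sz:
 *
 *    byte_offset(reg, 4);    - advance by exactly 4 bytes
 *    horiz_offset(reg, 4);   - advance by 4 scalar channels,
 *                              i.e. 4 * 1 * 4 = 16 bytes
 *    offset(reg, 16, 1);     - advance by one logical SIMD16 component,
 *                              i.e. 16 * 1 * 4 = 64 bytes
 *
 * The byte counts would differ for other types and strides.
 */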

/**
 * Get the scalar channel of \p reg given by \p idx and replicate it to all
 * channels of the result.
 */
static inline fs_reg
component(fs_reg reg, unsigned idx)
{
   reg = horiz_offset(reg, idx);
   reg.stride = 0;
   return reg;
}
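
/* Example (illustrative, assuming a hypothetical float VGRF "src"):
 *
 *    fs_reg chan = component(src, 3);
 *
 * yields a stride-0 region anchored at channel 3, so an instruction reading
 * "chan" sees the value of that single channel replicated across all of its
 * execution channels.
 */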

/**
 * Return an integer identifying the discrete address space a register is
 * contained in. A register is by definition fully contained in the single
 * reg_space it belongs to, so two registers with different reg_space ids are
 * guaranteed not to overlap. Most register files are a single reg_space of
 * their own; only the VGRF file is composed of multiple discrete address
 * spaces, one for each VGRF allocation.
 */
static inline uint32_t
reg_space(const fs_reg &r)
{
   return r.file << 16 | (r.file == VGRF ? r.nr : 0);
}

/**
 * Return the base offset in bytes of a register relative to the start of its
 * reg_space().
 */
static inline unsigned
reg_offset(const fs_reg &r)
{
   return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
          (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
          (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
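
/* Example (illustrative, using a hypothetical VGRF number): two regions of
 * the same VGRF allocation share a reg_space but have distinct reg_offsets:
 *
 *    fs_reg a(VGRF, 5, BRW_REGISTER_TYPE_F);
 *    fs_reg b = byte_offset(a, REG_SIZE);
 *
 * Here reg_space(a) == reg_space(b), while reg_offset(b) - reg_offset(a) ==
 * REG_SIZE. A register from any other file or VGRF number gets a different
 * reg_space id and therefore trivially cannot overlap either region.
 */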

/**
 * Return the amount of padding in bytes left unused between individual
 * components of register \p r due to a (horizontal) stride value greater than
 * one, or zero if components are tightly packed in the register file.
 */
static inline unsigned
reg_padding(const fs_reg &r)
{
   const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
                            r.hstride == 0 ? 0 :
                            1 << (r.hstride - 1));
   return (MAX2(1, stride) - 1) * type_sz(r.type);
}

/**
 * Return whether the register region starting at \p r and spanning \p dr
 * bytes could potentially overlap the register region starting at \p s and
 * spanning \p ds bytes.
 */
static inline bool
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
      fs_reg t = r;
      t.nr &= ~BRW_MRF_COMPR4;
      /* COMPR4 regions are translated by the hardware during decompression
       * into two separate half-regions 4 MRFs apart from each other.
       */
      return regions_overlap(t, dr / 2, s, ds) ||
             regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);

   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
      return regions_overlap(s, ds, r, dr);

   } else {
      return reg_space(r) == reg_space(s) &&
             !(reg_offset(r) + dr <= reg_offset(s) ||
               reg_offset(s) + ds <= reg_offset(r));
   }
}
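
/* Example (illustrative): with "a" and "b" as in the reg_space() example
 * above, i.e. "b" starting REG_SIZE bytes into the same VGRF:
 *
 *    regions_overlap(a, 2 * REG_SIZE, b, REG_SIZE)   - true, since
 *       [0, 64) and [32, 64) intersect within the same reg_space
 *    regions_overlap(a, REG_SIZE, b, REG_SIZE)       - false, since
 *       [0, 32) and [32, 64) are disjoint
 */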

/**
 * Check that the register region given by r [r.offset, r.offset + dr[
 * is fully contained inside the register region given by s
 * [s.offset, s.offset + ds[.
 */
static inline bool
region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   return reg_space(r) == reg_space(s) &&
          reg_offset(r) >= reg_offset(s) &&
          reg_offset(r) + dr <= reg_offset(s) + ds;
}

/**
 * Return whether the given register region is n-periodic, i.e. whether the
 * original region remains invariant after shifting it by \p n scalar
 * channels.
 */
static inline bool
is_periodic(const fs_reg &reg, unsigned n)
{
   if (reg.file == BAD_FILE || reg.is_null()) {
      return true;

   } else if (reg.file == IMM) {
      const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
                               reg.type == BRW_REGISTER_TYPE_V ? 8 :
                               reg.type == BRW_REGISTER_TYPE_VF ? 4 :
                               1);
      return n % period == 0;

   } else if (reg.file == ARF || reg.file == FIXED_GRF) {
      const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
                               reg.vstride == 0 ? 1 << reg.width :
                               ~0);
      return n % period == 0;

   } else {
      return reg.stride == 0;
   }
}

static inline bool
is_uniform(const fs_reg &reg)
{
   return is_periodic(reg, 1);
}
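
/* Example (illustrative): an immediate or any stride-0 region, such as the
 * result of component(), is uniform -- every channel reads the same value:
 *
 *    is_uniform(brw_imm_f(1.0f))     - true
 *    is_uniform(component(src, 0))   - true
 *    is_uniform(src)                 - false for an ordinary stride-1 VGRF
 *
 * brw_imm_f() is the float immediate constructor from brw_reg.h.
 */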

/**
 * Get the specified 8-component quarter of a register.
 */
static inline fs_reg
quarter(const fs_reg &reg, unsigned idx)
{
   assert(idx < 4);
   return horiz_offset(reg, 8 * idx);
}

/**
 * Reinterpret each channel of register \p reg as a vector of values of the
 * given smaller type and take the i-th subcomponent from each.
 */
static inline fs_reg
subscript(fs_reg reg, brw_reg_type type, unsigned i)
{
   assert((i + 1) * type_sz(type) <= type_sz(reg.type));

   if (reg.file == ARF || reg.file == FIXED_GRF) {
      /* The stride is encoded inconsistently for fixed GRF and ARF registers
       * as the log2 of the actual vertical and horizontal strides.
       */
      const int delta = util_logbase2(type_sz(reg.type)) -
                        util_logbase2(type_sz(type));
      reg.hstride += (reg.hstride ? delta : 0);
      reg.vstride += (reg.vstride ? delta : 0);

   } else if (reg.file == IMM) {
      assert(reg.type == type);

   } else {
      reg.stride *= type_sz(reg.type) / type_sz(type);
   }

   return byte_offset(retype(reg, type), i * type_sz(type));
}
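
/* Example (illustrative, assuming a hypothetical stride-1 dword VGRF "src"):
 *
 *    fs_reg hi = subscript(src, BRW_REGISTER_TYPE_UW, 1);
 *
 * yields a UW region with twice the original stride offset by 2 bytes, so
 * channel k of "hi" reads the high word (bytes 2-3) of channel k of "src".
 */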

static inline fs_reg
horiz_stride(fs_reg reg, unsigned s)
{
   reg.stride *= s;
   return reg;
}

static const fs_reg reg_undef;

class fs_inst : public backend_instruction {
   fs_inst &operator=(const fs_inst &);

   void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
             const fs_reg *src, unsigned sources);

public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_inst)

   fs_inst();
   fs_inst(enum opcode opcode, uint8_t exec_size);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg src[], unsigned sources);
   fs_inst(const fs_inst &that);
   ~fs_inst();

   void resize_sources(uint8_t num_sources);

   bool is_send_from_grf() const;
   bool is_payload(unsigned arg) const;
   bool is_partial_write() const;
   unsigned components_read(unsigned i) const;
   unsigned size_read(int arg) const;
   bool can_do_source_mods(const struct gen_device_info *devinfo) const;
   bool can_do_cmod();
   bool can_change_types() const;
   bool has_source_and_destination_hazard() const;
   unsigned implied_mrf_writes() const;

   /**
    * Return whether \p arg is a control source of a virtual instruction which
    * shouldn't contribute to the execution type and usual regioning
    * restriction calculations of arithmetic instructions.
    */
   bool is_control_source(unsigned arg) const;

   /**
    * Return the subset of flag registers read by the instruction as a bitset
    * with byte granularity.
    */
   unsigned flags_read(const gen_device_info *devinfo) const;

   /**
    * Return the subset of flag registers updated by the instruction (either
    * partially or fully) as a bitset with byte granularity.
    */
   unsigned flags_written() const;

   fs_reg dst;
   fs_reg *src;

   uint8_t sources; /**< Number of fs_reg sources. */

   bool last_rt:1;
   bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */

   tgl_swsb sched; /**< Scheduling info. */
};

/**
 * Make the execution of \p inst dependent on the evaluation of a possibly
 * inverted predicate.
 */
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
                  fs_inst *inst)
{
   inst->predicate = pred;
   inst->predicate_inverse = inverse;
   return inst;
}

/**
 * Make the execution of \p inst dependent on the evaluation of a predicate.
 */
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
   return set_predicate_inv(pred, false, inst);
}

/**
 * Write the result of evaluating the condition given by \p mod to a flag
 * register.
 */
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
   inst->conditional_mod = mod;
   return inst;
}

/**
 * Clamp the result of \p inst to the saturation range of its destination
 * datatype.
 */
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
   inst->saturate = saturate;
   return inst;
}
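
/* These helpers return the instruction they modify, so they compose directly
 * with builder calls that return an fs_inst pointer. A sketch, assuming an
 * fs_builder "bld" and registers "dst", "a" and "b" in scope:
 *
 *    set_condmod(BRW_CONDITIONAL_NZ, bld.AND(dst, a, b));
 *    set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(dst, a));
 *
 * The first updates the flag register with the non-zero test of the AND
 * result, and the second executes the MOV only in channels where the flag
 * is set.
 */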

/**
 * Return the number of dataflow registers written by the instruction (either
 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM and IMM files and 32B for all other files.
 */
inline unsigned
regs_written(const fs_inst *inst)
{
   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
                       inst->size_written -
                       MIN2(inst->size_written, reg_padding(inst->dst)),
                       REG_SIZE);
}

/**
 * Return the number of dataflow registers read by the instruction (either
 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM and IMM files and 32B for all other files.
 */
inline unsigned
regs_read(const fs_inst *inst, unsigned i)
{
   const unsigned reg_size =
      inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE;
   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
                       inst->size_read(i) -
                       MIN2(inst->size_read(i), reg_padding(inst->src[i])),
                       reg_size);
}
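
/* Worked example (illustrative): a SIMD8 stride-1 float source reads
 * size_read = 8 * 4 = 32B with no padding. If it starts 16B into its VGRF,
 *
 *    regs_read = DIV_ROUND_UP(16 + 32 - 0, 32) = 2
 *
 * i.e. the read straddles two 32B registers even though it only covers one
 * register's worth of data.
 */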

static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
   brw_reg_type exec_type = BRW_REGISTER_TYPE_B;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != BAD_FILE &&
          !inst->is_control_source(i)) {
         const brw_reg_type t = get_exec_type(inst->src[i].type);
         if (type_sz(t) > type_sz(exec_type))
            exec_type = t;
         else if (type_sz(t) == type_sz(exec_type) &&
                  brw_reg_type_is_floating_point(t))
            exec_type = t;
      }
   }

   if (exec_type == BRW_REGISTER_TYPE_B)
      exec_type = inst->dst.type;

   assert(exec_type != BRW_REGISTER_TYPE_B);

   /* Promotion of the execution type to 32-bit for conversions from or to
    * half-float seems to be consistent with the following text from the
    * Cherryview PRM Vol. 7, "Execution Data Type":
    *
    * "When single precision and half precision floats are mixed between
    *  source operands or between source and destination operand [..] single
    *  precision float is the execution datatype."
    *
    * and from "Register Region Restrictions":
    *
    * "Conversion between Integer and HF (Half Float) must be DWord aligned
    *  and strided by a DWord on the destination."
    */
   if (type_sz(exec_type) == 2 &&
       inst->dst.type != exec_type) {
      if (exec_type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_F;
      else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_D;
   }

   return exec_type;
}

static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
   return type_sz(get_exec_type(inst));
}
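
/* Examples (illustrative): an ADD with a W source and a D source executes
 * as D, since the widest source type wins and floating-point types win ties
 * of equal size. Per the promotion rules above, a MOV from HF to F executes
 * as F, and a MOV from a W source to an HF destination executes as D.
 */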

static inline bool
is_send(const fs_inst *inst)
{
   return inst->mlen || inst->is_send_from_grf();
}

/**
 * Return whether the instruction isn't an ALU instruction and cannot be
 * assumed to complete in-order.
 */
static inline bool
is_unordered(const fs_inst *inst)
{
   return is_send(inst) || inst->is_math();
}

/**
 * Return whether the following regioning restriction applies to the specified
 * instruction. From the Cherryview PRM Vol 7. "Register Region
 * Restrictions":
 *
 * "When source or destination datatype is 64b or operation is integer DWord
 *  multiply, regioning in Align1 must follow these rules:
 *
 *  1. Source and Destination horizontal stride must be aligned to the same
 *     qword.
 *  2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
 *  3. Source and Destination offset must be the same, except the case of
 *     scalar source."
 */
static inline bool
has_dst_aligned_region_restriction(const gen_device_info *devinfo,
                                   const fs_inst *inst)
{
   const brw_reg_type exec_type = get_exec_type(inst);
   /* Even though the hardware spec claims that "integer DWord multiply"
    * operations are restricted, empirical evidence and the behavior of the
    * simulator suggest that only 32x32-bit integer multiplication is
    * restricted.
    */
   const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
      ((inst->opcode == BRW_OPCODE_MUL &&
        MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
       (inst->opcode == BRW_OPCODE_MAD &&
        MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

   if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 ||
       (type_sz(exec_type) == 4 && is_dword_multiply))
      return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo);
   else
      return false;
}
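
/* Example (illustrative): on Cherryview a 64-bit MOV or a 32 x 32-bit
 * integer MUL is subject to this restriction, while a 32 x 16-bit integer
 * MUL is not -- hence the MIN2() over the source type sizes above, which
 * only flags a multiply when both operands are at least dword-sized.
 */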

/**
 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
 * the specified register file into a VGRF.
 *
 * This implies identity register regions without any source-destination
 * overlap, but otherwise has no implications on the location of sources and
 * destination in the register file: Gathering any number of portions from
 * multiple virtual registers in any order is allowed.
 */
inline bool
is_copy_payload(brw_reg_file file, const fs_inst *inst)
{
   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
       inst->is_partial_write() || inst->saturate ||
       inst->dst.file != VGRF)
      return false;

   for (unsigned i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != file ||
          inst->src[i].abs || inst->src[i].negate)
         return false;

      if (!inst->src[i].is_contiguous())
         return false;

      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(i)))
         return false;
   }

   return true;
}

/**
 * Like is_copy_payload(), but the instruction is required to copy a single
 * contiguous block of registers from the given register file into the
 * destination without any reordering.
 */
inline bool
is_identity_payload(brw_reg_file file, const fs_inst *inst)
{
   if (is_copy_payload(file, inst)) {
      fs_reg reg = inst->src[0];

      for (unsigned i = 0; i < inst->sources; i++) {
         reg.type = inst->src[i].type;
         if (!inst->src[i].equals(reg))
            return false;

         reg = byte_offset(reg, inst->size_read(i));
      }

      return true;
   } else {
      return false;
   }
}
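
/* Example (illustrative, assuming each source reads exactly one 32B
 * register, e.g. SIMD8 dword-per-channel data): a LOAD_PAYLOAD whose
 * sources walk through one VGRF in order,
 *
 *    sources[0] = src;
 *    sources[1] = byte_offset(src, REG_SIZE);
 *
 * is an identity payload of the VGRF file, since each source begins exactly
 * where the previous one's size_read() ended. Sources gathered from
 * unrelated VGRFs could still form a copy payload, but not an identity one.
 */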

/**
 * Like is_copy_payload(), but the instruction is required to source data from
 * at least two disjoint VGRFs.
 *
 * This doesn't necessarily rule out the elimination of this instruction
 * through register coalescing, but due to limitations of the register
 * coalesce pass it might be impossible to do so directly until a later stage,
 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
 * instructions.
 */
inline bool
is_multi_copy_payload(const fs_inst *inst)
{
   if (is_copy_payload(VGRF, inst)) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].nr != inst->src[0].nr)
            return true;
      }
   }

   return false;
}

/**
 * Like is_identity_payload(), but the instruction is required to copy the
 * whole contents of a single VGRF into the destination.
 *
 * This means that there is a good chance that the instruction will be
 * eliminated through register coalescing, but it's neither a necessary nor a
 * sufficient condition for that to happen -- E.g. consider the case where
 * source and destination registers diverge due to other instructions in the
 * program overwriting part of their contents, which isn't something we can
 * predict up front based on a cheap strictly local test of the copy
 * instruction.
 */
inline bool
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
{
   return is_identity_payload(VGRF, inst) &&
          inst->src[0].offset == 0 &&
          alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}

bool
has_bank_conflict(const gen_device_info *devinfo, const fs_inst *inst);

#endif