i965/compaction: Don't set UIP on ELSE on Gen < 8.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_compact.c
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_eu_compact.c
25 *
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
28 *
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
34 *
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
38 */
39
40 #include "brw_context.h"
41 #include "brw_eu.h"
42 #include "intel_asm_annotation.h"
43
/* Control index table installed by brw_init_compaction_tables() when
 * brw->gen == 6.
 *
 * Entry i is the uncompacted control bit pattern (17 bits wide) that the
 * compacted control index i stands for; set_control_index() searches it and
 * set_uncompacted_control() reads it back.
 */
static const uint32_t gen6_control_index_table[32] = {
   0b00000000000000000,
   0b01000000000000000,
   0b00110000000000000,
   0b00000000100000000,
   0b00010000000000000,
   0b00001000100000000,
   0b00000000100000010,
   0b00000000000000010,
   0b01000000100000000,
   0b01010000000000000,
   0b10110000000000000,
   0b00100000000000000,
   0b11010000000000000,
   0b11000000000000000,
   0b01001000100000000,
   0b01000000000001000,
   0b01000000000000100,
   0b00000000000001000,
   0b00000000000000100,
   0b00111000100000000,
   0b00001000100000010,
   0b00110000100000000,
   0b00110000000000001,
   0b00100000000000001,
   0b00110000000000010,
   0b00110000000000101,
   0b00110000000001001,
   0b00110000000010000,
   0b00110000000000011,
   0b00110000000000100,
   0b00110000100001000,
   0b00100000000001001,
};
78
/* Datatype index table installed by brw_init_compaction_tables() when
 * brw->gen == 6.
 *
 * Entry i is the 18-bit uncompacted datatype pattern that compacted datatype
 * index i stands for.
 *
 * NOTE(review): entries 23 and 25 hold the same value, so the linear search
 * in set_datatype_index() can never produce index 25 -- confirm against the
 * hardware documentation.
 */
static const uint32_t gen6_datatype_table[32] = {
   0b001001110000000000,
   0b001000110000100000,
   0b001001110000000001,
   0b001000000001100000,
   0b001010110100101001,
   0b001000000110101101,
   0b001100011000101100,
   0b001011110110101101,
   0b001000000111101100,
   0b001000000001100001,
   0b001000110010100101,
   0b001000000001000001,
   0b001000001000110001,
   0b001000001000101001,
   0b001000000000100000,
   0b001000001000110010,
   0b001010010100101001,
   0b001011010010100101,
   0b001000000110100101,
   0b001100011000101001,
   0b001011011000101100,
   0b001011010110100101,
   0b001011110110100101,
   0b001111011110111101,
   0b001111011110111100,
   0b001111011110111101,
   0b001111011110011101,
   0b001111011110111110,
   0b001000000000100001,
   0b001000000000100010,
   0b001001111111011101,
   0b001000001110111110,
};
113
/* Subregister index table installed by brw_init_compaction_tables() when
 * brw->gen == 6.
 *
 * Entry i is the 15-bit pattern (three 5-bit fields, see set_subreg_index())
 * that compacted subreg index i stands for.
 */
static const uint16_t gen6_subreg_table[32] = {
   0b000000000000000,
   0b000000000000100,
   0b000000110000000,
   0b111000000000000,
   0b011110000001000,
   0b000010000000000,
   0b000000000010000,
   0b000110000001100,
   0b001000000000000,
   0b000001000000000,
   0b000001010010100,
   0b000000001010110,
   0b010000000000000,
   0b110000000000000,
   0b000100000000000,
   0b000000010000000,
   0b000000000001000,
   0b100000000000000,
   0b000001010000000,
   0b001010000000000,
   0b001100000000000,
   0b000000001010100,
   0b101101010010100,
   0b010100000000000,
   0b000000010001111,
   0b011000000000000,
   0b111110000000000,
   0b101000000000000,
   0b000000000001111,
   0b000100010001111,
   0b001000010001111,
   0b000110000000000,
};
148
/* Source index table installed by brw_init_compaction_tables() when
 * brw->gen == 6.
 *
 * Entry i is the 12-bit uncompacted source pattern that compacted src0/src1
 * index i stands for (see get_src_index()).
 */
static const uint16_t gen6_src_index_table[32] = {
   0b000000000000,
   0b010110001000,
   0b010001101000,
   0b001000101000,
   0b011010010000,
   0b000100100000,
   0b010001101100,
   0b010101110000,
   0b011001111000,
   0b001100101000,
   0b010110001100,
   0b001000100000,
   0b010110001010,
   0b000000000010,
   0b010101010000,
   0b010101101000,
   0b111101001100,
   0b111100101100,
   0b011001110000,
   0b010110001001,
   0b010101011000,
   0b001101001000,
   0b010000101100,
   0b010000000000,
   0b001101110000,
   0b001100010000,
   0b001100000000,
   0b010001101010,
   0b001101111000,
   0b000001110000,
   0b001100100000,
   0b001101010000,
};
183
/* Control index table installed by brw_init_compaction_tables() when
 * brw->gen == 7.
 *
 * Entries are 19 bits wide: on gen7 set_control_index() places two extra
 * bits (instruction bits 90:89) above the 17 control bits.
 */
static const uint32_t gen7_control_index_table[32] = {
   0b0000000000000000010,
   0b0000100000000000000,
   0b0000100000000000001,
   0b0000100000000000010,
   0b0000100000000000011,
   0b0000100000000000100,
   0b0000100000000000101,
   0b0000100000000000111,
   0b0000100000000001000,
   0b0000100000000001001,
   0b0000100000000001101,
   0b0000110000000000000,
   0b0000110000000000001,
   0b0000110000000000010,
   0b0000110000000000011,
   0b0000110000000000100,
   0b0000110000000000101,
   0b0000110000000000111,
   0b0000110000000001001,
   0b0000110000000001101,
   0b0000110000000010000,
   0b0000110000100000000,
   0b0001000000000000000,
   0b0001000000000000010,
   0b0001000000000000100,
   0b0001000000100000000,
   0b0010110000000000000,
   0b0010110000000010000,
   0b0011000000000000000,
   0b0011000000100000000,
   0b0101000000000000000,
   0b0101000000100000000,
};
218
/* Datatype index table installed by brw_init_compaction_tables() when
 * brw->gen == 7.
 *
 * Entry i is the 18-bit uncompacted datatype pattern that compacted datatype
 * index i stands for.
 */
static const uint32_t gen7_datatype_table[32] = {
   0b001000000000000001,
   0b001000000000100000,
   0b001000000000100001,
   0b001000000001100001,
   0b001000000010111101,
   0b001000001011111101,
   0b001000001110100001,
   0b001000001110100101,
   0b001000001110111101,
   0b001000010000100001,
   0b001000110000100000,
   0b001000110000100001,
   0b001001010010100101,
   0b001001110010100100,
   0b001001110010100101,
   0b001111001110111101,
   0b001111011110011101,
   0b001111011110111100,
   0b001111011110111101,
   0b001111111110111100,
   0b000000001000001100,
   0b001000000000111101,
   0b001000000010100101,
   0b001000010000100000,
   0b001001010010100100,
   0b001001110010000100,
   0b001010010100001001,
   0b001101111110111101,
   0b001111111110111101,
   0b001011110110101100,
   0b001010010100101000,
   0b001010110100101000,
};
253
/* Subregister index table installed by brw_init_compaction_tables() when
 * brw->gen == 7.
 *
 * Entry i is the 15-bit pattern (three 5-bit fields, see set_subreg_index())
 * that compacted subreg index i stands for.
 */
static const uint16_t gen7_subreg_table[32] = {
   0b000000000000000,
   0b000000000000001,
   0b000000000001000,
   0b000000000001111,
   0b000000000010000,
   0b000000010000000,
   0b000000100000000,
   0b000000110000000,
   0b000001000000000,
   0b000001000010000,
   0b000010100000000,
   0b001000000000000,
   0b001000000000001,
   0b001000010000001,
   0b001000010000010,
   0b001000010000011,
   0b001000010000100,
   0b001000010000111,
   0b001000010001000,
   0b001000010001110,
   0b001000010001111,
   0b001000110000000,
   0b001000111101000,
   0b010000000000000,
   0b010000110000000,
   0b011000000000000,
   0b011110010000111,
   0b100000000000000,
   0b101000000000000,
   0b110000000000000,
   0b111000000000000,
   0b111000000011100,
};
288
/* Source index table installed by brw_init_compaction_tables() when
 * brw->gen == 7.
 *
 * Entry i is the 12-bit uncompacted source pattern that compacted src0/src1
 * index i stands for (see get_src_index()).
 */
static const uint16_t gen7_src_index_table[32] = {
   0b000000000000,
   0b000000000010,
   0b000000010000,
   0b000000010010,
   0b000000011000,
   0b000000100000,
   0b000000101000,
   0b000001001000,
   0b000001010000,
   0b000001110000,
   0b000001111000,
   0b001100000000,
   0b001100000010,
   0b001100001000,
   0b001100010000,
   0b001100010010,
   0b001100100000,
   0b001100101000,
   0b001100111000,
   0b001101000000,
   0b001101000010,
   0b001101001000,
   0b001101010000,
   0b001101100000,
   0b001101101000,
   0b001101110000,
   0b001101110001,
   0b001101111000,
   0b010001101000,
   0b010001101001,
   0b010001101010,
   0b010110001000,
};
323
/* Control index table installed by brw_init_compaction_tables() when
 * brw->gen == 8.
 *
 * Entry i is the 19-bit uncompacted control pattern that compacted control
 * index i stands for; see set_control_index() for how the bits are gathered
 * on gen8.
 */
static const uint32_t gen8_control_index_table[32] = {
   0b0000000000000000010,
   0b0000100000000000000,
   0b0000100000000000001,
   0b0000100000000000010,
   0b0000100000000000011,
   0b0000100000000000100,
   0b0000100000000000101,
   0b0000100000000000111,
   0b0000100000000001000,
   0b0000100000000001001,
   0b0000100000000001101,
   0b0000110000000000000,
   0b0000110000000000001,
   0b0000110000000000010,
   0b0000110000000000011,
   0b0000110000000000100,
   0b0000110000000000101,
   0b0000110000000000111,
   0b0000110000000001001,
   0b0000110000000001101,
   0b0000110000000010000,
   0b0000110000100000000,
   0b0001000000000000000,
   0b0001000000000000010,
   0b0001000000000000100,
   0b0001000000100000000,
   0b0010110000000000000,
   0b0010110000000010000,
   0b0011000000000000000,
   0b0011000000100000000,
   0b0101000000000000000,
   0b0101000000100000000,
};
358
/* Datatype index table installed by brw_init_compaction_tables() when
 * brw->gen == 8.
 *
 * Entry i is the 21-bit uncompacted datatype pattern that compacted datatype
 * index i stands for.
 */
static const uint32_t gen8_datatype_table[32] = {
   0b001000000000000000001,
   0b001000000000001000000,
   0b001000000000001000001,
   0b001000000000011000001,
   0b001000000000101011101,
   0b001000000010111011101,
   0b001000000011101000001,
   0b001000000011101000101,
   0b001000000011101011101,
   0b001000001000001000001,
   0b001000011000001000000,
   0b001000011000001000001,
   0b001000101000101000101,
   0b001000111000101000100,
   0b001000111000101000101,
   0b001011100011101011101,
   0b001011101011100011101,
   0b001011101011101011100,
   0b001011101011101011101,
   0b001011111011101011100,
   0b000000000010000001100,
   0b001000000000001011101,
   0b001000000000101000101,
   0b001000001000001000000,
   0b001000101000101000100,
   0b001000111000100000100,
   0b001001001001000001001,
   0b001010111011101011101,
   0b001011111011101011101,
   0b001001111001101001100,
   0b001001001001001001000,
   0b001001011001001001000,
};
393
/* Subregister index table installed by brw_init_compaction_tables() when
 * brw->gen == 8.
 *
 * Entry i is the 15-bit pattern (three 5-bit fields, see set_subreg_index())
 * that compacted subreg index i stands for.
 */
static const uint16_t gen8_subreg_table[32] = {
   0b000000000000000,
   0b000000000000001,
   0b000000000001000,
   0b000000000001111,
   0b000000000010000,
   0b000000010000000,
   0b000000100000000,
   0b000000110000000,
   0b000001000000000,
   0b000001000010000,
   0b000001010000000,
   0b001000000000000,
   0b001000000000001,
   0b001000010000001,
   0b001000010000010,
   0b001000010000011,
   0b001000010000100,
   0b001000010000111,
   0b001000010001000,
   0b001000010001110,
   0b001000010001111,
   0b001000110000000,
   0b001000111101000,
   0b010000000000000,
   0b010000110000000,
   0b011000000000000,
   0b011110010000111,
   0b100000000000000,
   0b101000000000000,
   0b110000000000000,
   0b111000000000000,
   0b111000000011100,
};
428
/* Source index table installed by brw_init_compaction_tables() when
 * brw->gen == 8.
 *
 * Entry i is the 12-bit uncompacted source pattern that compacted src0/src1
 * index i stands for (see get_src_index()).
 */
static const uint16_t gen8_src_index_table[32] = {
   0b000000000000,
   0b000000000010,
   0b000000010000,
   0b000000010010,
   0b000000011000,
   0b000000100000,
   0b000000101000,
   0b000001001000,
   0b000001010000,
   0b000001110000,
   0b000001111000,
   0b001100000000,
   0b001100000010,
   0b001100001000,
   0b001100010000,
   0b001100010010,
   0b001100100000,
   0b001100101000,
   0b001100111000,
   0b001101000000,
   0b001101000010,
   0b001101001000,
   0b001101010000,
   0b001101100000,
   0b001101101000,
   0b001101110000,
   0b001101110001,
   0b001101111000,
   0b010001101000,
   0b010001101001,
   0b010001101010,
   0b010110001000,
};
463
/* Control index table for three-source instructions.
 *
 * The values are the Cherryview (26-bit) encoding.  Broadwell uses 24 bits;
 * the Cherryview form only adds two zero bits on top, and the low 24 bits
 * map identically on both, so one table serves both.
 */
static const uint32_t gen8_3src_control_index_table[4] = {
   0b00100000000110000000000001,
   0b00000000000110000000000001,
   0b00000000001000000000000001,
   0b00000000001000000000100001,
};
476
/* Source index table for three-source instructions.
 *
 * The values are the Cherryview (49-bit) encoding.  Broadwell uses 46 bits;
 * the Cherryview form only adds three zero bits on top.  The low 44 bits map
 * identically on both, and every entry has zeros above them, so the
 * Cherryview table can be reused for Broadwell.
 */
static const uint64_t gen8_3src_source_index_table[4] = {
   0b0000001110010011100100111001000001111000000000000,
   0b0000001110010011100100111001000001111000000000010,
   0b0000001110010011100100111001000001111000000001000,
   0b0000001110010011100100111001000001111000000100000,
};
490
/* Lookup tables currently in use.  They start out NULL and are pointed at
 * the gen6/gen7/gen8 tables by brw_init_compaction_tables().
 */
static const uint32_t *control_index_table;
static const uint32_t *datatype_table;
static const uint16_t *subreg_table;
static const uint16_t *src_index_table;
495
496 static bool
497 set_control_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src)
498 {
499 uint32_t uncompacted = brw->gen >= 8 /* 17b/SNB; 19b/IVB+ */
500 ? (brw_inst_bits(src, 33, 31) << 16) | /* 3b */
501 (brw_inst_bits(src, 23, 12) << 4) | /* 12b */
502 (brw_inst_bits(src, 10, 9) << 2) | /* 2b */
503 (brw_inst_bits(src, 34, 34) << 1) | /* 1b */
504 (brw_inst_bits(src, 8, 8)) /* 1b */
505 : (brw_inst_bits(src, 31, 31) << 16) | /* 1b */
506 (brw_inst_bits(src, 23, 8)); /* 16b */
507
508 /* On gen7, the flag register and subregister numbers are integrated into
509 * the control index.
510 */
511 if (brw->gen == 7)
512 uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */
513
514 for (int i = 0; i < 32; i++) {
515 if (control_index_table[i] == uncompacted) {
516 brw_compact_inst_set_control_index(dst, i);
517 return true;
518 }
519 }
520
521 return false;
522 }
523
524 static bool
525 set_datatype_index(struct brw_context *brw, brw_compact_inst *dst,
526 brw_inst *src)
527 {
528 uint32_t uncompacted = brw->gen >= 8 /* 18b/SNB+; 21b/BDW+ */
529 ? (brw_inst_bits(src, 63, 61) << 18) | /* 3b */
530 (brw_inst_bits(src, 94, 89) << 12) | /* 6b */
531 (brw_inst_bits(src, 46, 35)) /* 12b */
532 : (brw_inst_bits(src, 63, 61) << 15) | /* 3b */
533 (brw_inst_bits(src, 46, 32)); /* 15b */
534
535 for (int i = 0; i < 32; i++) {
536 if (datatype_table[i] == uncompacted) {
537 brw_compact_inst_set_datatype_index(dst, i);
538 return true;
539 }
540 }
541
542 return false;
543 }
544
545 static bool
546 set_subreg_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src,
547 bool is_immediate)
548 {
549 uint16_t uncompacted = /* 15b */
550 (brw_inst_bits(src, 52, 48) << 0) | /* 5b */
551 (brw_inst_bits(src, 68, 64) << 5); /* 5b */
552
553 if (!is_immediate)
554 uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */
555
556 for (int i = 0; i < 32; i++) {
557 if (subreg_table[i] == uncompacted) {
558 brw_compact_inst_set_subreg_index(dst, i);
559 return true;
560 }
561 }
562
563 return false;
564 }
565
566 static bool
567 get_src_index(uint16_t uncompacted,
568 uint16_t *compacted)
569 {
570 for (int i = 0; i < 32; i++) {
571 if (src_index_table[i] == uncompacted) {
572 *compacted = i;
573 return true;
574 }
575 }
576
577 return false;
578 }
579
580 static bool
581 set_src0_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src)
582 {
583 uint16_t compacted;
584 uint16_t uncompacted = brw_inst_bits(src, 88, 77); /* 12b */
585
586 if (!get_src_index(uncompacted, &compacted))
587 return false;
588
589 brw_compact_inst_set_src0_index(dst, compacted);
590
591 return true;
592 }
593
594 static bool
595 set_src1_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src,
596 bool is_immediate)
597 {
598 uint16_t compacted;
599
600 if (is_immediate) {
601 compacted = (brw_inst_imm_ud(brw, src) >> 8) & 0x1f;
602 } else {
603 uint16_t uncompacted = brw_inst_bits(src, 120, 109); /* 12b */
604
605 if (!get_src_index(uncompacted, &compacted))
606 return false;
607 }
608
609 brw_compact_inst_set_src1_index(dst, compacted);
610
611 return true;
612 }
613
614 static bool
615 set_3src_control_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src)
616 {
617 assert(brw->gen >= 8);
618
619 uint32_t uncompacted = /* 24b/BDW; 26b/CHV */
620 (brw_inst_bits(src, 34, 32) << 21) | /* 3b */
621 (brw_inst_bits(src, 28, 8)); /* 21b */
622
623 if (brw->is_cherryview)
624 uncompacted |= brw_inst_bits(src, 36, 35) << 24; /* 2b */
625
626 for (int i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) {
627 if (gen8_3src_control_index_table[i] == uncompacted) {
628 brw_compact_inst_set_3src_control_index(dst, i);
629 return true;
630 }
631 }
632
633 return false;
634 }
635
636 static bool
637 set_3src_source_index(struct brw_context *brw, brw_compact_inst *dst, brw_inst *src)
638 {
639 assert(brw->gen >= 8);
640
641 uint64_t uncompacted = /* 46b/BDW; 49b/CHV */
642 (brw_inst_bits(src, 83, 83) << 43) | /* 1b */
643 (brw_inst_bits(src, 114, 107) << 35) | /* 8b */
644 (brw_inst_bits(src, 93, 86) << 27) | /* 8b */
645 (brw_inst_bits(src, 72, 65) << 19) | /* 8b */
646 (brw_inst_bits(src, 55, 37)); /* 19b */
647
648 if (brw->is_cherryview) {
649 uncompacted |=
650 (brw_inst_bits(src, 126, 125) << 47) | /* 2b */
651 (brw_inst_bits(src, 105, 104) << 45) | /* 2b */
652 (brw_inst_bits(src, 84, 84) << 44); /* 1b */
653 } else {
654 uncompacted |=
655 (brw_inst_bits(src, 125, 125) << 45) | /* 1b */
656 (brw_inst_bits(src, 104, 104) << 44); /* 1b */
657 }
658
659 for (int i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) {
660 if (gen8_3src_source_index_table[i] == uncompacted) {
661 brw_compact_inst_set_3src_source_index(dst, i);
662 return true;
663 }
664 }
665
666 return false;
667 }
668
669 static bool
670 brw_try_compact_3src_instruction(struct brw_context *brw, brw_compact_inst *dst,
671 brw_inst *src)
672 {
673 assert(brw->gen >= 8);
674
675 #define compact(field) \
676 brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(brw, src))
677
678 compact(opcode);
679
680 if (!set_3src_control_index(brw, dst, src))
681 return false;
682
683 if (!set_3src_source_index(brw, dst, src))
684 return false;
685
686 compact(dst_reg_nr);
687 compact(src0_rep_ctrl);
688 brw_compact_inst_set_3src_cmpt_control(dst, true);
689 compact(debug_control);
690 compact(saturate);
691 compact(src1_rep_ctrl);
692 compact(src2_rep_ctrl);
693 compact(src0_reg_nr);
694 compact(src1_reg_nr);
695 compact(src2_reg_nr);
696 compact(src0_subreg_nr);
697 compact(src1_subreg_nr);
698 compact(src2_subreg_nr);
699
700 #undef compact
701
702 return true;
703 }
704
/* A compacted instruction stores the low 12 bits of an immediate source
 * verbatim, plus a 13th bit that is replicated through the high 20 bits.
 *
 * In practice that allows 12-bit integers, 0.0f, and a limited set of packed
 * vectors as compactable immediates.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   /* The low 12 bits are kept as-is, so only the upper 20 matter here. */
   const unsigned upper = imm & ~0xfff;

   /* The upper 20 bits must be all zeros or all ones. */
   if (upper == 0)
      return true;

   return upper == 0xfffff000;
}
720
721 /* Returns whether an opcode takes three sources. */
722 static bool
723 is_3src(uint32_t op)
724 {
725 return opcode_descs[op].nsrc == 3;
726 }
727
728 /**
729 * Tries to compact instruction src into dst.
730 *
731 * It doesn't modify dst unless src is compactable, which is relied on by
732 * brw_compact_instructions().
733 */
734 bool
735 brw_try_compact_instruction(struct brw_context *brw, brw_compact_inst *dst,
736 brw_inst *src)
737 {
738 brw_compact_inst temp;
739
740 if (brw_inst_opcode(brw, src) == BRW_OPCODE_IF ||
741 brw_inst_opcode(brw, src) == BRW_OPCODE_ELSE ||
742 brw_inst_opcode(brw, src) == BRW_OPCODE_ENDIF ||
743 brw_inst_opcode(brw, src) == BRW_OPCODE_HALT ||
744 brw_inst_opcode(brw, src) == BRW_OPCODE_DO ||
745 brw_inst_opcode(brw, src) == BRW_OPCODE_WHILE) {
746 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
747 * to be able to handle compacted flow control instructions..
748 */
749 return false;
750 }
751
752 if (is_3src(brw_inst_opcode(brw, src))) {
753 if (brw->gen >= 8) {
754 memset(&temp, 0, sizeof(temp));
755 if (brw_try_compact_3src_instruction(brw, &temp, src)) {
756 *dst = temp;
757 return true;
758 } else {
759 return false;
760 }
761 } else {
762 return false;
763 }
764 }
765
766 bool is_immediate =
767 brw_inst_src0_reg_file(brw, src) == BRW_IMMEDIATE_VALUE ||
768 brw_inst_src1_reg_file(brw, src) == BRW_IMMEDIATE_VALUE;
769 if (is_immediate && !is_compactable_immediate(brw_inst_imm_ud(brw, src))) {
770 return false;
771 }
772
773 memset(&temp, 0, sizeof(temp));
774
775 brw_compact_inst_set_opcode(&temp, brw_inst_opcode(brw, src));
776 brw_compact_inst_set_debug_control(&temp, brw_inst_debug_control(brw, src));
777 if (!set_control_index(brw, &temp, src))
778 return false;
779 if (!set_datatype_index(brw, &temp, src))
780 return false;
781 if (!set_subreg_index(brw, &temp, src, is_immediate))
782 return false;
783 brw_compact_inst_set_acc_wr_control(&temp,
784 brw_inst_acc_wr_control(brw, src));
785 brw_compact_inst_set_cond_modifier(&temp, brw_inst_cond_modifier(brw, src));
786 if (brw->gen <= 6)
787 brw_compact_inst_set_flag_subreg_nr(&temp,
788 brw_inst_flag_subreg_nr(brw, src));
789 brw_compact_inst_set_cmpt_control(&temp, true);
790 if (!set_src0_index(brw, &temp, src))
791 return false;
792 if (!set_src1_index(brw, &temp, src, is_immediate))
793 return false;
794 brw_compact_inst_set_dst_reg_nr(&temp, brw_inst_dst_da_reg_nr(brw, src));
795 brw_compact_inst_set_src0_reg_nr(&temp, brw_inst_src0_da_reg_nr(brw, src));
796 if (is_immediate) {
797 brw_compact_inst_set_src1_reg_nr(&temp, brw_inst_imm_ud(brw, src) & 0xff);
798 } else {
799 brw_compact_inst_set_src1_reg_nr(&temp,
800 brw_inst_src1_da_reg_nr(brw, src));
801 }
802
803 *dst = temp;
804
805 return true;
806 }
807
808 static void
809 set_uncompacted_control(struct brw_context *brw, brw_inst *dst,
810 brw_compact_inst *src)
811 {
812 uint32_t uncompacted =
813 control_index_table[brw_compact_inst_control_index(src)];
814
815 if (brw->gen >= 8) {
816 brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16));
817 brw_inst_set_bits(dst, 23, 12, (uncompacted >> 4) & 0xfff);
818 brw_inst_set_bits(dst, 10, 9, (uncompacted >> 2) & 0x3);
819 brw_inst_set_bits(dst, 34, 34, (uncompacted >> 1) & 0x1);
820 brw_inst_set_bits(dst, 8, 8, (uncompacted >> 0) & 0x1);
821 } else {
822 brw_inst_set_bits(dst, 31, 31, (uncompacted >> 16) & 0x1);
823 brw_inst_set_bits(dst, 23, 8, (uncompacted & 0xffff));
824
825 if (brw->gen == 7)
826 brw_inst_set_bits(dst, 90, 89, uncompacted >> 17);
827 }
828 }
829
830 static void
831 set_uncompacted_datatype(struct brw_context *brw, brw_inst *dst,
832 brw_compact_inst *src)
833 {
834 uint32_t uncompacted = datatype_table[brw_compact_inst_datatype_index(src)];
835
836 if (brw->gen >= 8) {
837 brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18));
838 brw_inst_set_bits(dst, 94, 89, (uncompacted >> 12) & 0x3f);
839 brw_inst_set_bits(dst, 46, 35, (uncompacted >> 0) & 0xfff);
840 } else {
841 brw_inst_set_bits(dst, 63, 61, (uncompacted >> 15));
842 brw_inst_set_bits(dst, 46, 32, (uncompacted & 0x7fff));
843 }
844 }
845
846 static void
847 set_uncompacted_subreg(struct brw_context *brw, brw_inst *dst,
848 brw_compact_inst *src)
849 {
850 uint16_t uncompacted = subreg_table[brw_compact_inst_subreg_index(src)];
851
852 brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10));
853 brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f);
854 brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f);
855 }
856
857 static void
858 set_uncompacted_src0(struct brw_context *brw, brw_inst *dst,
859 brw_compact_inst *src)
860 {
861 uint32_t compacted = brw_compact_inst_src0_index(src);
862 uint16_t uncompacted = src_index_table[compacted];
863
864 brw_inst_set_bits(dst, 88, 77, uncompacted);
865 }
866
867 static void
868 set_uncompacted_src1(struct brw_context *brw, brw_inst *dst,
869 brw_compact_inst *src, bool is_immediate)
870 {
871 if (is_immediate) {
872 signed high5 = brw_compact_inst_src1_index(src);
873 /* Replicate top bit of src1_index into high 20 bits of the immediate. */
874 brw_inst_set_imm_ud(brw, dst, (high5 << 27) >> 19);
875 } else {
876 uint16_t uncompacted = src_index_table[brw_compact_inst_src1_index(src)];
877
878 brw_inst_set_bits(dst, 120, 109, uncompacted);
879 }
880 }
881
882 static void
883 set_uncompacted_3src_control_index(struct brw_context *brw, brw_inst *dst,
884 brw_compact_inst *src)
885 {
886 assert(brw->gen >= 8);
887
888 uint32_t compacted = brw_compact_inst_3src_control_index(src);
889 uint32_t uncompacted = gen8_3src_control_index_table[compacted];
890
891 brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7);
892 brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff);
893
894 if (brw->is_cherryview)
895 brw_inst_set_bits(dst, 36, 35, (uncompacted >> 24) & 0x3);
896 }
897
898 static void
899 set_uncompacted_3src_source_index(struct brw_context *brw, brw_inst *dst,
900 brw_compact_inst *src)
901 {
902 assert(brw->gen >= 8);
903
904 uint32_t compacted = brw_compact_inst_3src_source_index(src);
905 uint64_t uncompacted = gen8_3src_source_index_table[compacted];
906
907 brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1);
908 brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff);
909 brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff);
910 brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff);
911 brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff);
912
913 if (brw->is_cherryview) {
914 brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3);
915 brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3);
916 brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1);
917 } else {
918 brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1);
919 brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1);
920 }
921 }
922
923 static void
924 brw_uncompact_3src_instruction(struct brw_context *brw, brw_inst *dst,
925 brw_compact_inst *src)
926 {
927 assert(brw->gen >= 8);
928
929 #define uncompact(field) \
930 brw_inst_set_3src_##field(brw, dst, brw_compact_inst_3src_##field(src))
931
932 uncompact(opcode);
933
934 set_uncompacted_3src_control_index(brw, dst, src);
935 set_uncompacted_3src_source_index(brw, dst, src);
936
937 uncompact(dst_reg_nr);
938 uncompact(src0_rep_ctrl);
939 brw_inst_set_3src_cmpt_control(brw, dst, false);
940 uncompact(debug_control);
941 uncompact(saturate);
942 uncompact(src1_rep_ctrl);
943 uncompact(src2_rep_ctrl);
944 uncompact(src0_reg_nr);
945 uncompact(src1_reg_nr);
946 uncompact(src2_reg_nr);
947 uncompact(src0_subreg_nr);
948 uncompact(src1_subreg_nr);
949 uncompact(src2_subreg_nr);
950
951 #undef uncompact
952 }
953
954 void
955 brw_uncompact_instruction(struct brw_context *brw, brw_inst *dst,
956 brw_compact_inst *src)
957 {
958 memset(dst, 0, sizeof(*dst));
959
960 if (brw->gen >= 8 && is_3src(brw_compact_inst_3src_opcode(src))) {
961 brw_uncompact_3src_instruction(brw, dst, src);
962 return;
963 }
964
965 brw_inst_set_opcode(brw, dst, brw_compact_inst_opcode(src));
966 brw_inst_set_debug_control(brw, dst, brw_compact_inst_debug_control(src));
967
968 set_uncompacted_control(brw, dst, src);
969 set_uncompacted_datatype(brw, dst, src);
970
971 /* src0/1 register file fields are in the datatype table. */
972 bool is_immediate = brw_inst_src0_reg_file(brw, dst) == BRW_IMMEDIATE_VALUE ||
973 brw_inst_src1_reg_file(brw, dst) == BRW_IMMEDIATE_VALUE;
974
975 set_uncompacted_subreg(brw, dst, src);
976 brw_inst_set_acc_wr_control(brw, dst, brw_compact_inst_acc_wr_control(src));
977 brw_inst_set_cond_modifier(brw, dst, brw_compact_inst_cond_modifier(src));
978 if (brw->gen <= 6)
979 brw_inst_set_flag_subreg_nr(brw, dst,
980 brw_compact_inst_flag_subreg_nr(src));
981 set_uncompacted_src0(brw, dst, src);
982 set_uncompacted_src1(brw, dst, src, is_immediate);
983 brw_inst_set_dst_da_reg_nr(brw, dst, brw_compact_inst_dst_reg_nr(src));
984 brw_inst_set_src0_da_reg_nr(brw, dst, brw_compact_inst_src0_reg_nr(src));
985 if (is_immediate) {
986 brw_inst_set_imm_ud(brw, dst,
987 brw_inst_imm_ud(brw, dst) |
988 brw_compact_inst_src1_reg_nr(src));
989 } else {
990 brw_inst_set_src1_da_reg_nr(brw, dst, brw_compact_inst_src1_reg_nr(src));
991 }
992 }
993
994 void brw_debug_compact_uncompact(struct brw_context *brw,
995 brw_inst *orig,
996 brw_inst *uncompacted)
997 {
998 fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
999 brw->gen);
1000
1001 fprintf(stderr, " before: ");
1002 brw_disassemble_inst(stderr, brw, orig, true);
1003
1004 fprintf(stderr, " after: ");
1005 brw_disassemble_inst(stderr, brw, uncompacted, false);
1006
1007 uint32_t *before_bits = (uint32_t *)orig;
1008 uint32_t *after_bits = (uint32_t *)uncompacted;
1009 fprintf(stderr, " changed bits:\n");
1010 for (int i = 0; i < 128; i++) {
1011 uint32_t before = before_bits[i / 32] & (1 << (i & 31));
1012 uint32_t after = after_bits[i / 32] & (1 << (i & 31));
1013
1014 if (before != after) {
1015 fprintf(stderr, " bit %d, %s to %s\n", i,
1016 before ? "set" : "unset",
1017 after ? "set" : "unset");
1018 }
1019 }
1020 }
1021
/**
 * Returns compacted_counts[old_target_ip] - compacted_counts[old_ip], i.e.
 * the difference between the compaction counts recorded for the target
 * instruction and for the current one.  The result is negative when the
 * target's count is the smaller of the two.
 */
static int
compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
{
   const int at_source = compacted_counts[old_ip];
   const int at_target = compacted_counts[old_target_ip];

   return at_target - at_source;
}
1029
1030 static void
1031 update_uip_jip(struct brw_context *brw, brw_inst *insn,
1032 int this_old_ip, int *compacted_counts)
1033 {
1034 int scale = brw->gen >= 8 ? sizeof(brw_compact_inst) : 1;
1035
1036 int32_t jip = brw_inst_jip(brw, insn) / scale;
1037 jip -= compacted_between(this_old_ip, this_old_ip + jip, compacted_counts);
1038 brw_inst_set_jip(brw, insn, jip * scale);
1039
1040 if (brw_inst_opcode(brw, insn) == BRW_OPCODE_ENDIF ||
1041 brw_inst_opcode(brw, insn) == BRW_OPCODE_WHILE ||
1042 (brw_inst_opcode(brw, insn) == BRW_OPCODE_ELSE && brw->gen <= 7))
1043 return;
1044
1045 int32_t uip = brw_inst_uip(brw, insn) / scale;
1046 uip -= compacted_between(this_old_ip, this_old_ip + uip, compacted_counts);
1047 brw_inst_set_uip(brw, insn, uip * scale);
1048 }
1049
1050 void
1051 brw_init_compaction_tables(struct brw_context *brw)
1052 {
1053 assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
1054 assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
1055 assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
1056 assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
1057 assert(gen7_control_index_table[ARRAY_SIZE(gen7_control_index_table) - 1] != 0);
1058 assert(gen7_datatype_table[ARRAY_SIZE(gen7_datatype_table) - 1] != 0);
1059 assert(gen7_subreg_table[ARRAY_SIZE(gen7_subreg_table) - 1] != 0);
1060 assert(gen7_src_index_table[ARRAY_SIZE(gen7_src_index_table) - 1] != 0);
1061 assert(gen8_control_index_table[ARRAY_SIZE(gen8_control_index_table) - 1] != 0);
1062 assert(gen8_datatype_table[ARRAY_SIZE(gen8_datatype_table) - 1] != 0);
1063 assert(gen8_subreg_table[ARRAY_SIZE(gen8_subreg_table) - 1] != 0);
1064 assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0);
1065
1066 switch (brw->gen) {
1067 case 8:
1068 control_index_table = gen8_control_index_table;
1069 datatype_table = gen8_datatype_table;
1070 subreg_table = gen8_subreg_table;
1071 src_index_table = gen8_src_index_table;
1072 break;
1073 case 7:
1074 control_index_table = gen7_control_index_table;
1075 datatype_table = gen7_datatype_table;
1076 subreg_table = gen7_subreg_table;
1077 src_index_table = gen7_src_index_table;
1078 break;
1079 case 6:
1080 control_index_table = gen6_control_index_table;
1081 datatype_table = gen6_datatype_table;
1082 subreg_table = gen6_subreg_table;
1083 src_index_table = gen6_src_index_table;
1084 break;
1085 default:
1086 return;
1087 }
1088 }
1089
1090 void
1091 brw_compact_instructions(struct brw_compile *p, int start_offset,
1092 int num_annotations, struct annotation *annotation)
1093 {
1094 struct brw_context *brw = p->brw;
1095 void *store = p->store + start_offset / 16;
1096 /* For an instruction at byte offset 8*i before compaction, this is the number
1097 * of compacted instructions that preceded it.
1098 */
1099 int compacted_counts[(p->next_insn_offset - start_offset) / 8];
1100 /* For an instruction at byte offset 8*i after compaction, this is the
1101 * 8-byte offset it was at before compaction.
1102 */
1103 int old_ip[(p->next_insn_offset - start_offset) / 8];
1104
1105 if (brw->gen < 6)
1106 return;
1107
1108 int src_offset;
1109 int offset = 0;
1110 int compacted_count = 0;
1111 for (src_offset = 0; src_offset < p->next_insn_offset - start_offset;) {
1112 brw_inst *src = store + src_offset;
1113 void *dst = store + offset;
1114
1115 old_ip[offset / 8] = src_offset / 8;
1116 compacted_counts[src_offset / 8] = compacted_count;
1117
1118 brw_inst saved = *src;
1119
1120 if (!brw_inst_cmpt_control(brw, src) &&
1121 brw_try_compact_instruction(brw, dst, src)) {
1122 compacted_count++;
1123
1124 if (INTEL_DEBUG) {
1125 brw_inst uncompacted;
1126 brw_uncompact_instruction(brw, &uncompacted, dst);
1127 if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
1128 brw_debug_compact_uncompact(brw, &saved, &uncompacted);
1129 }
1130 }
1131
1132 offset += 8;
1133 src_offset += 16;
1134 } else {
1135 int size = brw_inst_cmpt_control(brw, src) ? 8 : 16;
1136
1137 /* It appears that the end of thread SEND instruction needs to be
1138 * aligned, or the GPU hangs.
1139 */
1140 if ((brw_inst_opcode(brw, src) == BRW_OPCODE_SEND ||
1141 brw_inst_opcode(brw, src) == BRW_OPCODE_SENDC) &&
1142 brw_inst_eot(brw, src) &&
1143 (offset & 8) != 0) {
1144 brw_compact_inst *align = store + offset;
1145 memset(align, 0, sizeof(*align));
1146 brw_compact_inst_set_opcode(align, BRW_OPCODE_NOP);
1147 brw_compact_inst_set_cmpt_control(align, true);
1148 offset += 8;
1149 old_ip[offset / 8] = src_offset / 8;
1150 dst = store + offset;
1151 }
1152
1153 /* If we didn't compact this intruction, we need to move it down into
1154 * place.
1155 */
1156 if (offset != src_offset) {
1157 memmove(dst, src, size);
1158 }
1159 offset += size;
1160 src_offset += size;
1161 }
1162 }
1163
1164 /* Fix up control flow offsets. */
1165 p->next_insn_offset = start_offset + offset;
1166 for (offset = 0; offset < p->next_insn_offset - start_offset;) {
1167 brw_inst *insn = store + offset;
1168 int this_old_ip = old_ip[offset / 8];
1169 int this_compacted_count = compacted_counts[this_old_ip];
1170 int target_old_ip, target_compacted_count;
1171
1172 switch (brw_inst_opcode(brw, insn)) {
1173 case BRW_OPCODE_BREAK:
1174 case BRW_OPCODE_CONTINUE:
1175 case BRW_OPCODE_HALT:
1176 update_uip_jip(brw, insn, this_old_ip, compacted_counts);
1177 break;
1178
1179 case BRW_OPCODE_IF:
1180 case BRW_OPCODE_ELSE:
1181 case BRW_OPCODE_ENDIF:
1182 case BRW_OPCODE_WHILE:
1183 if (brw->gen >= 7) {
1184 update_uip_jip(brw, insn, this_old_ip, compacted_counts);
1185 } else if (brw->gen == 6) {
1186 int gen6_jump_count = brw_inst_gen6_jump_count(brw, insn);
1187 target_old_ip = this_old_ip + gen6_jump_count;
1188 target_compacted_count = compacted_counts[target_old_ip];
1189 gen6_jump_count -= (target_compacted_count - this_compacted_count);
1190 brw_inst_set_gen6_jump_count(brw, insn, gen6_jump_count);
1191 }
1192 break;
1193 }
1194
1195 offset = next_offset(brw, store, offset);
1196 }
1197
1198 /* p->nr_insn is counting the number of uncompacted instructions still, so
1199 * divide. We do want to be sure there's a valid instruction in any
1200 * alignment padding, so that the next compression pass (for the FS 8/16
1201 * compile passes) parses correctly.
1202 */
1203 if (p->next_insn_offset & 8) {
1204 brw_compact_inst *align = store + offset;
1205 memset(align, 0, sizeof(*align));
1206 brw_compact_inst_set_opcode(align, BRW_OPCODE_NOP);
1207 brw_compact_inst_set_cmpt_control(align, true);
1208 p->next_insn_offset += 8;
1209 }
1210 p->nr_insn = p->next_insn_offset / 16;
1211
1212 /* Update the instruction offsets for each annotation. */
1213 if (annotation) {
1214 for (int offset = 0, i = 0; i < num_annotations; i++) {
1215 while (start_offset + old_ip[offset / 8] * 8 != annotation[i].offset) {
1216 assert(start_offset + old_ip[offset / 8] * 8 <
1217 annotation[i].offset);
1218 offset = next_offset(brw, store, offset);
1219 }
1220
1221 annotation[i].offset = start_offset + offset;
1222
1223 offset = next_offset(brw, store, offset);
1224 }
1225
1226 annotation[num_annotations].offset = p->next_insn_offset;
1227 }
1228 }