2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_eu_compact.c
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
40 #include "brw_context.h"
42 #include "intel_asm_annotation.h"
44 static const uint32_t g45_control_index_table
[32] = {
79 static const uint32_t g45_datatype_table
[32] = {
100 0b101000001000110001,
101 0b001011011000101101,
102 0b001000000100001001,
103 0b001011011000101100,
104 0b110100011000110001,
105 0b001000001110111101,
106 0b110000001000110001,
107 0b011000000100101010,
108 0b101000001000101001,
109 0b001011010110001100,
110 0b001000000110100001,
114 static const uint16_t g45_subreg_table
[32] = {
149 static const uint16_t g45_src_index_table
[32] = {
184 static const uint32_t gen6_control_index_table
[32] = {
219 static const uint32_t gen6_datatype_table
[32] = {
220 0b001001110000000000,
221 0b001000110000100000,
222 0b001001110000000001,
223 0b001000000001100000,
224 0b001010110100101001,
225 0b001000000110101101,
226 0b001100011000101100,
227 0b001011110110101101,
228 0b001000000111101100,
229 0b001000000001100001,
230 0b001000110010100101,
231 0b001000000001000001,
232 0b001000001000110001,
233 0b001000001000101001,
234 0b001000000000100000,
235 0b001000001000110010,
236 0b001010010100101001,
237 0b001011010010100101,
238 0b001000000110100101,
239 0b001100011000101001,
240 0b001011011000101100,
241 0b001011010110100101,
242 0b001011110110100101,
243 0b001111011110111101,
244 0b001111011110111100,
245 0b001111011110111101,
246 0b001111011110011101,
247 0b001111011110111110,
248 0b001000000000100001,
249 0b001000000000100010,
250 0b001001111111011101,
251 0b001000001110111110,
254 static const uint16_t gen6_subreg_table
[32] = {
289 static const uint16_t gen6_src_index_table
[32] = {
324 static const uint32_t gen7_control_index_table
[32] = {
325 0b0000000000000000010,
326 0b0000100000000000000,
327 0b0000100000000000001,
328 0b0000100000000000010,
329 0b0000100000000000011,
330 0b0000100000000000100,
331 0b0000100000000000101,
332 0b0000100000000000111,
333 0b0000100000000001000,
334 0b0000100000000001001,
335 0b0000100000000001101,
336 0b0000110000000000000,
337 0b0000110000000000001,
338 0b0000110000000000010,
339 0b0000110000000000011,
340 0b0000110000000000100,
341 0b0000110000000000101,
342 0b0000110000000000111,
343 0b0000110000000001001,
344 0b0000110000000001101,
345 0b0000110000000010000,
346 0b0000110000100000000,
347 0b0001000000000000000,
348 0b0001000000000000010,
349 0b0001000000000000100,
350 0b0001000000100000000,
351 0b0010110000000000000,
352 0b0010110000000010000,
353 0b0011000000000000000,
354 0b0011000000100000000,
355 0b0101000000000000000,
356 0b0101000000100000000
359 static const uint32_t gen7_datatype_table
[32] = {
360 0b001000000000000001,
361 0b001000000000100000,
362 0b001000000000100001,
363 0b001000000001100001,
364 0b001000000010111101,
365 0b001000001011111101,
366 0b001000001110100001,
367 0b001000001110100101,
368 0b001000001110111101,
369 0b001000010000100001,
370 0b001000110000100000,
371 0b001000110000100001,
372 0b001001010010100101,
373 0b001001110010100100,
374 0b001001110010100101,
375 0b001111001110111101,
376 0b001111011110011101,
377 0b001111011110111100,
378 0b001111011110111101,
379 0b001111111110111100,
380 0b000000001000001100,
381 0b001000000000111101,
382 0b001000000010100101,
383 0b001000010000100000,
384 0b001001010010100100,
385 0b001001110010000100,
386 0b001010010100001001,
387 0b001101111110111101,
388 0b001111111110111101,
389 0b001011110110101100,
390 0b001010010100101000,
394 static const uint16_t gen7_subreg_table
[32] = {
429 static const uint16_t gen7_src_index_table
[32] = {
464 static const uint32_t gen8_control_index_table
[32] = {
465 0b0000000000000000010,
466 0b0000100000000000000,
467 0b0000100000000000001,
468 0b0000100000000000010,
469 0b0000100000000000011,
470 0b0000100000000000100,
471 0b0000100000000000101,
472 0b0000100000000000111,
473 0b0000100000000001000,
474 0b0000100000000001001,
475 0b0000100000000001101,
476 0b0000110000000000000,
477 0b0000110000000000001,
478 0b0000110000000000010,
479 0b0000110000000000011,
480 0b0000110000000000100,
481 0b0000110000000000101,
482 0b0000110000000000111,
483 0b0000110000000001001,
484 0b0000110000000001101,
485 0b0000110000000010000,
486 0b0000110000100000000,
487 0b0001000000000000000,
488 0b0001000000000000010,
489 0b0001000000000000100,
490 0b0001000000100000000,
491 0b0010110000000000000,
492 0b0010110000000010000,
493 0b0011000000000000000,
494 0b0011000000100000000,
495 0b0101000000000000000,
496 0b0101000000100000000
499 static const uint32_t gen8_datatype_table
[32] = {
500 0b001000000000000000001,
501 0b001000000000001000000,
502 0b001000000000001000001,
503 0b001000000000011000001,
504 0b001000000000101011101,
505 0b001000000010111011101,
506 0b001000000011101000001,
507 0b001000000011101000101,
508 0b001000000011101011101,
509 0b001000001000001000001,
510 0b001000011000001000000,
511 0b001000011000001000001,
512 0b001000101000101000101,
513 0b001000111000101000100,
514 0b001000111000101000101,
515 0b001011100011101011101,
516 0b001011101011100011101,
517 0b001011101011101011100,
518 0b001011101011101011101,
519 0b001011111011101011100,
520 0b000000000010000001100,
521 0b001000000000001011101,
522 0b001000000000101000101,
523 0b001000001000001000000,
524 0b001000101000101000100,
525 0b001000111000100000100,
526 0b001001001001000001001,
527 0b001010111011101011101,
528 0b001011111011101011101,
529 0b001001111001101001100,
530 0b001001001001001001000,
531 0b001001011001001001000
534 static const uint16_t gen8_subreg_table
[32] = {
569 static const uint16_t gen8_src_index_table
[32] = {
604 /* This is actually the control index table for Cherryview (26 bits), but the
605 * only difference from Broadwell (24 bits) is that it has two extra 0-bits at
608 * The low 24 bits have the same mappings on both hardware.
610 static const uint32_t gen8_3src_control_index_table
[4] = {
611 0b00100000000110000000000001,
612 0b00000000000110000000000001,
613 0b00000000001000000000000001,
614 0b00000000001000000000100001
617 /* This is actually the control index table for Cherryview (49 bits), but the
618 * only difference from Broadwell (46 bits) is that it has three extra 0-bits
621 * The low 44 bits have the same mappings on both hardware, and since the high
622 * three bits on Broadwell are zero, we can reuse Cherryview's table.
624 static const uint64_t gen8_3src_source_index_table
[4] = {
625 0b0000001110010011100100111001000001111000000000000,
626 0b0000001110010011100100111001000001111000000000010,
627 0b0000001110010011100100111001000001111000000001000,
628 0b0000001110010011100100111001000001111000000100000
/**
 * Lookup tables currently in use by the index helpers in this file.
 *
 * brw_init_compaction_tables() points each of these at the matching g45_*,
 * gen6_*, gen7_* or gen8_* table.  They have no initializer here, so they
 * are null pointers until that function has run.
 */
631 static const uint32_t *control_index_table
;
632 static const uint32_t *datatype_table
;
633 static const uint16_t *subreg_table
;
634 static const uint16_t *src_index_table
;
/**
 * Looks up the control index for src and stores it in dst.
 *
 * The control bits of src are packed into a single value (bits 33:31, 23:12,
 * 10:9, 34 and 8 when brw->gen >= 8, otherwise bit 31 and bits 23:8), bits
 * 90:89 may be OR-ed in at bit 17, and the first 32 entries of
 * control_index_table are searched for an exact match.  On a match the
 * position of the entry is written with brw_compact_inst_set_control_index().
 *
 * NOTE(review): the guard on the bits 90:89 step and the return statements
 * are not visible in this excerpt.  The caller tests the result with `!`, so
 * presumably the function reports whether a match was found -- confirm.
 */
637 set_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
639 uint32_t uncompacted
= brw
->gen
>= 8 /* 17b/G45; 19b/IVB+ */
640 ? (brw_inst_bits(src
, 33, 31) << 16) | /* 3b */
641 (brw_inst_bits(src
, 23, 12) << 4) | /* 12b */
642 (brw_inst_bits(src
, 10, 9) << 2) | /* 2b */
643 (brw_inst_bits(src
, 34, 34) << 1) | /* 1b */
644 (brw_inst_bits(src
, 8, 8)) /* 1b */
645 : (brw_inst_bits(src
, 31, 31) << 16) | /* 1b */
646 (brw_inst_bits(src
, 23, 8)); /* 16b */
648 /* On gen7, the flag register and subregister numbers are integrated into
652 uncompacted
|= brw_inst_bits(src
, 90, 89) << 17; /* 2b */
654 for (int i
= 0; i
< 32; i
++) {
655 if (control_index_table
[i
] == uncompacted
) {
656 brw_compact_inst_set_control_index(dst
, i
);
/**
 * Looks up the datatype index for src and stores it in dst.
 *
 * Packs bits 63:61, 94:89 and 46:35 of src when brw->gen >= 8, or bits 63:61
 * and 46:32 otherwise, into one value and searches the first 32 entries of
 * datatype_table for an exact match.  On a match the position of the entry is
 * written with brw_compact_inst_set_datatype_index().
 *
 * NOTE(review): the rest of the parameter list and the return statements are
 * not visible in this excerpt.  The caller passes (brw, &temp, src) and tests
 * the result with `!`, so presumably this reports whether a match was found.
 */
665 set_datatype_index(struct brw_context
*brw
, brw_compact_inst
*dst
,
668 uint32_t uncompacted
= brw
->gen
>= 8 /* 18b/G45+; 21b/BDW+ */
669 ? (brw_inst_bits(src
, 63, 61) << 18) | /* 3b */
670 (brw_inst_bits(src
, 94, 89) << 12) | /* 6b */
671 (brw_inst_bits(src
, 46, 35)) /* 12b */
672 : (brw_inst_bits(src
, 63, 61) << 15) | /* 3b */
673 (brw_inst_bits(src
, 46, 32)); /* 15b */
675 for (int i
= 0; i
< 32; i
++) {
676 if (datatype_table
[i
] == uncompacted
) {
677 brw_compact_inst_set_datatype_index(dst
, i
);
/**
 * Looks up the subregister index for src and stores it in dst.
 *
 * Bits 52:48 and 68:64 of src form the low ten bits of the search key; bits
 * 100:96 are OR-ed in at bit 10 by a statement whose guard is not visible in
 * this excerpt.  The first 32 entries of subreg_table are searched for the
 * key and a matching position is written with
 * brw_compact_inst_set_subreg_index().
 *
 * NOTE(review): the fourth parameter is cut off here.  The caller passes
 * is_immediate in that position, so the hidden guard presumably depends on
 * it.  Return statements are likewise not visible -- confirm against the
 * full file.
 */
686 set_subreg_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
689 uint16_t uncompacted
= /* 15b */
690 (brw_inst_bits(src
, 52, 48) << 0) | /* 5b */
691 (brw_inst_bits(src
, 68, 64) << 5); /* 5b */
694 uncompacted
|= brw_inst_bits(src
, 100, 96) << 10; /* 5b */
696 for (int i
= 0; i
< 32; i
++) {
697 if (subreg_table
[i
] == uncompacted
) {
698 brw_compact_inst_set_subreg_index(dst
, i
);
/**
 * Searches the first 32 entries of src_index_table for uncompacted.
 *
 * NOTE(review): the second parameter, the body of the match branch and the
 * return statements are not visible in this excerpt.  Callers pass &compacted
 * as the second argument and test the result with `!`, so presumably the
 * matching position is stored through that pointer and the result reports
 * whether a match was found -- confirm.
 */
707 get_src_index(uint16_t uncompacted
,
710 for (int i
= 0; i
< 32; i
++) {
711 if (src_index_table
[i
] == uncompacted
) {
/**
 * Looks up bits 88:77 of src through get_src_index() and stores the
 * resulting index in dst with brw_compact_inst_set_src0_index().
 *
 * NOTE(review): the declaration of compacted and the statement taken when
 * get_src_index() fails are not visible in this excerpt.
 */
721 set_src0_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
724 uint16_t uncompacted
= brw_inst_bits(src
, 88, 77); /* 12b */
726 if (!get_src_index(uncompacted
, &compacted
))
729 brw_compact_inst_set_src0_index(dst
, compacted
);
/**
 * Stores the src1 index of the compacted instruction dst.
 *
 * Two ways of computing the index are visible: bits 12:8 of the immediate
 * returned by brw_inst_imm_ud(), or a get_src_index() lookup of bits 120:109
 * of src.  The result is written with brw_compact_inst_set_src1_index().
 *
 * NOTE(review): the fourth parameter and the condition choosing between the
 * two computations are not visible in this excerpt.  The caller passes
 * is_immediate as the fourth argument, so presumably it selects the
 * immediate form -- confirm.
 */
735 set_src1_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
741 compacted
= (brw_inst_imm_ud(brw
, src
) >> 8) & 0x1f;
743 uint16_t uncompacted
= brw_inst_bits(src
, 120, 109); /* 12b */
745 if (!get_src_index(uncompacted
, &compacted
))
749 brw_compact_inst_set_src1_index(dst
, compacted
);
/**
 * Looks up the three-source control index for src and stores it in dst.
 *
 * Asserts brw->gen >= 8.  The key is built from bits 34:32 and 28:8 of src,
 * plus bits 36:35 when brw->is_cherryview is set, and is compared against
 * every entry of gen8_3src_control_index_table.  On a match the position of
 * the entry is written with brw_compact_inst_set_3src_control_index().
 *
 * NOTE(review): return statements are not visible in this excerpt; the caller
 * tests the result with `!`.
 */
755 set_3src_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
757 assert(brw
->gen
>= 8);
759 uint32_t uncompacted
= /* 24b/BDW; 26b/CHV */
760 (brw_inst_bits(src
, 34, 32) << 21) | /* 3b */
761 (brw_inst_bits(src
, 28, 8)); /* 21b */
763 if (brw
->is_cherryview
)
764 uncompacted
|= brw_inst_bits(src
, 36, 35) << 24; /* 2b */
766 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_control_index_table
); i
++) {
767 if (gen8_3src_control_index_table
[i
] == uncompacted
) {
768 brw_compact_inst_set_3src_control_index(dst
, i
);
/**
 * Looks up the three-source source index for src and stores it in dst.
 *
 * Asserts brw->gen >= 8.  The key starts with bit 83 and bits 114:107, 93:86,
 * 72:65 and 55:37 of src.  Two further groups of high bits are visible: bits
 * 126:125, 105:104 and 84 (shifted to 47, 45 and 44) inside the
 * brw->is_cherryview branch, and bits 125 and 104 (shifted to 45 and 44)
 * after it.  The key is compared against every entry of
 * gen8_3src_source_index_table and a matching position is written with
 * brw_compact_inst_set_3src_source_index().
 *
 * NOTE(review): the lines that fold those high-bit groups into the key, the
 * else keyword and the return statements are not visible in this excerpt.
 */
777 set_3src_source_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
779 assert(brw
->gen
>= 8);
781 uint64_t uncompacted
= /* 46b/BDW; 49b/CHV */
782 (brw_inst_bits(src
, 83, 83) << 43) | /* 1b */
783 (brw_inst_bits(src
, 114, 107) << 35) | /* 8b */
784 (brw_inst_bits(src
, 93, 86) << 27) | /* 8b */
785 (brw_inst_bits(src
, 72, 65) << 19) | /* 8b */
786 (brw_inst_bits(src
, 55, 37)); /* 19b */
788 if (brw
->is_cherryview
) {
790 (brw_inst_bits(src
, 126, 125) << 47) | /* 2b */
791 (brw_inst_bits(src
, 105, 104) << 45) | /* 2b */
792 (brw_inst_bits(src
, 84, 84) << 44); /* 1b */
795 (brw_inst_bits(src
, 125, 125) << 45) | /* 1b */
796 (brw_inst_bits(src
, 104, 104) << 44); /* 1b */
799 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_source_index_table
); i
++) {
800 if (gen8_3src_source_index_table
[i
] == uncompacted
) {
801 brw_compact_inst_set_3src_source_index(dst
, i
);
/**
 * Tries to build the compacted form of the three-source instruction src in
 * dst.
 *
 * Asserts brw->gen >= 8, looks up the control and source indices with
 * set_3src_control_index() and set_3src_source_index(), then copies the
 * remaining fields one by one through the local compact() macro and sets the
 * compaction-control field of dst to true.
 *
 * NOTE(review): the last parameter and the statements taken when an index
 * lookup fails are not visible in this excerpt, and some field copies may be
 * missing as well (brw_uncompact_3src_instruction() also handles dst_reg_nr
 * and saturate).  The caller uses the result as a condition.
 */
810 brw_try_compact_3src_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
813 assert(brw
->gen
>= 8);
815 #define compact(field) \
816 brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(brw, src))
820 if (!set_3src_control_index(brw
, dst
, src
))
823 if (!set_3src_source_index(brw
, dst
, src
))
827 compact(src0_rep_ctrl
);
828 brw_compact_inst_set_3src_cmpt_control(dst
, true);
829 compact(debug_control
);
831 compact(src1_rep_ctrl
);
832 compact(src2_rep_ctrl
);
833 compact(src0_reg_nr
);
834 compact(src1_reg_nr
);
835 compact(src2_reg_nr
);
836 compact(src0_subreg_nr
);
837 compact(src1_subreg_nr
);
838 compact(src2_subreg_nr
);
/**
 * Reports whether the immediate imm can be carried by a compacted
 * instruction.
 *
 * The visible return statement accepts imm == 0 or imm == 0xfffff000.
 *
 * NOTE(review): the statement that belongs to the low-12-bits comment is not
 * visible in this excerpt; presumably it clears the low 12 bits of imm before
 * the comparison -- confirm against the full file.
 */
845 /* Compacted instructions have 12-bits for immediate sources, and a 13th bit
846 * that's replicated through the high 20 bits.
848 * Effectively this means we get 12-bit integers, 0.0f, and some limited uses
849 * of packed vectors as compactable immediates.
852 is_compactable_immediate(unsigned imm
)
854 /* We get the low 12 bits as-is. */
857 /* We get one bit replicated through the top 20 bits. */
858 return imm
== 0 || imm
== 0xfffff000;
861 /* Returns whether an opcode takes three sources. */
865 return opcode_descs
[op
].nsrc
== 3;
869 * Tries to compact instruction src into dst.
871 * It doesn't modify dst unless src is compactable, which is relied on by
872 * brw_compact_instructions().
/**
 * Visible steps of the compaction attempt:
 *  - assert that src is not already marked as compacted;
 *  - single out IF, IFF, ELSE, ENDIF, HALT, DO and WHILE (see the FINISHME
 *    comment below);
 *  - hand three-source opcodes to brw_try_compact_3src_instruction(), working
 *    on a zeroed local temp;
 *  - compute is_immediate from the src0 and src1 register-file fields and
 *    check the immediate with is_compactable_immediate() (brw->gen < 6 is
 *    part of the same test);
 *  - otherwise fill a zeroed temp field by field, using the set_*_index()
 *    helpers for the table-encoded fields;
 *  - src1_reg_nr receives either the low 8 bits of the immediate or the src1
 *    register number.
 *
 * NOTE(review): the last parameter, the statements taken on each failure,
 * the guard choosing the src1_reg_nr form and the final copy of temp into
 * dst are not visible in this excerpt -- confirm against the full file.
 */
875 brw_try_compact_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
878 brw_compact_inst temp
;
880 assert(brw_inst_cmpt_control(brw
, src
) == 0);
882 if (brw_inst_opcode(brw
, src
) == BRW_OPCODE_IF
||
883 brw_inst_opcode(brw
, src
) == BRW_OPCODE_IFF
||
884 brw_inst_opcode(brw
, src
) == BRW_OPCODE_ELSE
||
885 brw_inst_opcode(brw
, src
) == BRW_OPCODE_ENDIF
||
886 brw_inst_opcode(brw
, src
) == BRW_OPCODE_HALT
||
887 brw_inst_opcode(brw
, src
) == BRW_OPCODE_DO
||
888 brw_inst_opcode(brw
, src
) == BRW_OPCODE_WHILE
) {
889 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
890 * to be able to handle compacted flow control instructions..
895 if (is_3src(brw_inst_opcode(brw
, src
))) {
897 memset(&temp
, 0, sizeof(temp
));
898 if (brw_try_compact_3src_instruction(brw
, &temp
, src
)) {
910 brw_inst_src0_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
||
911 brw_inst_src1_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
;
913 (brw
->gen
< 6 || !is_compactable_immediate(brw_inst_imm_ud(brw
, src
)))) {
917 memset(&temp
, 0, sizeof(temp
));
919 brw_compact_inst_set_opcode(&temp
, brw_inst_opcode(brw
, src
));
920 brw_compact_inst_set_debug_control(&temp
, brw_inst_debug_control(brw
, src
));
921 if (!set_control_index(brw
, &temp
, src
))
923 if (!set_datatype_index(brw
, &temp
, src
))
925 if (!set_subreg_index(brw
, &temp
, src
, is_immediate
))
927 brw_compact_inst_set_acc_wr_control(&temp
,
928 brw_inst_acc_wr_control(brw
, src
));
929 brw_compact_inst_set_cond_modifier(&temp
, brw_inst_cond_modifier(brw
, src
));
931 brw_compact_inst_set_flag_subreg_nr(&temp
,
932 brw_inst_flag_subreg_nr(brw
, src
));
933 brw_compact_inst_set_cmpt_control(&temp
, true);
934 if (!set_src0_index(brw
, &temp
, src
))
936 if (!set_src1_index(brw
, &temp
, src
, is_immediate
))
938 brw_compact_inst_set_dst_reg_nr(&temp
, brw_inst_dst_da_reg_nr(brw
, src
));
939 brw_compact_inst_set_src0_reg_nr(&temp
, brw_inst_src0_da_reg_nr(brw
, src
));
941 brw_compact_inst_set_src1_reg_nr(&temp
, brw_inst_imm_ud(brw
, src
) & 0xff);
943 brw_compact_inst_set_src1_reg_nr(&temp
,
944 brw_inst_src1_da_reg_nr(brw
, src
));
/**
 * Expands the control index of src back into the control bits of dst.
 *
 * Reads control_index_table at the index stored in src and scatters the
 * value into dst with brw_inst_set_bits().  Two layouts are visible: bits
 * 33:31, 23:12, 10:9, 34 and 8, or bit 31 and bits 23:8.  A final statement
 * writes the value shifted right by 17 into bits 90:89.
 *
 * NOTE(review): the conditions selecting the layout and guarding the bits
 * 90:89 write are not visible in this excerpt.  set_control_index() uses
 * brw->gen >= 8 for the first layout, so presumably the same test applies.
 */
953 set_uncompacted_control(struct brw_context
*brw
, brw_inst
*dst
,
954 brw_compact_inst
*src
)
956 uint32_t uncompacted
=
957 control_index_table
[brw_compact_inst_control_index(src
)];
960 brw_inst_set_bits(dst
, 33, 31, (uncompacted
>> 16));
961 brw_inst_set_bits(dst
, 23, 12, (uncompacted
>> 4) & 0xfff);
962 brw_inst_set_bits(dst
, 10, 9, (uncompacted
>> 2) & 0x3);
963 brw_inst_set_bits(dst
, 34, 34, (uncompacted
>> 1) & 0x1);
964 brw_inst_set_bits(dst
, 8, 8, (uncompacted
>> 0) & 0x1);
966 brw_inst_set_bits(dst
, 31, 31, (uncompacted
>> 16) & 0x1);
967 brw_inst_set_bits(dst
, 23, 8, (uncompacted
& 0xffff));
970 brw_inst_set_bits(dst
, 90, 89, uncompacted
>> 17);
/**
 * Expands the datatype index of src back into the datatype bits of dst.
 *
 * Reads datatype_table at the index stored in src.  Two layouts are visible:
 * bits 63:61, 94:89 and 46:35, or bits 63:61 and 46:32.
 *
 * NOTE(review): the condition selecting the layout is not visible in this
 * excerpt.  set_datatype_index() uses brw->gen >= 8 for the first layout, so
 * presumably the same test applies here.
 */
975 set_uncompacted_datatype(struct brw_context
*brw
, brw_inst
*dst
,
976 brw_compact_inst
*src
)
978 uint32_t uncompacted
= datatype_table
[brw_compact_inst_datatype_index(src
)];
981 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 18));
982 brw_inst_set_bits(dst
, 94, 89, (uncompacted
>> 12) & 0x3f);
983 brw_inst_set_bits(dst
, 46, 35, (uncompacted
>> 0) & 0xfff);
985 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 15));
986 brw_inst_set_bits(dst
, 46, 32, (uncompacted
& 0x7fff));
/**
 * Expands the subregister index of src into dst.
 *
 * Reads subreg_table at the index stored in src, then writes the entry
 * shifted right by 10 into bits 100:96, its bits 9:5 into bits 68:64 and its
 * bits 4:0 into bits 52:48 of dst.
 */
991 set_uncompacted_subreg(struct brw_context
*brw
, brw_inst
*dst
,
992 brw_compact_inst
*src
)
994 uint16_t uncompacted
= subreg_table
[brw_compact_inst_subreg_index(src
)];
996 brw_inst_set_bits(dst
, 100, 96, (uncompacted
>> 10));
997 brw_inst_set_bits(dst
, 68, 64, (uncompacted
>> 5) & 0x1f);
998 brw_inst_set_bits(dst
, 52, 48, (uncompacted
>> 0) & 0x1f);
/**
 * Expands the src0 index of src: reads src_index_table at that index and
 * writes the entry into bits 88:77 of dst.
 */
1002 set_uncompacted_src0(struct brw_context
*brw
, brw_inst
*dst
,
1003 brw_compact_inst
*src
)
1005 uint32_t compacted
= brw_compact_inst_src0_index(src
);
1006 uint16_t uncompacted
= src_index_table
[compacted
];
1008 brw_inst_set_bits(dst
, 88, 77, uncompacted
);
/**
 * Expands the src1 index of src into dst.
 *
 * Two forms are visible.  One reads the index into the signed local high5 and
 * stores (high5 << 27) >> 19 as the immediate of dst; the other reads
 * src_index_table at the index and writes the entry into bits 120:109.
 *
 * NOTE(review): the branch on is_immediate is not visible in this excerpt;
 * presumably it selects the first form.
 * NOTE(review): (high5 << 27) >> 19 is evaluated on a signed int.  With a
 * 32-bit int, whenever bit 4 of high5 is set the left shift moves a 1 into
 * the sign bit and the replication relies on an arithmetic right shift of a
 * negative value.  ISO C leaves the former undefined and the latter
 * implementation-defined.
 */
1012 set_uncompacted_src1(struct brw_context
*brw
, brw_inst
*dst
,
1013 brw_compact_inst
*src
, bool is_immediate
)
1016 signed high5
= brw_compact_inst_src1_index(src
);
1017 /* Replicate top bit of src1_index into high 20 bits of the immediate. */
1018 brw_inst_set_imm_ud(brw
, dst
, (high5
<< 27) >> 19);
1020 uint16_t uncompacted
= src_index_table
[brw_compact_inst_src1_index(src
)];
1022 brw_inst_set_bits(dst
, 120, 109, uncompacted
);
/**
 * Expands the three-source control index of src into dst.
 *
 * Asserts brw->gen >= 8, reads gen8_3src_control_index_table at the index
 * stored in src and writes bits 23:21 and 20:0 of the entry into bits 34:32
 * and 28:8 of dst.  When brw->is_cherryview is set, bits 25:24 of the entry
 * also go into bits 36:35.
 */
1027 set_uncompacted_3src_control_index(struct brw_context
*brw
, brw_inst
*dst
,
1028 brw_compact_inst
*src
)
1030 assert(brw
->gen
>= 8);
1032 uint32_t compacted
= brw_compact_inst_3src_control_index(src
);
1033 uint32_t uncompacted
= gen8_3src_control_index_table
[compacted
];
1035 brw_inst_set_bits(dst
, 34, 32, (uncompacted
>> 21) & 0x7);
1036 brw_inst_set_bits(dst
, 28, 8, (uncompacted
>> 0) & 0x1fffff);
1038 if (brw
->is_cherryview
)
1039 brw_inst_set_bits(dst
, 36, 35, (uncompacted
>> 24) & 0x3);
/**
 * Expands the three-source source index of src into dst.
 *
 * Asserts brw->gen >= 8 and reads gen8_3src_source_index_table at the index
 * stored in src.  Bits 43, 42:35, 34:27, 26:19 and 18:0 of the entry go into
 * bits 83, 114:107, 93:86, 72:65 and 55:37 of dst.  In the
 * brw->is_cherryview branch, entry bits 48:47, 46:45 and 44 go into bits
 * 126:125, 105:104 and 84; the two statements after it write entry bits 45
 * and 44 into bits 125 and 104.
 *
 * NOTE(review): the else keyword between the two groups is not visible in
 * this excerpt; presumably the second group is the non-Cherryview path.
 */
1043 set_uncompacted_3src_source_index(struct brw_context
*brw
, brw_inst
*dst
,
1044 brw_compact_inst
*src
)
1046 assert(brw
->gen
>= 8);
1048 uint32_t compacted
= brw_compact_inst_3src_source_index(src
);
1049 uint64_t uncompacted
= gen8_3src_source_index_table
[compacted
];
1051 brw_inst_set_bits(dst
, 83, 83, (uncompacted
>> 43) & 0x1);
1052 brw_inst_set_bits(dst
, 114, 107, (uncompacted
>> 35) & 0xff);
1053 brw_inst_set_bits(dst
, 93, 86, (uncompacted
>> 27) & 0xff);
1054 brw_inst_set_bits(dst
, 72, 65, (uncompacted
>> 19) & 0xff);
1055 brw_inst_set_bits(dst
, 55, 37, (uncompacted
>> 0) & 0x7ffff);
1057 if (brw
->is_cherryview
) {
1058 brw_inst_set_bits(dst
, 126, 125, (uncompacted
>> 47) & 0x3);
1059 brw_inst_set_bits(dst
, 105, 104, (uncompacted
>> 45) & 0x3);
1060 brw_inst_set_bits(dst
, 84, 84, (uncompacted
>> 44) & 0x1);
1062 brw_inst_set_bits(dst
, 125, 125, (uncompacted
>> 45) & 0x1);
1063 brw_inst_set_bits(dst
, 104, 104, (uncompacted
>> 44) & 0x1);
/**
 * Expands the compacted three-source instruction src into dst.
 *
 * Asserts brw->gen >= 8, expands the control and source indices with
 * set_uncompacted_3src_control_index() and
 * set_uncompacted_3src_source_index(), copies the remaining fields through
 * the local uncompact() macro and sets the compaction-control field of dst
 * to false.
 */
1068 brw_uncompact_3src_instruction(struct brw_context
*brw
, brw_inst
*dst
,
1069 brw_compact_inst
*src
)
1071 assert(brw
->gen
>= 8);
1073 #define uncompact(field) \
1074 brw_inst_set_3src_##field(brw, dst, brw_compact_inst_3src_##field(src))
1078 set_uncompacted_3src_control_index(brw
, dst
, src
);
1079 set_uncompacted_3src_source_index(brw
, dst
, src
);
1081 uncompact(dst_reg_nr
);
1082 uncompact(src0_rep_ctrl
);
1083 brw_inst_set_3src_cmpt_control(brw
, dst
, false);
1084 uncompact(debug_control
);
1085 uncompact(saturate
);
1086 uncompact(src1_rep_ctrl
);
1087 uncompact(src2_rep_ctrl
);
1088 uncompact(src0_reg_nr
);
1089 uncompact(src1_reg_nr
);
1090 uncompact(src2_reg_nr
);
1091 uncompact(src0_subreg_nr
);
1092 uncompact(src1_subreg_nr
);
1093 uncompact(src2_subreg_nr
);
/**
 * Expands the compacted instruction src into the full-size instruction dst.
 *
 * dst is zeroed first.  When brw->gen >= 8 and the opcode read with
 * brw_compact_inst_3src_opcode() is a three-source opcode, the work is handed
 * to brw_uncompact_3src_instruction().  Otherwise the opcode and debug
 * control are copied, the table-encoded fields are expanded with the
 * set_uncompacted_*() helpers, and the remaining fields are copied directly.
 * is_immediate is derived from the register-file fields of dst after the
 * datatype bits have been expanded.  For src1, the compacted src1_reg_nr
 * field is either OR-ed into the immediate of dst or written as the src1
 * register number.
 *
 * NOTE(review): the statement following the three-source call and the branch
 * around the last two statements are not visible in this excerpt; presumably
 * an early return and a test of is_immediate -- confirm against the full
 * file.
 */
1099 brw_uncompact_instruction(struct brw_context
*brw
, brw_inst
*dst
,
1100 brw_compact_inst
*src
)
1102 memset(dst
, 0, sizeof(*dst
));
1104 if (brw
->gen
>= 8 && is_3src(brw_compact_inst_3src_opcode(src
))) {
1105 brw_uncompact_3src_instruction(brw
, dst
, src
);
1109 brw_inst_set_opcode(brw
, dst
, brw_compact_inst_opcode(src
));
1110 brw_inst_set_debug_control(brw
, dst
, brw_compact_inst_debug_control(src
));
1112 set_uncompacted_control(brw
, dst
, src
);
1113 set_uncompacted_datatype(brw
, dst
, src
);
1115 /* src0/1 register file fields are in the datatype table. */
1116 bool is_immediate
= brw_inst_src0_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
||
1117 brw_inst_src1_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
;
1119 set_uncompacted_subreg(brw
, dst
, src
);
1120 brw_inst_set_acc_wr_control(brw
, dst
, brw_compact_inst_acc_wr_control(src
));
1121 brw_inst_set_cond_modifier(brw
, dst
, brw_compact_inst_cond_modifier(src
));
1123 brw_inst_set_flag_subreg_nr(brw
, dst
,
1124 brw_compact_inst_flag_subreg_nr(src
));
1125 set_uncompacted_src0(brw
, dst
, src
);
1126 set_uncompacted_src1(brw
, dst
, src
, is_immediate
);
1127 brw_inst_set_dst_da_reg_nr(brw
, dst
, brw_compact_inst_dst_reg_nr(src
));
1128 brw_inst_set_src0_da_reg_nr(brw
, dst
, brw_compact_inst_src0_reg_nr(src
));
1130 brw_inst_set_imm_ud(brw
, dst
,
1131 brw_inst_imm_ud(brw
, dst
) |
1132 brw_compact_inst_src1_reg_nr(src
));
1134 brw_inst_set_src1_da_reg_nr(brw
, dst
, brw_compact_inst_src1_reg_nr(src
));
/**
 * Prints a report to stderr for an instruction whose compact/uncompact round
 * trip did not reproduce the original (brw_compact_instructions() calls this
 * when memcmp() finds a difference).
 *
 * Disassembles orig and uncompacted with brw_disassemble_inst(), then walks
 * bit positions 0..127 of both, read as arrays of uint32_t, and prints every
 * position whose value differs.
 *
 * NOTE(review): the declaration of orig and the last argument of the first
 * fprintf() are cut off in this excerpt.
 * NOTE(review): the mask 1 << (i & 31) is computed on a signed int.  For bit
 * 31 of each word that shifts into the sign bit, which ISO C leaves undefined
 * for a 32-bit int; an unsigned constant (1u) would avoid that.
 */
1138 void brw_debug_compact_uncompact(struct brw_context
*brw
,
1140 brw_inst
*uncompacted
)
1142 fprintf(stderr
, "Instruction compact/uncompact changed (gen%d):\n",
1145 fprintf(stderr
, " before: ");
1146 brw_disassemble_inst(stderr
, brw
, orig
, true);
1148 fprintf(stderr
, " after: ");
1149 brw_disassemble_inst(stderr
, brw
, uncompacted
, false);
1151 uint32_t *before_bits
= (uint32_t *)orig
;
1152 uint32_t *after_bits
= (uint32_t *)uncompacted
;
1153 fprintf(stderr
, " changed bits:\n");
1154 for (int i
= 0; i
< 128; i
++) {
1155 uint32_t before
= before_bits
[i
/ 32] & (1 << (i
& 31));
1156 uint32_t after
= after_bits
[i
/ 32] & (1 << (i
& 31));
1158 if (before
!= after
) {
1159 fprintf(stderr
, " bit %d, %s to %s\n", i
,
1160 before
? "set" : "unset",
1161 after
? "set" : "unset");
/**
 * Returns compacted_counts[old_target_ip] - compacted_counts[old_ip].
 *
 * brw_compact_instructions() records in compacted_counts, for each original
 * instruction, the number of compacted instructions that preceded it, so the
 * difference is the number of compacted instructions from old_ip up to, but
 * not including, old_target_ip.  It is negative when the target has the
 * smaller count.
 */
1167 compacted_between(int old_ip
, int old_target_ip
, int *compacted_counts
)
1169 int this_compacted_count
= compacted_counts
[old_ip
];
1170 int target_compacted_count
= compacted_counts
[old_target_ip
];
1171 return target_compacted_count
- this_compacted_count
;
/**
 * Rewrites the JIP field of insn, and the UIP field where applicable, so
 * that they stay correct after compaction.
 *
 * For each field the stored distance is converted to compacted-instruction
 * units and to original-instruction units (the divisors depend on
 * brw->gen >= 8), compacted_between() supplies the number of compacted
 * instructions between this instruction and its target, and the adjusted
 * distance is written back scaled by the same unit.  ENDIF, WHILE and (for
 * brw->gen <= 7) ELSE are tested between the JIP and the UIP update.
 *
 * NOTE(review): the statement controlled by that opcode test is not visible
 * in this excerpt; presumably it returns so that UIP is left alone.
 * NOTE(review): the divisors contain sizeof and therefore have type size_t,
 * so a negative jip or uip is converted to an unsigned value before the
 * division.  With a 32-bit size_t that cannot give the intended negative
 * quotient when the divisor is greater than 1 -- worth replacing with signed
 * arithmetic.
 */
1175 update_uip_jip(struct brw_context
*brw
, brw_inst
*insn
,
1176 int this_old_ip
, int *compacted_counts
)
1178 /* JIP and UIP are in units of:
1179 * - bytes on Gen8+; and
1180 * - compacted instructions on Gen6+.
1182 int32_t jip
= brw_inst_jip(brw
, insn
);
1183 int32_t jip_compacted
= jip
/ (brw
->gen
>= 8 ? sizeof(brw_compact_inst
) : 1);
1184 int32_t jip_uncompacted
= jip
/ (brw
->gen
>= 8 ? sizeof(brw_inst
) : 2);
1185 jip_compacted
-= compacted_between(this_old_ip
,
1186 this_old_ip
+ jip_uncompacted
,
1188 brw_inst_set_jip(brw
, insn
,
1189 jip_compacted
* (brw
->gen
>= 8 ? sizeof(brw_compact_inst
) : 1));
1191 if (brw_inst_opcode(brw
, insn
) == BRW_OPCODE_ENDIF
||
1192 brw_inst_opcode(brw
, insn
) == BRW_OPCODE_WHILE
||
1193 (brw_inst_opcode(brw
, insn
) == BRW_OPCODE_ELSE
&& brw
->gen
<= 7))
1196 int32_t uip
= brw_inst_uip(brw
, insn
);
1197 int32_t uip_compacted
= uip
/ (brw
->gen
>= 8 ? sizeof(brw_compact_inst
) : 1);
1198 int32_t uip_uncompacted
= uip
/ (brw
->gen
>= 8 ? sizeof(brw_inst
) : 2);
1199 uip_compacted
-= compacted_between(this_old_ip
,
1200 this_old_ip
+ uip_uncompacted
,
1202 brw_inst_set_uip(brw
, insn
,
1203 uip_compacted
* (brw
->gen
>= 8 ? sizeof(brw_compact_inst
) : 1));
/**
 * Rewrites the jump count of insn after compaction, using the
 * brw_inst_gen4_jump_count() accessors.
 *
 * Asserts brw->gen == 5.  The stored count is halved to find the original
 * instruction index of the target, and the count is reduced by the
 * difference between compacted_counts at the target and at this_old_ip
 * before being written back with brw_inst_set_gen4_jump_count().
 */
1207 update_gen4_jump_count(struct brw_context
*brw
, brw_inst
*insn
,
1208 int this_old_ip
, int *compacted_counts
)
1210 assert(brw
->gen
== 5);
1212 /* Jump Count is in units of:
1213 * - compacted instructions on Gen5.
1215 int jump_count
= brw_inst_gen4_jump_count(brw
, insn
);
1216 int jump_count_compacted
= jump_count
;
1217 int jump_count_uncompacted
= jump_count
/ 2;
1219 int target_old_ip
= this_old_ip
+ jump_count_uncompacted
;
1221 int this_compacted_count
= compacted_counts
[this_old_ip
];
1222 int target_compacted_count
= compacted_counts
[target_old_ip
];
1224 jump_count_compacted
-= (target_compacted_count
- this_compacted_count
);
1225 brw_inst_set_gen4_jump_count(brw
, insn
, jump_count_compacted
);
/**
 * Selects the compaction tables used by the rest of this file.
 *
 * First asserts that the last element of every g45, gen6, gen7 and gen8
 * table is non-zero (a table with too few initializers would leave its
 * trailing elements zero), then assigns control_index_table, datatype_table,
 * subreg_table and src_index_table from the tables of one generation.
 *
 * NOTE(review): the statement choosing between the four sets is not visible
 * in this excerpt; presumably it switches on brw->gen.
 */
1229 brw_init_compaction_tables(struct brw_context
*brw
)
1231 assert(g45_control_index_table
[ARRAY_SIZE(g45_control_index_table
) - 1] != 0);
1232 assert(g45_datatype_table
[ARRAY_SIZE(g45_datatype_table
) - 1] != 0);
1233 assert(g45_subreg_table
[ARRAY_SIZE(g45_subreg_table
) - 1] != 0);
1234 assert(g45_src_index_table
[ARRAY_SIZE(g45_src_index_table
) - 1] != 0);
1235 assert(gen6_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
1236 assert(gen6_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
1237 assert(gen6_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
1238 assert(gen6_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
1239 assert(gen7_control_index_table
[ARRAY_SIZE(gen7_control_index_table
) - 1] != 0);
1240 assert(gen7_datatype_table
[ARRAY_SIZE(gen7_datatype_table
) - 1] != 0);
1241 assert(gen7_subreg_table
[ARRAY_SIZE(gen7_subreg_table
) - 1] != 0);
1242 assert(gen7_src_index_table
[ARRAY_SIZE(gen7_src_index_table
) - 1] != 0);
1243 assert(gen8_control_index_table
[ARRAY_SIZE(gen8_control_index_table
) - 1] != 0);
1244 assert(gen8_datatype_table
[ARRAY_SIZE(gen8_datatype_table
) - 1] != 0);
1245 assert(gen8_subreg_table
[ARRAY_SIZE(gen8_subreg_table
) - 1] != 0);
1246 assert(gen8_src_index_table
[ARRAY_SIZE(gen8_src_index_table
) - 1] != 0);
1250 control_index_table
= gen8_control_index_table
;
1251 datatype_table
= gen8_datatype_table
;
1252 subreg_table
= gen8_subreg_table
;
1253 src_index_table
= gen8_src_index_table
;
1256 control_index_table
= gen7_control_index_table
;
1257 datatype_table
= gen7_datatype_table
;
1258 subreg_table
= gen7_subreg_table
;
1259 src_index_table
= gen7_src_index_table
;
1262 control_index_table
= gen6_control_index_table
;
1263 datatype_table
= gen6_datatype_table
;
1264 subreg_table
= gen6_subreg_table
;
1265 src_index_table
= gen6_src_index_table
;
1268 control_index_table
= g45_control_index_table
;
1269 datatype_table
= g45_datatype_table
;
1270 subreg_table
= g45_subreg_table
;
1271 src_index_table
= g45_src_index_table
;
/**
 * Compacts the instructions stored in p->store from start_offset onwards, in
 * place, and repairs everything that depends on instruction positions.
 *
 * Visible steps:
 *  - walk the original instructions, recording in old_ip and
 *    compacted_counts how original and new positions relate, and call
 *    brw_try_compact_instruction() on each one;
 *  - after a successful compaction, uncompact the result again and pass any
 *    difference from the saved original to brw_debug_compact_uncompact();
 *  - when an end-of-thread SEND or SENDC would be written at an offset with
 *    the sizeof(brw_compact_inst) bit set, write a compacted NOP first;
 *  - move instructions that were not compacted down with memmove();
 *  - update p->next_insn_offset, then fix up jump distances: JIP/UIP, the
 *    gen4/gen6 jump counts, and ADD instructions that write the IP register;
 *  - if the new end offset has the sizeof(brw_compact_inst) bit set, pad
 *    with a compacted NOP, then recompute p->nr_insn;
 *  - rewrite annotation[i].offset for each annotation and set
 *    annotation[num_annotations].offset to p->next_insn_offset.
 *
 * NOTE(review): several statements (the declaration of the outer offset
 * variable, else branches, break statements, the increment of
 * compacted_count) are not visible in this excerpt.
 * NOTE(review): in the ADD case, jump is an int divided by a sizeof
 * expression, so a negative jump is converted to an unsigned value before
 * the division -- the same concern as in update_uip_jip().
 */
1279 brw_compact_instructions(struct brw_compile
*p
, int start_offset
,
1280 int num_annotations
, struct annotation
*annotation
)
1282 struct brw_context
*brw
= p
->brw
;
1283 void *store
= p
->store
+ start_offset
/ 16;
1284 /* For an instruction at byte offset 16*i before compaction, this is the
1285 * number of compacted instructions that preceded it.
1287 int compacted_counts
[(p
->next_insn_offset
- start_offset
) / sizeof(brw_inst
)];
1288 /* For an instruction at byte offset 8*i after compaction, this was its IP
1289 * (in 16-byte units) before compaction.
1291 int old_ip
[(p
->next_insn_offset
- start_offset
) / sizeof(brw_compact_inst
)];
1297 int compacted_count
= 0;
1298 for (int src_offset
= 0; src_offset
< p
->next_insn_offset
- start_offset
;
1299 src_offset
+= sizeof(brw_inst
)) {
1300 brw_inst
*src
= store
+ src_offset
;
1301 void *dst
= store
+ offset
;
1303 old_ip
[offset
/ sizeof(brw_compact_inst
)] = src_offset
/ sizeof(brw_inst
);
1304 compacted_counts
[src_offset
/ sizeof(brw_inst
)] = compacted_count
;
1306 brw_inst saved
= *src
;
1308 if (brw_try_compact_instruction(brw
, dst
, src
)) {
1312 brw_inst uncompacted
;
1313 brw_uncompact_instruction(brw
, &uncompacted
, dst
);
1314 if (memcmp(&saved
, &uncompacted
, sizeof(uncompacted
))) {
1315 brw_debug_compact_uncompact(brw
, &saved
, &uncompacted
);
1319 offset
+= sizeof(brw_compact_inst
);
1321 /* It appears that the end of thread SEND instruction needs to be
1322 * aligned, or the GPU hangs.
1324 if ((brw_inst_opcode(brw
, src
) == BRW_OPCODE_SEND
||
1325 brw_inst_opcode(brw
, src
) == BRW_OPCODE_SENDC
) &&
1326 brw_inst_eot(brw
, src
) &&
1327 (offset
& sizeof(brw_compact_inst
)) != 0) {
1328 brw_compact_inst
*align
= store
+ offset
;
1329 memset(align
, 0, sizeof(*align
));
1330 brw_compact_inst_set_opcode(align
, BRW_OPCODE_NOP
);
1331 brw_compact_inst_set_cmpt_control(align
, true);
1332 offset
+= sizeof(brw_compact_inst
);
1333 old_ip
[offset
/ sizeof(brw_compact_inst
)] = src_offset
/ sizeof(brw_inst
);
1335 dst
= store
+ offset
;
1338 /* If we didn't compact this instruction, we need to move it down into
1341 if (offset
!= src_offset
) {
1342 memmove(dst
, src
, sizeof(brw_inst
));
1344 offset
+= sizeof(brw_inst
);
1348 /* Fix up control flow offsets. */
1349 p
->next_insn_offset
= start_offset
+ offset
;
1350 for (offset
= 0; offset
< p
->next_insn_offset
- start_offset
;
1351 offset
= next_offset(brw
, store
, offset
)) {
1352 brw_inst
*insn
= store
+ offset
;
1353 int this_old_ip
= old_ip
[offset
/ sizeof(brw_compact_inst
)];
1354 int this_compacted_count
= compacted_counts
[this_old_ip
];
1355 int target_old_ip
, target_compacted_count
;
1357 switch (brw_inst_opcode(brw
, insn
)) {
1358 case BRW_OPCODE_BREAK
:
1359 case BRW_OPCODE_CONTINUE
:
1360 case BRW_OPCODE_HALT
:
1361 if (brw
->gen
>= 6) {
1362 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1364 update_gen4_jump_count(brw
, insn
, this_old_ip
, compacted_counts
);
1369 case BRW_OPCODE_IFF
:
1370 case BRW_OPCODE_ELSE
:
1371 case BRW_OPCODE_ENDIF
:
1372 case BRW_OPCODE_WHILE
:
1373 if (brw
->gen
>= 7) {
1374 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1375 } else if (brw
->gen
== 6) {
1376 /* Jump Count is in units of compacted instructions on Gen6. */
1377 int jump_count_compacted
= brw_inst_gen6_jump_count(brw
, insn
);
1378 int jump_count_uncompacted
= jump_count_compacted
/ 2;
1380 target_old_ip
= this_old_ip
+ jump_count_uncompacted
;
1381 target_compacted_count
= compacted_counts
[target_old_ip
];
1382 jump_count_compacted
-= (target_compacted_count
- this_compacted_count
);
1383 brw_inst_set_gen6_jump_count(brw
, insn
, jump_count_compacted
);
1385 update_gen4_jump_count(brw
, insn
, this_old_ip
, compacted_counts
);
1389 case BRW_OPCODE_ADD
:
1390 /* Add instructions modifying the IP register use an immediate src1,
1391 * and Gens that use this cannot compact instructions with immediate
1394 if (brw_inst_cmpt_control(brw
, insn
))
1397 if (brw_inst_dst_reg_file(brw
, insn
) == BRW_ARCHITECTURE_REGISTER_FILE
&&
1398 brw_inst_dst_da_reg_nr(brw
, insn
) == BRW_ARF_IP
) {
1399 assert(brw_inst_src1_reg_file(brw
, insn
) == BRW_IMMEDIATE_VALUE
);
1401 int jump
= brw_inst_imm_d(brw
, insn
);
1402 int jump_compacted
= jump
/ sizeof(brw_compact_inst
);
1403 int jump_uncompacted
= jump
/ sizeof(brw_inst
);
1405 target_old_ip
= this_old_ip
+ jump_uncompacted
;
1406 target_compacted_count
= compacted_counts
[target_old_ip
];
1407 jump_compacted
-= (target_compacted_count
- this_compacted_count
);
1408 brw_inst_set_imm_ud(brw
, insn
, jump_compacted
*
1409 sizeof(brw_compact_inst
));
1415 /* p->nr_insn is counting the number of uncompacted instructions still, so
1416 * divide. We do want to be sure there's a valid instruction in any
1417 * alignment padding, so that the next compression pass (for the FS 8/16
1418 * compile passes) parses correctly.
1420 if (p
->next_insn_offset
& sizeof(brw_compact_inst
)) {
1421 brw_compact_inst
*align
= store
+ offset
;
1422 memset(align
, 0, sizeof(*align
));
1423 brw_compact_inst_set_opcode(align
, BRW_OPCODE_NOP
);
1424 brw_compact_inst_set_cmpt_control(align
, true);
1425 p
->next_insn_offset
+= sizeof(brw_compact_inst
);
1427 p
->nr_insn
= p
->next_insn_offset
/ sizeof(brw_inst
);
1429 /* Update the instruction offsets for each annotation. */
1431 for (int offset
= 0, i
= 0; i
< num_annotations
; i
++) {
1432 while (start_offset
+ old_ip
[offset
/ sizeof(brw_compact_inst
)] *
1433 sizeof(brw_inst
) != annotation
[i
].offset
) {
1434 assert(start_offset
+ old_ip
[offset
/ sizeof(brw_compact_inst
)] *
1435 sizeof(brw_inst
) < annotation
[i
].offset
);
1436 offset
= next_offset(brw
, store
, offset
);
1439 annotation
[i
].offset
= start_offset
+ offset
;
1441 offset
= next_offset(brw
, store
, offset
);
1444 annotation
[num_annotations
].offset
= p
->next_insn_offset
;