From 011f5e92f8ff87f099ed0aae736e79be20a77c6c Mon Sep 17 00:00:00 2001
From: Christophe Lyon
Date: Mon, 2 Nov 2020 14:39:24 +0000
Subject: [PATCH] arm: Improve thumb1_gen_const_int

Enable thumb1_gen_const_int to generate RTL or asm depending on the
context, so that we avoid duplicating code to handle constants in
Thumb-1 with -mpure-code.

Use a template so that the algorithm is effectively shared, and rely
on two classes to handle the actual emission as RTL or asm.

The generated sequence is improved to handle right-shiftable and small
values with fewer instructions.  We now generate:

128:
	movs	r0, #128

264:
	movs	r3, #33
	lsls	r3, #3

510:
	movs	r3, #255
	lsls	r3, #1

512:
	movs	r3, #1
	lsls	r3, #9

764:
	movs	r3, #191
	lsls	r3, #2

65536:
	movs	r3, #1
	lsls	r3, #16

0x123456:
	movs	r3, #18	;0x12
	lsls	r3, #8
	adds	r3, #52	;0x34
	lsls	r3, #8
	adds	r3, #86	;0x56

0x1123456:
	movs	r3, #137	;0x89
	lsls	r3, #8
	adds	r3, #26	;0x1a
	lsls	r3, #8
	adds	r3, #43	;0x2b
	lsls	r3, #1

0x1000010:
	movs	r3, #16
	lsls	r3, #16
	adds	r3, #1
	lsls	r3, #4

0x1000011:
	movs	r3, #1
	lsls	r3, #24
	adds	r3, #17

-8192:
	movs	r3, #1
	lsls	r3, #13
	rsbs	r3, #0

The patch adds a testcase which does not fully exercise
thumb1_gen_const_int, as other existing patterns already catch small
constants.  These parts of thumb1_gen_const_int are used by
arm_thumb1_mi_thunk.

2020-11-02  Christophe Lyon

	gcc/
	* config/arm/arm.c (thumb1_const_rtl, thumb1_const_print): New
	classes.
	(thumb1_gen_const_int): Rename to ...
	(thumb1_gen_const_int_1): ... New helper function.  Add
	capability to emit either RTL or asm, improve generated code.
	(thumb1_gen_const_int_rtl): New function.
	* config/arm/arm-protos.h (thumb1_gen_const_int): Rename to
	thumb1_gen_const_int_rtl.
	* config/arm/thumb1.md: Call thumb1_gen_const_int_rtl instead
	of thumb1_gen_const_int.

	gcc/testsuite/
	* gcc.target/arm/pure-code/no-literal-pool-m0.c: New.
---
 gcc/config/arm/arm-protos.h                   |   2 +-
 gcc/config/arm/arm.c                          | 224 +++++++++++++++---
 gcc/config/arm/thumb1.md                      |   2 +-
 .../arm/pure-code/no-literal-pool-m0.c        | 175 ++++++++++++++
 4 files changed, 369 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool-m0.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 703d6160c24..5b581e00023 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -74,7 +74,7 @@ extern bool arm_small_register_classes_for_mode_p (machine_mode);
 extern int const_ok_for_arm (HOST_WIDE_INT);
 extern int const_ok_for_op (HOST_WIDE_INT, enum rtx_code);
 extern int const_ok_for_dimode_op (HOST_WIDE_INT, enum rtx_code);
-extern void thumb1_gen_const_int (rtx, HOST_WIDE_INT);
+extern void thumb1_gen_const_int_rtl (rtx, HOST_WIDE_INT);
 extern int arm_split_constant (RTX_CODE, machine_mode, rtx,
			       HOST_WIDE_INT, rtx, rtx, int);
 extern int legitimate_pic_operand_p (rtx);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ae05891451a..203d2b6b50b 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -4528,38 +4528,6 @@ const_ok_for_dimode_op (HOST_WIDE_INT i, enum rtx_code code)
     }
 }
 
-/* Emit a sequence of movs/adds/shift to produce a 32-bit constant.
-   Avoid generating useless code when one of the bytes is zero.  */
-void
-thumb1_gen_const_int (rtx op0, HOST_WIDE_INT op1)
-{
-  bool mov_done_p = false;
-  int i;
-
-  /* Emit upper 3 bytes if needed.  */
-  for (i = 0; i < 3; i++)
-    {
-      int byte = (op1 >> (8 * (3 - i))) & 0xff;
-
-      if (byte)
-	{
-	  emit_set_insn (op0, mov_done_p
-			 ? gen_rtx_PLUS (SImode,op0, GEN_INT (byte))
-			 : GEN_INT (byte));
-	  mov_done_p = true;
-	}
-
-      if (mov_done_p)
-	emit_set_insn (op0, gen_rtx_ASHIFT (SImode, op0, GEN_INT (8)));
-    }
-
-  /* Emit lower byte if needed.  */
-  if (!mov_done_p)
-    emit_set_insn (op0, GEN_INT (op1 & 0xff));
-  else if (op1 & 0xff)
-    emit_set_insn (op0, gen_rtx_PLUS (SImode, op0, GEN_INT (op1 & 0xff)));
-}
-
 /* Emit a sequence of insns to handle a large constant.
    CODE is the code of the operation required, it can be any of SET, PLUS,
    IOR, AND, XOR, MINUS;
@@ -28263,6 +28231,198 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
   default_internal_label (stream, prefix, labelno);
 }
 
+/* Define classes to generate code as RTL or output asm to a file.
+   Using templates then allows us to use the same code to output code
+   sequences in the two formats.  */
+class thumb1_const_rtl
+{
+ public:
+  thumb1_const_rtl (rtx dst) : dst (dst) {}
+
+  void mov (HOST_WIDE_INT val)
+  {
+    emit_set_insn (dst, GEN_INT (val));
+  }
+
+  void add (HOST_WIDE_INT val)
+  {
+    emit_set_insn (dst, gen_rtx_PLUS (SImode, dst, GEN_INT (val)));
+  }
+
+  void ashift (HOST_WIDE_INT shift)
+  {
+    emit_set_insn (dst, gen_rtx_ASHIFT (SImode, dst, GEN_INT (shift)));
+  }
+
+  void neg ()
+  {
+    emit_set_insn (dst, gen_rtx_NEG (SImode, dst));
+  }
+
+ private:
+  rtx dst;
+};
+
+class thumb1_const_print
+{
+ public:
+  thumb1_const_print (FILE *f, int regno)
+  {
+    t_file = f;
+    dst_regname = reg_names[regno];
+  }
+
+  void mov (HOST_WIDE_INT val)
+  {
+    asm_fprintf (t_file, "\tmovs\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
+		 dst_regname, val);
+  }
+
+  void add (HOST_WIDE_INT val)
+  {
+    asm_fprintf (t_file, "\tadds\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
+		 dst_regname, val);
+  }
+
+  void ashift (HOST_WIDE_INT shift)
+  {
+    asm_fprintf (t_file, "\tlsls\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
+		 dst_regname, shift);
+  }
+
+  void neg ()
+  {
+    asm_fprintf (t_file, "\trsbs\t%s, #0\n", dst_regname);
+  }
+
+ private:
+  FILE *t_file;
+  const char *dst_regname;
+};
+
+/* Emit a sequence of movs/adds/shift to produce a 32-bit constant.
+   Avoid generating useless code when one of the bytes is zero.  */
+template <class T>
+void
+thumb1_gen_const_int_1 (T dst, HOST_WIDE_INT op1)
+{
+  bool mov_done_p = false;
+  unsigned HOST_WIDE_INT val = op1;
+  int shift = 0;
+  int i;
+
+  gcc_assert (op1 == trunc_int_for_mode (op1, SImode));
+
+  if (val <= 255)
+    {
+      dst.mov (val);
+      return;
+    }
+
+  /* For negative numbers with the first nine bits set, build the
+     opposite of OP1, then negate it: this is generally shorter and
+     never longer.  */
+  if ((val & 0xFF800000) == 0xFF800000)
+    {
+      thumb1_gen_const_int_1 (dst, -op1);
+      dst.neg ();
+      return;
+    }
+
+  /* In the general case, we need 7 instructions to build
+     a 32-bit constant (1 movs, 3 lsls, 3 adds).  We can
+     do better if VAL is small enough, or
+     right-shiftable by a suitable amount.  If the
+     right-shift lets us encode at least one less byte,
+     it's worth it: we save an adds and an lsls at the
+     expense of a final lsls.  */
+  int final_shift = number_of_first_bit_set (val);
+
+  int leading_zeroes = clz_hwi (val);
+  int number_of_bytes_needed
+    = ((HOST_BITS_PER_WIDE_INT - 1 - leading_zeroes)
+       / BITS_PER_UNIT) + 1;
+  int number_of_bytes_needed2
+    = ((HOST_BITS_PER_WIDE_INT - 1 - leading_zeroes - final_shift)
+       / BITS_PER_UNIT) + 1;
+
+  if (number_of_bytes_needed2 < number_of_bytes_needed)
+    val >>= final_shift;
+  else
+    final_shift = 0;
+
+  /* If we are in a very small range, we can use either a single movs
+     or movs+adds.  */
+  if (val <= 510)
+    {
+      if (val > 255)
+	{
+	  unsigned HOST_WIDE_INT high = val - 255;
+
+	  dst.mov (high);
+	  dst.add (255);
+	}
+      else
+	dst.mov (val);
+
+      if (final_shift > 0)
+	dst.ashift (final_shift);
+    }
+  else
+    {
+      /* General case, emit upper 3 bytes as needed.  */
+      for (i = 0; i < 3; i++)
+	{
+	  unsigned HOST_WIDE_INT byte = (val >> (8 * (3 - i))) & 0xff;
+
+	  if (byte)
+	    {
+	      /* We are about to emit new bits, stop accumulating a
+		 shift amount, and left-shift only if we have already
+		 emitted some upper bits.  */
+	      if (mov_done_p)
+		{
+		  dst.ashift (shift);
+		  dst.add (byte);
+		}
+	      else
+		dst.mov (byte);
+
+	      /* Stop accumulating shift amount since we've just
+		 emitted some bits.  */
+	      shift = 0;
+
+	      mov_done_p = true;
+	    }
+
+	  if (mov_done_p)
+	    shift += 8;
+	}
+
+      /* Emit lower byte.  */
+      if (!mov_done_p)
+	dst.mov (val & 0xff);
+      else
+	{
+	  dst.ashift (shift);
+	  if (val & 0xff)
+	    dst.add (val & 0xff);
+	}
+
+      if (final_shift > 0)
+	dst.ashift (final_shift);
+    }
+}
+
+/* Proxy for thumb1.md, since the thumb1_const_print and
+   thumb1_const_rtl classes are not exported.  */
+void
+thumb1_gen_const_int_rtl (rtx dst, HOST_WIDE_INT op1)
+{
+  thumb1_const_rtl t (dst);
+  thumb1_gen_const_int_1 (t, op1);
+}
+
 /* Output code to add DELTA to the first argument, and then jump
    to FUNCTION.  Used for C++ multiple inheritance.  */
diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
index 320e78d1161..e2fcb1045fa 100644
--- a/gcc/config/arm/thumb1.md
+++ b/gcc/config/arm/thumb1.md
@@ -820,7 +820,7 @@
    && !satisfies_constraint_K (operands[1])"
   [(clobber (const_int 0))]
   "
-    thumb1_gen_const_int (operands[0], INTVAL (operands[1]));
+    thumb1_gen_const_int_rtl (operands[0], INTVAL (operands[1]));
     DONE;
   "
 )
diff --git a/gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool-m0.c b/gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool-m0.c
new file mode 100644
index 00000000000..787a61a125d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool-m0.c
@@ -0,0 +1,175 @@
+/* { dg-do compile } */
+/* { dg-options "-mpure-code -mcpu=cortex-m0 -march=armv6s-m -mthumb" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/* Does not use thumb1_gen_const_int.
+** test_0:
+**	...
+**	movs	r[0-3], #0
+**	...
+*/
+int
+test_0 ()
+{
+  return 0;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_128:
+**	...
+**	movs	r[0-3], #128
+**	...
+*/
+int
+test_128 ()
+{
+  return 128;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_264:
+**	...
+**	movs	r[0-3], #132
+**	lsls	r[0-3], r[0-3], #1
+**	...
+*/
+int
+test_264 ()
+{
+  return 264;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_510:
+**	...
+**	movs	r[0-3], #255
+**	lsls	r[0-3], r[0-3], #1
+**	...
+*/
+int
+test_510 ()
+{
+  return 510;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_512:
+**	...
+**	movs	r[0-3], #128
+**	lsls	r[0-3], r[0-3], #2
+**	...
+*/
+int
+test_512 ()
+{
+  return 512;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_764:
+**	...
+**	movs	r[0-3], #191
+**	lsls	r[0-3], r[0-3], #2
+**	...
+*/
+int
+test_764 ()
+{
+  return 764;
+}
+
+/* Does not use thumb1_gen_const_int.
+** test_65536:
+**	...
+**	movs	r[0-3], #128
+**	lsls	r[0-3], r[0-3], #9
+**	...
+*/
+int
+test_65536 ()
+{
+  return 65536;
+}
+
+/*
+** test_0x123456:
+**	...
+**	movs	r[0-3], #18
+**	lsls	r[0-3], r[0-3], #8
+**	adds	r[0-3], r[0-3], #52
+**	lsls	r[0-3], r[0-3], #8
+**	adds	r[0-3], r[0-3], #86
+**	...
+*/
+int
+test_0x123456 ()
+{
+  return 0x123456;
+}
+
+/*
+** test_0x1123456:
+**	...
+**	movs	r[0-3], #137
+**	lsls	r[0-3], r[0-3], #8
+**	adds	r[0-3], r[0-3], #26
+**	lsls	r[0-3], r[0-3], #8
+**	adds	r[0-3], r[0-3], #43
+**	lsls	r[0-3], r[0-3], #1
+**	...
+*/
+int
+test_0x1123456 ()
+{
+  return 0x1123456;
+}
+
+/* With -Os, we generate:
+   movs r0, #16
+   lsls r0, r0, r0
+   With the other optimization levels, we generate:
+   movs r0, #16
+   lsls r0, r0, #16
+   hence the two alternatives.  */
+/*
+** test_0x1000010:
+**	...
+**	movs	r[0-3], #16
+**	lsls	r[0-3], r[0-3], (#16|r[0-3])
+**	adds	r[0-3], r[0-3], #1
+**	lsls	r[0-3], r[0-3], #4
+**	...
+*/
+int
+test_0x1000010 ()
+{
+  return 0x1000010;
+}
+
+/*
+** test_0x1000011:
+**	...
+**	movs	r[0-3], #1
+**	lsls	r[0-3], r[0-3], #24
+**	adds	r[0-3], r[0-3], #17
+**	...
+*/
+int
+test_0x1000011 ()
+{
+  return 0x1000011;
+}
+
+/*
+** test_m8192:
+**	...
+**	movs	r[0-3], #1
+**	lsls	r[0-3], r[0-3], #13
+**	rsbs	r[0-3], r[0-3], #0
+**	...
+*/
+int
+test_m8192 ()
+{
+  return -8192;
+}
-- 
2.30.2
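
As an illustration of the selection strategy described in the commit message
(a single movs for small values, negation for mostly-set negative values,
byte-by-byte emission with a deferred final shift), here is a small
stand-alone sketch.  It is not part of the patch: it models
thumb1_gen_const_int_1 as a host program under simplifying assumptions.
The helper name build_const is made up, the destination register is
hard-coded to r3, 32-bit host arithmetic stands in for HOST_WIDE_INT, and
the instructions are printed with printf instead of being emitted through
the patch's thumb1_const_rtl/thumb1_const_print classes.

#include <stdio.h>
#include <stdint.h>

/* Print a movs/lsls/adds/rsbs sequence that materializes OP1 in r3,
   roughly following the strategy of thumb1_gen_const_int_1.  */
static void
build_const (int32_t op1)
{
  uint32_t val = (uint32_t) op1;

  /* Small values need a single movs.  */
  if (val <= 255)
    {
      printf ("\tmovs\tr3, #%u\n", (unsigned) val);
      return;
    }

  /* Negative values whose top nine bits are all set: build the opposite
     value, then negate it (the INT32_MIN corner case is ignored here).  */
  if ((val & 0xff800000u) == 0xff800000u)
    {
      build_const (-op1);
      printf ("\trsbs\tr3, #0\n");
      return;
    }

  /* If right-shifting VAL saves at least one byte of work, build the
     shifted value and finish with one extra lsls.  */
  int final_shift = 0;
  while (((val >> final_shift) & 1) == 0)
    final_shift++;

  int bits = 32 - __builtin_clz (val);	/* GCC/Clang builtin.  */
  int bytes_plain = (bits + 7) / 8;
  int bytes_shifted = (bits - final_shift + 7) / 8;
  if (bytes_shifted < bytes_plain)
    val >>= final_shift;
  else
    final_shift = 0;

  if (val <= 510)
    {
      /* A movs plus at most one adds is enough for this range.  */
      if (val > 255)
	{
	  printf ("\tmovs\tr3, #%u\n", (unsigned) (val - 255));
	  printf ("\tadds\tr3, #255\n");
	}
      else
	printf ("\tmovs\tr3, #%u\n", (unsigned) val);
    }
  else
    {
      /* General case: emit bytes from most to least significant,
	 accumulating the shift amount across zero bytes.  */
      int shift = 0, mov_done = 0;
      for (int i = 0; i < 3; i++)
	{
	  unsigned byte = (val >> (8 * (3 - i))) & 0xff;
	  if (byte)
	    {
	      if (mov_done)
		{
		  printf ("\tlsls\tr3, #%d\n", shift);
		  printf ("\tadds\tr3, #%u\n", byte);
		}
	      else
		printf ("\tmovs\tr3, #%u\n", byte);
	      shift = 0;
	      mov_done = 1;
	    }
	  if (mov_done)
	    shift += 8;
	}

      if (!mov_done)
	printf ("\tmovs\tr3, #%u\n", (unsigned) (val & 0xff));
      else
	{
	  printf ("\tlsls\tr3, #%d\n", shift);
	  if (val & 0xff)
	    printf ("\tadds\tr3, #%u\n", (unsigned) (val & 0xff));
	}
    }

  if (final_shift > 0)
    printf ("\tlsls\tr3, #%d\n", final_shift);
}

int
main (void)
{
  build_const (0x123456);	/* movs #18; lsls #8; adds #52; lsls #8; adds #86 */
  build_const (-8192);		/* movs #1; lsls #13; rsbs #0 */
  return 0;
}

Compiling and running this for the constants quoted in the commit message
reproduces the sequences shown there, which is a convenient way to explore
how the deferred final shift trades an adds/lsls pair for one trailing lsls.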