From 0e6c51de8ec47bf5f0dfaabfd1898c722d0485b4 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Mon, 24 Aug 2020 03:15:21 +0200 Subject: [PATCH] reorg.c (fill_slots_from_thread): Improve for TARGET_FLAGS_REGNUM This handles TARGET_FLAGS_REGNUM clobbering insns as delay-slot fillers using a method similar to that in commit 33c2207d3fda, where care was taken for fill_simple_delay_slots to allow such insns when scanning for delay-slot fillers *backwards* (before the insn). A TARGET_FLAGS_REGNUM target is typically a former cc0 target. For cc0 targets, insns don't mention clobbering cc0, so the clobbers are mentioned in the "resources" only as a special entity and only for compare-insns and branches, where the cc0 value matters. In contrast, with TARGET_FLAGS_REGNUM, most insns clobber it and the register liveness detection in reorg.c / resource.c treats that as a blocker (for other insns mentioning it, i.e. most) when looking for delay-slot-filling candidates. This means that when comparing core and performance for a delay-slot cc0 target before and after the de-cc0 conversion, the inability to fill a delay slot after conversion manifests as a regression. This was one such case, for CRIS, with random_bitstring in gcc.c-torture/execute/arith-rand-ll.c as well as the target libgcc division function. After this, all known performance regressions compared to cc0 are fixed. gcc: PR target/93372 * reorg.c (fill_slots_from_thread): Allow trial insns that clobber TARGET_FLAGS_REGNUM as delay-slot fillers. gcc/testsuite: PR target/93372 * gcc.target/cris/pr93372-47.c: New test. --- gcc/reorg.c | 37 +++++++++++++++- gcc/testsuite/gcc.target/cris/pr93372-47.c | 49 ++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/cris/pr93372-47.c diff --git a/gcc/reorg.c b/gcc/reorg.c index 84beb9395aa..613831e02cf 100644 --- a/gcc/reorg.c +++ b/gcc/reorg.c @@ -2412,6 +2412,21 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, CLEAR_RESOURCE (&needed); CLEAR_RESOURCE (&set); + /* Handle the flags register specially, to be able to accept a + candidate that clobbers it. See also fill_simple_delay_slots. */ + bool filter_flags + = (slots_to_fill == 1 + && targetm.flags_regnum != INVALID_REGNUM + && find_regno_note (insn, REG_DEAD, targetm.flags_regnum)); + struct resources fset; + struct resources flags_res; + if (filter_flags) + { + CLEAR_RESOURCE (&fset); + CLEAR_RESOURCE (&flags_res); + SET_HARD_REG_BIT (flags_res.regs, targetm.flags_regnum); + } + /* If we do not own this thread, we must stop as soon as we find something that we can't put in a delay slot, since all we can do is branch into THREAD at a later point. Therefore, labels stop @@ -2440,8 +2455,18 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, /* If TRIAL conflicts with the insns ahead of it, we lose. Also, don't separate or copy insns that set and use CC0. */ if (! insn_references_resource_p (trial, &set, true) - && ! insn_sets_resource_p (trial, &set, true) + && ! insn_sets_resource_p (trial, filter_flags ? &fset : &set, true) && ! insn_sets_resource_p (trial, &needed, true) + /* If we're handling sets to the flags register specially, we + only allow an insn into a delay-slot, if it either: + - doesn't set the flags register, + - the "set" of the flags register isn't used (clobbered), + - insns between the delay-slot insn and the trial-insn + as accounted in "set", have not affected the flags register. */ + && (! filter_flags + || ! insn_sets_resource_p (trial, &flags_res, true) + || find_regno_note (trial, REG_UNUSED, targetm.flags_regnum) + || ! TEST_HARD_REG_BIT (set.regs, targetm.flags_regnum)) && (!HAVE_cc0 || (! (reg_mentioned_p (cc0_rtx, pat) && (! own_thread || ! sets_cc0_p (pat))))) && ! can_throw_internal (trial)) @@ -2619,6 +2644,16 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, lose = 1; mark_set_resources (trial, &set, 0, MARK_SRC_DEST_CALL); mark_referenced_resources (trial, &needed, true); + if (filter_flags) + { + mark_set_resources (trial, &fset, 0, MARK_SRC_DEST_CALL); + + /* Groups of flags-register setters with users should not + affect opportunities to move flags-register-setting insns + (clobbers) into the delay-slot. */ + CLEAR_HARD_REG_BIT (needed.regs, targetm.flags_regnum); + CLEAR_HARD_REG_BIT (fset.regs, targetm.flags_regnum); + } /* Ensure we don't put insns between the setting of cc and the comparison by moving a setting of cc into an earlier delay slot since these insns diff --git a/gcc/testsuite/gcc.target/cris/pr93372-47.c b/gcc/testsuite/gcc.target/cris/pr93372-47.c new file mode 100644 index 00000000000..8d80ae6b4aa --- /dev/null +++ b/gcc/testsuite/gcc.target/cris/pr93372-47.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=v10" } */ +/* { dg-final { scan-assembler-times {\tnop} 1 } } */ + +/* A somewhat brittle test-case, checking that we have (only) one + unfilled delay-slot in random_bitstring: there might be none or two + or more, and general improvements may lead to unfilled delay-slots. + When the scan-assembler-times directive regresses, re-run + gcc.c-torture/execute/arith-rand-ll.c, check cycle-level + execution-time regressions in random_bitstring and take appropriate + action. */ + +static long long +simple_rand () +{ + static unsigned long long seed = 47114711; + unsigned long long this = seed * 1103515245 + 12345; + seed = this; + return this >> 8; +} + +unsigned long long int +random_bitstring () +{ + unsigned long long int x; + int n_bits; + long long ran; + int tot_bits = 0; + + x = 0; + for (;;) + { + ran = simple_rand (); + n_bits = (ran >> 1) % 16; + tot_bits += n_bits; + + if (n_bits == 0) + return x; + else + { + x <<= n_bits; + if (ran & 1) + x |= (1 << n_bits) - 1; + + if (tot_bits > 8 * sizeof (long long) + 6) + return x; + } + } +} -- 2.30.2