From 16493b5771cf3b1670a4a9bf7c7426fd74b727d0 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Wed, 13 Jun 2018 10:43:56 +0200 Subject: [PATCH] [ARC] Reimplement return padding operation for ARC700. For ARC700, adding padding if necessary to avoid a mispredict. A return could happen immediately after the function start. A call/return and return/return must be 6 bytes apart to avoid mispredict. The old implementation was doing this operation very late in the compilation process, and the additional nop instructions and/or forcing some other instruction to take their long form was not taken into account when generating brcc instructions. Thus, wrong code could be generated. gcc/ 2017-03-24 Claudiu Zissulescu * config/arc/arc-protos.h (arc_pad_return): Remove. * config/arc/arc.c (machine_function): Remove force_short_suffix and size_reason. (arc_print_operand): Adjust printing of '&'. (arc_verify_short): Remove conditional printing of short suffix. (arc_final_prescan_insn): Remove reference to size_reason. (pad_return): New function. (arc_reorg): Call pad_return. (arc_pad_return): Remove. (arc_init_machine_status): Remove reference to force_short_suffix. * config/arc/arc.md (vunspec): Add VUNSPEC_ARC_BLOCKAGE. (attr length): When attribute iscompact is true force to 2 regardless; in the case of maybe check if we want to force the instruction to have 4 bytes length. (nopv): Change it to generate 4 byte long nop as well. (blockage): New pattern. (simple_return): Remove call to arc_pad_return. (p_return_i): Likewise. gcc/testsuite/ 2017-03-24 Claudiu Zissulescu * gcc.target/arc/pr9001107555.c: New file. 
From-SVN: r261542 --- gcc/ChangeLog | 21 +++ gcc/config/arc/arc-protos.h | 1 - gcc/config/arc/arc.c | 156 +++++++++----------- gcc/config/arc/arc.md | 26 +++- gcc/testsuite/ChangeLog | 4 + gcc/testsuite/gcc.target/arc/pr9001107555.c | 51 +++++++ 6 files changed, 166 insertions(+), 93 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arc/pr9001107555.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 70be1d2f482..ce7197bf27c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,24 @@ +2018-06-12 Claudiu Zissulescu + + * config/arc/arc-protos.h (arc_pad_return): Remove. + * config/arc/arc.c (machine_function): Remove force_short_suffix + and size_reason. + (arc_print_operand): Adjust printing of '&'. + (arc_verify_short): Remove conditional printing of short suffix. + (arc_final_prescan_insn): Remove reference to size_reason. + (pad_return): New function. + (arc_reorg): Call pad_return. + (arc_pad_return): Remove. + (arc_init_machine_status): Remove reference to force_short_suffix. + * config/arc/arc.md (vunspec): Add VUNSPEC_ARC_BLOCKAGE. + (attr length): When attribute iscompact is true force to 2 + regardless; in the case of maybe check if we want to force the + instruction to have 4 bytes length. + (nopv): Change it to generate 4 byte long nop as well. + (blockage): New pattern. + (simple_return): Remove call to arc_pad_return. + (p_return_i): Likewise. + 2018-06-12 Claudiu Zissulescu * config/arc/elf.h (LINK_GCC_C_SEQUENCE_SPEC): Define. 
diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h index 67f3b4e3226..ce4b6f84749 100644 --- a/gcc/config/arc/arc-protos.h +++ b/gcc/config/arc/arc-protos.h @@ -89,7 +89,6 @@ extern void arc_clear_unalign (void); extern void arc_toggle_unalign (void); extern void split_addsi (rtx *); extern void split_subsi (rtx *); -extern void arc_pad_return (void); extern void arc_split_move (rtx *); extern const char *arc_short_long (rtx_insn *insn, const char *, const char *); extern rtx arc_regno_use_in (unsigned int, rtx); diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 6b90dd5896d..85f473ab195 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -2629,8 +2629,6 @@ typedef struct GTY (()) machine_function struct arc_frame_info frame_info; /* To keep track of unalignment caused by short insns. */ int unalign; - int force_short_suffix; /* Used when disgorging return delay slot insns. */ - const char *size_reason; struct arc_ccfsm ccfsm_current; /* Map from uid to ccfsm state during branch shortening. 
*/ rtx ccfsm_current_insn; @@ -4288,7 +4286,7 @@ arc_print_operand (FILE *file, rtx x, int code) } break; case '&': - if (TARGET_ANNOTATE_ALIGN && cfun->machine->size_reason) + if (TARGET_ANNOTATE_ALIGN) fprintf (file, "; unalign: %d", cfun->machine->unalign); return; case '+': @@ -4961,7 +4959,6 @@ static int arc_verify_short (rtx_insn *insn, int, int check_attr) { enum attr_iscompact iscompact; - struct machine_function *machine; if (check_attr > 0) { @@ -4969,10 +4966,6 @@ arc_verify_short (rtx_insn *insn, int, int check_attr) if (iscompact == ISCOMPACT_FALSE) return 0; } - machine = cfun->machine; - - if (machine->force_short_suffix >= 0) - return machine->force_short_suffix; return (get_attr_length (insn) & 2) != 0; } @@ -5011,8 +5004,6 @@ arc_final_prescan_insn (rtx_insn *insn, rtx *opvec ATTRIBUTE_UNUSED, cfun->machine->prescan_initialized = 1; } arc_ccfsm_advance (insn, &arc_ccfsm_current); - - cfun->machine->size_reason = 0; } /* Given FROM and TO register numbers, say whether this elimination is allowed. @@ -7654,6 +7645,76 @@ jli_call_scan (void) } } +/* Add padding if necessary to avoid a mispredict. A return could + happen immediately after the function start. A call/return and + return/return must be 6 bytes apart to avoid mispredict. */ + +static void +pad_return (void) +{ + rtx_insn *insn; + long offset; + + if (!TARGET_PAD_RETURN) + return; + + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + rtx_insn *prev0 = prev_active_insn (insn); + bool wantlong = false; + + if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) != SIMPLE_RETURN) + continue; + + if (!prev0) + { + prev0 = emit_insn_before (gen_nopv (), insn); + /* REG_SAVE_NOTE is used by Haifa scheduler, we are in reorg + so it is safe to reuse it for forcing a particular length + for an instruction. 
*/ + add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1)); + emit_insn_before (gen_nopv (), insn); + continue; + } + offset = get_attr_length (prev0); + + if (get_attr_length (prev0) == 2 + && get_attr_iscompact (prev0) != ISCOMPACT_TRUE) + { + /* Force long version of the insn. */ + wantlong = true; + offset += 2; + } + + rtx_insn *prev = prev_active_insn (prev0); + if (prev) + offset += get_attr_length (prev); + + prev = prev_active_insn (prev); + if (prev) + offset += get_attr_length (prev); + + switch (offset) + { + case 2: + prev = emit_insn_before (gen_nopv (), insn); + add_reg_note (prev, REG_SAVE_NOTE, GEN_INT (1)); + break; + case 4: + emit_insn_before (gen_nopv (), insn); + break; + default: + continue; + } + + if (wantlong) + add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1)); + + /* Emit a blockage to avoid delay slot scheduling. */ + emit_insn_before (gen_blockage (), insn); + } +} + static int arc_reorg_in_progress = 0; /* ARC's machince specific reorg function. */ @@ -7679,6 +7740,7 @@ arc_reorg (void) workaround_arc_anomaly (); jli_call_scan (); + pad_return (); /* FIXME: should anticipate ccfsm action, generate special patterns for to-be-deleted branches that have no delay slot and have at least the @@ -9237,79 +9299,6 @@ arc_branch_size_unknown_p (void) return !optimize_size && arc_reorg_in_progress; } -/* We are about to output a return insn. Add padding if necessary to avoid - a mispredict. A return could happen immediately after the function - start, but after a call we know that there will be at least a blink - restore. */ - -void -arc_pad_return (void) -{ - rtx_insn *insn = current_output_insn; - rtx_insn *prev = prev_active_insn (insn); - int want_long; - - if (!prev) - { - fputs ("\tnop_s\n", asm_out_file); - cfun->machine->unalign ^= 2; - want_long = 1; - } - /* If PREV is a sequence, we know it must be a branch / jump or a tailcall, - because after a call, we'd have to restore blink first. 
*/ - else if (GET_CODE (PATTERN (prev)) == SEQUENCE) - return; - else - { - want_long = (get_attr_length (prev) == 2); - prev = prev_active_insn (prev); - } - if (!prev - || ((NONJUMP_INSN_P (prev) && GET_CODE (PATTERN (prev)) == SEQUENCE) - ? CALL_ATTR (as_a <rtx_sequence *> (PATTERN (prev))->insn (0), - NON_SIBCALL) - : CALL_ATTR (prev, NON_SIBCALL))) - { - if (want_long) - cfun->machine->size_reason - = "call/return and return/return must be 6 bytes apart to avoid mispredict"; - else if (TARGET_UNALIGN_BRANCH && cfun->machine->unalign) - { - cfun->machine->size_reason - = "Long unaligned jump avoids non-delay slot penalty"; - want_long = 1; - } - /* Disgorge delay insn, if there is any, and it may be moved. */ - if (final_sequence - /* ??? Annulled would be OK if we can and do conditionalize - the delay slot insn accordingly. */ - && !INSN_ANNULLED_BRANCH_P (insn) - && (get_attr_cond (insn) != COND_USE - || !reg_set_p (gen_rtx_REG (CCmode, CC_REG), - XVECEXP (final_sequence, 0, 1)))) - { - prev = as_a <rtx_insn *> (XVECEXP (final_sequence, 0, 1)); - gcc_assert (!prev_real_insn (insn) - || !arc_hazard (prev_real_insn (insn), prev)); - cfun->machine->force_short_suffix = !want_long; - rtx save_pred = current_insn_predicate; - final_scan_insn (prev, asm_out_file, optimize, 1, NULL); - cfun->machine->force_short_suffix = -1; - prev->set_deleted (); - current_output_insn = insn; - current_insn_predicate = save_pred; - } - else if (want_long) - fputs ("\tnop\n", asm_out_file); - else - { - fputs ("\tnop_s\n", asm_out_file); - cfun->machine->unalign ^= 2; - } - } - return; -} - /* The usual; we set up our machine_function data. 
*/ static struct machine_function * @@ -9318,7 +9307,6 @@ arc_init_machine_status (void) struct machine_function *machine; machine = ggc_cleared_alloc<machine_function> (); machine->fn_type = ARC_FUNCTION_UNKNOWN; - machine->force_short_suffix = -1; return machine; } diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 1a52e6f3859..ebe2982f657 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -161,6 +161,7 @@ VUNSPEC_ARC_CAS VUNSPEC_ARC_SC VUNSPEC_ARC_LL + VUNSPEC_ARC_BLOCKAGE ]) (define_constants @@ -384,13 +385,18 @@ ;; and insn lengths: insns with shimm values cannot be conditionally executed. (define_attr "length" "" (cond - [(eq_attr "iscompact" "true,maybe") + [(eq_attr "iscompact" "true") + (const_int 2) + + (eq_attr "iscompact" "maybe") (cond [(eq_attr "type" "sfunc") (cond [(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 12)] (const_int 10)) - (match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)] + (match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4) + (match_test "find_reg_note (insn, REG_SAVE_NOTE, GEN_INT (1))") + (const_int 4)] (const_int 2)) (eq_attr "iscompact" "true_limm") @@ -4431,8 +4437,16 @@ "" "nop%?" [(set_attr "type" "misc") - (set_attr "iscompact" "true") - (set_attr "length" "2")]) + (set_attr "iscompact" "maybe") + (set_attr "length" "*")]) + +(define_insn "blockage" + [(unspec_volatile [(const_int 0)] VUNSPEC_ARC_BLOCKAGE)] + "" + "" + [(set_attr "length" "0") + (set_attr "type" "block")] +) ;; Split up troublesome insns for better scheduling. @@ -4977,8 +4991,6 @@ { return \"rtie\"; } - if (TARGET_PAD_RETURN) - arc_pad_return (); output_asm_insn (\"j%!%* [%0]%&\", &reg); return \"\"; } @@ -5022,8 +5034,6 @@ arc_return_address_register (arc_compute_function_type (cfun))); - if (TARGET_PAD_RETURN) - arc_pad_return (); output_asm_insn (\"j%d0%!%# [%1]%&\", xop); /* record the condition in case there is a delay insn. 
*/ arc_ccfsm_record_condition (xop[0], false, insn, 0); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 5f01694b919..572424df1fa 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2018-06-12 Claudiu Zissulescu + + * gcc.target/arc/pr9001107555.c: New file. + 2018-06-12 Richard Sandiford * g++.dg/torture/aarch64-vect-init-1.C: New test. diff --git a/gcc/testsuite/gcc.target/arc/pr9001107555.c b/gcc/testsuite/gcc.target/arc/pr9001107555.c new file mode 100644 index 00000000000..420fa835fa4 --- /dev/null +++ b/gcc/testsuite/gcc.target/arc/pr9001107555.c @@ -0,0 +1,51 @@ +/* { dg-do assemble } * +/* { dg-skip-if "" { ! { clmcpu } } } */ +/* { dg-options "-O3 -funroll-loops -mno-sdata -mcpu=arc700" } */ + +typedef long long a __attribute__((__mode__(__DI__))); +typedef struct c c; + +struct b +{ + int d; + c *e; +}; + +enum { f }; + +typedef struct +{ + a g; + a h; + int i; +} j; + +struct c +{ + int count; + int current; +}; + +int k; + +extern void bar (int, long long); +int foo (struct b *demux, __builtin_va_list args) +{ + c m = *demux->e; + j *n; + switch (k) + case f: + { + a o = __builtin_va_arg(args, a); + m.current = 0; + while (m.current < m.count) + { + if (n[m.current].h > o) { + bar (demux->d, 4 + 128LL * n[m.current].i); + break; + } + m.current++; + } + return 0; + } +} -- 2.30.2