From dc884a86d3ffa4108b66a232d18eb3fa35000863 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 6 May 2004 15:27:19 +0000 Subject: [PATCH] invoke.texi: Document -mvr4130-align. * doc/invoke.texi: Document -mvr4130-align. * config/mips/mips.h (MASK_VR4130_ALIGN, TARGET_VR4130_ALIGN) (TUNE_MIPS4120, TUNE_MIPS4130): New macros. (TUNE_MACC_CHAINS): Include TUNE_MIPS4120 and TUNE_MIPS4130. (TARGET_SWITCHES): Add -mvr4130-align and -mno-vr4130-align. * config/mips/mips.c: Include sched-int.h. (USEFUL_INSN_P, SEQ_BEGIN, SEQ_END, FOR_EACH_SUBINSN): New macros. (mips_rtx_costs): Set integer multiplication costs for TUNE_MIPS4130. (override_options): Enable -mvr4130-align at -O3 and above. (mips_sim_insn): New variable. (mips_sim): New structure. (mips_sim_reset, mips_sim_init, mips_sim_next_cycle, mips_sim_wait_reg) (mips_sim_wait_regs_2, mips_sim_wait_regs_1, mips_sim_wait_regs) (mips_sim_wait_units, mips_sim_wait_insn, mips_sim_record_set) (mips_sim_issue_insn, mips_sim_issue_nop, mips_sim_finish_insn) (vr4130_avoid_branch_rt_conflict, vr4130_align_insns): New functions. (mips_reorg): Call vr4130_align_insns. (vr4130_last_insn): New variable. (vr4130_true_reg_dependence_p_1, vr4130_true_reg_dependence_p) (vr4130_swap_insns_p, vr4130_reorder): New functions. (mips_sched_reorder, mips_variable_issue): Hook in vr4130 code. (mips_issue_rate): Return 2 for PROCESSOR_R4130. (mips_use_dfa_pipeline_interface): Return true for the same. * config/mips/4130.md: New file. * config/mips/mips.md: Include it. Add a peephole2 to convert "mult;mflo" into "mtlo;macc". (*macc, *umul_acc_di, *smul_acc_di): Use $1 rather than $0 as the target of maccs. (*msac_using_macc): New pattern. 
From-SVN: r81567 --- gcc/ChangeLog | 32 +++ gcc/config/mips/4130.md | 136 ++++++++++ gcc/config/mips/mips.c | 531 ++++++++++++++++++++++++++++++++++++++++ gcc/config/mips/mips.h | 13 +- gcc/config/mips/mips.md | 68 ++++- gcc/doc/invoke.texi | 15 +- 6 files changed, 789 insertions(+), 6 deletions(-) create mode 100644 gcc/config/mips/4130.md diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ebddaf54fd0..3ea323e341c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,35 @@ +2004-05-06 Richard Sandiford + + * doc/invoke.texi: Document -mvr4130-align. + * config/mips/mips.h (MASK_VR4130_ALIGN, TARGET_VR4130_ALIGN) + (TUNE_MIPS4120, TUNE_MIPS4130): New macros. + (TUNE_MACC_CHAINS): Include TUNE_MIPS4120 and TUNE_MIPS4130. + (TARGET_SWITCHES): Add -mvr4130-align and -mno-vr4130-align. + * config/mips/mips.c: Include sched-int.h. + (USEFUL_INSN_P, SEQ_BEGIN, SEQ_END, FOR_EACH_SUBINSN): New macros. + (mips_rtx_costs): Set integer multiplication costs for TUNE_MIPS4130. + (override_options): Enable -mvr4130-align at -O3 and above. + (mips_sim_insn): New variable. + (mips_sim): New structure. + (mips_sim_reset, mips_sim_init, mips_sim_next_cycle, mips_sim_wait_reg) + (mips_sim_wait_regs_2, mips_sim_wait_regs_1, mips_sim_wait_regs) + (mips_sim_wait_units, mips_sim_wait_insn, mips_sim_record_set) + (mips_sim_issue_insn, mips_sim_issue_nop, mips_sim_finish_insn) + (vr4130_avoid_branch_rt_conflict, vr4130_align_insns): New functions. + (mips_reorg): Call vr4130_align_insns. + (vr4130_last_insn): New variable. + (vr4130_true_reg_dependence_p_1, vr4130_true_reg_dependence_p) + (vr4130_swap_insns_p, vr4130_reorder): New functions. + (mips_sched_reorder, mips_variable_issue): Hook in vr4130 code. + (mips_issue_rate): Return 2 for PROCESSOR_R4130. + (mips_use_dfa_pipeline_interface): Return true for the same. + * config/mips/4130.md: New file. + * config/mips/mips.md: Include it. Add a peephole2 to convert + "mult;mflo" into "mtlo;macc". 
+ (*macc, *umul_acc_di, *smul_acc_di): Use $1 rather than $0 as the + target of maccs. + (*msac_using_macc): New pattern. + 2004-05-06 Richard Sandiford * config/mips/5500.md (ir_vr55_store): Set latency to 0. diff --git a/gcc/config/mips/4130.md b/gcc/config/mips/4130.md new file mode 100644 index 00000000000..eddc405de34 --- /dev/null +++ b/gcc/config/mips/4130.md @@ -0,0 +1,136 @@ +;; +;; Pipeline description for the VR4130 family. +;; +;; The processor issues each 8-byte aligned pair of instructions together, +;; stalling the second instruction if it depends on the first. Thus, if we +;; want two instructions to issue in parallel, we need to make sure that the +;; first one is 8-byte aligned. +;; +;; For the purposes of this pipeline description, we treat the processor +;; like a standard two-way superscalar architecture. If scheduling were +;; the last pass to run, we could use the scheduler hooks to vary the +;; issue rate depending on whether an instruction is at an aligned or +;; unaligned address. Unfortunately, delayed branch scheduling and +;; hazard avoidance are done after the final scheduling pass, and they +;; can change the addresses of many instructions. +;; +;; We get around this in two ways: +;; +;; (1) By running an extra pass at the end of compilation. This pass goes +;; through the function looking for pairs of instructions that could +;; execute in parallel. It makes sure that the first instruction in +;; each pair is suitably aligned, inserting nops if necessary. Doing +;; this gives the same kind of pipeline behavior we would see on a +;; normal superscalar target. +;; +;; This pass is generally a speed improvement, but the extra nops will +;; obviously make the program bigger. It is therefore unsuitable for +;; -Os (at the very least). 
+;; +;; (2) By modifying the scheduler hooks so that, where possible: +;; +;; (a) dependent instructions are separated by a non-dependent +;; instruction; +;; +;; (b) instructions that use the multiplication unit are separated +;; by non-multiplication instructions; and +;; +;; (c) memory access instructions are separated by non-memory +;; instructions. +;; +;; The idea is to keep conflicting instructions apart wherever possible +;; and thus make the schedule less dependent on alignment. + +(define_automaton "vr4130_main, vr4130_muldiv, vr4130_mulpre") + +(define_cpu_unit "vr4130_alu1, vr4130_alu2, vr4130_dcache" "vr4130_main") +(define_cpu_unit "vr4130_muldiv" "vr4130_muldiv") + +;; This is a fake unit for pre-reload scheduling of multiplications. +;; It enforces the true post-reload repeat rate. +(define_cpu_unit "vr4130_mulpre" "vr4130_mulpre") + +;; The scheduling hooks use this attribute for (b) and (c) above. +(define_attr "vr4130_class" "mul,mem,alu" + (cond [(eq_attr "type" "load,store") + (const_string "mem") + + (eq_attr "type" "mfhilo,mthilo,imul,imadd,idiv") + (const_string "mul")] + (const_string "alu"))) + +(define_insn_reservation "vr4130_multi" 1 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "multi,unknown")) + "vr4130_alu1 + vr4130_alu2 + vr4130_dcache + vr4130_muldiv") + +(define_insn_reservation "vr4130_int" 1 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "const,arith,shift,slt,nop")) + "vr4130_alu1 | vr4130_alu2") + +(define_insn_reservation "vr4130_load" 3 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "load")) + "vr4130_dcache") + +(define_insn_reservation "vr4130_store" 1 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "store")) + "vr4130_dcache") + +(define_insn_reservation "vr4130_mfhilo" 3 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "mfhilo")) + "vr4130_muldiv") + +(define_insn_reservation "vr4130_mthilo" 1 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "mthilo")) + "vr4130_muldiv") + +;; The product is available in LO & HI after 
one cycle. Moving the result +;; into an integer register will take an additional three cycles, see mflo +;; & mfhi above. Note that the same latencies and repeat rates apply if we +;; use "mtlo; macc" instead of "mult; mflo". +(define_insn_reservation "vr4130_mulsi" 4 + (and (eq_attr "cpu" "r4130") + (and (eq_attr "type" "imul") + (eq_attr "mode" "SI"))) + "vr4130_muldiv + (vr4130_mulpre * 2)") + +;; As for vr4130_mulsi, but the product is available in LO and HI +;; after 3 cycles. +(define_insn_reservation "vr4130_muldi" 6 + (and (eq_attr "cpu" "r4130") + (and (eq_attr "type" "imul") + (eq_attr "mode" "DI"))) + "(vr4130_muldiv * 3) + (vr4130_mulpre * 4)") + +;; maccs can execute in consecutive cycles without stalling, but it +;; is 3 cycles before the integer destination can be read. +(define_insn_reservation "vr4130_macc" 3 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "imadd")) + "vr4130_muldiv") + +(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_macc" "mips_linked_madd_p") +(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_mfhilo") +(define_bypass 3 "vr4130_muldi" "vr4130_mfhilo") + +(define_insn_reservation "vr4130_divsi" 36 + (and (eq_attr "cpu" "r4130") + (and (eq_attr "type" "idiv") + (eq_attr "mode" "SI"))) + "vr4130_muldiv * 36") + +(define_insn_reservation "vr4130_divdi" 72 + (and (eq_attr "cpu" "r4130") + (and (eq_attr "type" "idiv") + (eq_attr "mode" "DI"))) + "vr4130_muldiv * 72") + +(define_insn_reservation "vr4130_branch" 0 + (and (eq_attr "cpu" "r4130") + (eq_attr "type" "branch,jump,call")) + "vr4130_alu1 | vr4130_alu2") diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index ad5a48e8a40..598120fd409 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -54,6 +54,7 @@ Boston, MA 02111-1307, USA. 
*/ #include "integrate.h" #include "langhooks.h" #include "cfglayout.h" +#include "sched-int.h" /* Enumeration for all of the relational tests, so that we can build arrays indexed by the test type, and not worry about the order @@ -107,6 +108,33 @@ enum internal_test { multi-instruction addu sequence. Use 0x7fe0 to work around this. */ #define MIPS_MAX_FIRST_STACK_STEP (TARGET_MIPS16 ? 0x100 : 0x7fe0) +/* True if INSN is a mips.md pattern or asm statement. */ +#define USEFUL_INSN_P(INSN) \ + (INSN_P (INSN) \ + && GET_CODE (PATTERN (INSN)) != USE \ + && GET_CODE (PATTERN (INSN)) != CLOBBER \ + && GET_CODE (PATTERN (INSN)) != ADDR_VEC \ + && GET_CODE (PATTERN (INSN)) != ADDR_DIFF_VEC) + +/* If INSN is a delayed branch sequence, return the first instruction + in the sequence, otherwise return INSN itself. */ +#define SEQ_BEGIN(INSN) \ + (INSN_P (INSN) && GET_CODE (PATTERN (INSN)) == SEQUENCE \ + ? XVECEXP (PATTERN (INSN), 0, 0) \ + : (INSN)) + +/* Likewise for the last instruction in a delayed branch sequence. */ +#define SEQ_END(INSN) \ + (INSN_P (INSN) && GET_CODE (PATTERN (INSN)) == SEQUENCE \ + ? XVECEXP (PATTERN (INSN), 0, XVECLEN (PATTERN (INSN), 0) - 1) \ + : (INSN)) + +/* Execute the following loop body with SUBINSN set to each instruction + between SEQ_BEGIN (INSN) and SEQ_END (INSN) inclusive. */ +#define FOR_EACH_SUBINSN(SUBINSN, INSN) \ + for ((SUBINSN) = SEQ_BEGIN (INSN); \ + (SUBINSN) != NEXT_INSN (SEQ_END (INSN)); \ + (SUBINSN) = NEXT_INSN (SUBINSN)) /* Classifies an address. 
@@ -138,6 +166,7 @@ struct mips16_constant; struct mips_arg_info; struct mips_address_info; struct mips_integer_op; +struct mips_sim; static enum mips_symbol_type mips_classify_symbol (rtx); static void mips_split_const (rtx, rtx *, HOST_WIDE_INT *); @@ -219,6 +248,21 @@ static void dump_constants (struct mips16_constant *, rtx); static int mips16_insn_length (rtx); static int mips16_rewrite_pool_refs (rtx *, void *); static void mips16_lay_out_constants (void); +static void mips_sim_reset (struct mips_sim *); +static void mips_sim_init (struct mips_sim *, state_t); +static void mips_sim_next_cycle (struct mips_sim *); +static void mips_sim_wait_reg (struct mips_sim *, rtx, rtx); +static int mips_sim_wait_regs_2 (rtx *, void *); +static void mips_sim_wait_regs_1 (rtx *, void *); +static void mips_sim_wait_regs (struct mips_sim *, rtx); +static void mips_sim_wait_units (struct mips_sim *, rtx); +static void mips_sim_wait_insn (struct mips_sim *, rtx); +static void mips_sim_record_set (rtx, rtx, void *); +static void mips_sim_issue_insn (struct mips_sim *, rtx); +static void mips_sim_issue_nop (struct mips_sim *); +static void mips_sim_finish_insn (struct mips_sim *, rtx); +static void vr4130_avoid_branch_rt_conflict (rtx); +static void vr4130_align_insns (void); static void mips_avoid_hazard (rtx, rtx, int *, rtx *, rtx); static void mips_avoid_hazards (void); static void mips_reorg (void); @@ -230,6 +274,10 @@ static bool mips_return_in_memory (tree, tree); static bool mips_strict_argument_naming (CUMULATIVE_ARGS *); static void mips_macc_chains_record (rtx); static void mips_macc_chains_reorder (rtx *, int); +static void vr4130_true_reg_dependence_p_1 (rtx, rtx, void *); +static bool vr4130_true_reg_dependence_p (rtx); +static bool vr4130_swap_insns_p (rtx, rtx); +static void vr4130_reorder (rtx *, int); static void mips_promote_ready (rtx *, int, int); static int mips_sched_reorder (FILE *, int, rtx *, int *, int); static int mips_variable_issue (FILE *, int, 
rtx, int); @@ -2347,6 +2395,8 @@ mips_rtx_costs (rtx x, int code, int outer_code, int *total) *total = COSTS_N_INSNS (12); else if (TUNE_MIPS3900) *total = COSTS_N_INSNS (2); + else if (TUNE_MIPS4130) + *total = COSTS_N_INSNS (mode == DImode ? 6 : 4); else if (TUNE_MIPS5400 || TUNE_SB1) *total = COSTS_N_INSNS (mode == DImode ? 4 : 3); else if (TUNE_MIPS5500 || TUNE_MIPS7000) @@ -4788,6 +4838,12 @@ override_options (void) if (TARGET_NAME_REGS) memcpy (mips_reg_names, mips_sw_reg_names, sizeof (mips_reg_names)); + /* -mvr4130-align is a "speed over size" optimization: it usually produces + faster code, but at the expense of more nops. Enable it at -O3 and + above. */ + if (optimize > 2 && (target_flags_explicit & MASK_VR4130_ALIGN) == 0) + target_flags |= MASK_VR4130_ALIGN; + /* When compiling for the mips16, we can not use floating point. We record the original hard float value in mips16_hard_float. */ if (TARGET_MIPS16) @@ -8367,8 +8423,373 @@ mips16_lay_out_constants (void) } dump_constants (pool.first, get_last_insn ()); } + +/* A temporary variable used by for_each_rtx callbacks, etc. */ +static rtx mips_sim_insn; + +/* A structure representing the state of the processor pipeline. + Used by the mips_sim_* family of functions. */ +struct mips_sim { + /* The maximum number of instructions that can be issued in a cycle. + (Caches mips_issue_rate.) */ + unsigned int issue_rate; + + /* The current simulation time. */ + unsigned int time; + + /* How many more instructions can be issued in the current cycle. */ + unsigned int insns_left; + + /* LAST_SET[X].INSN is the last instruction to set register X. + LAST_SET[X].TIME is the time at which that instruction was issued. + INSN is null if no instruction has yet set register X. */ + struct { + rtx insn; + unsigned int time; + } last_set[FIRST_PSEUDO_REGISTER]; + + /* The pipeline's current DFA state. */ + state_t dfa_state; +}; + +/* Reset STATE to the initial simulation state. 
*/ + +static void +mips_sim_reset (struct mips_sim *state) +{ + state->time = 0; + state->insns_left = state->issue_rate; + memset (&state->last_set, 0, sizeof (state->last_set)); + state_reset (state->dfa_state); +} + +/* Initialize STATE before its first use. DFA_STATE points to an + allocated but uninitialized DFA state. */ + +static void +mips_sim_init (struct mips_sim *state, state_t dfa_state) +{ + state->issue_rate = mips_issue_rate (); + state->dfa_state = dfa_state; + mips_sim_reset (state); +} + +/* Advance STATE by one clock cycle. */ + +static void +mips_sim_next_cycle (struct mips_sim *state) +{ + state->time++; + state->insns_left = state->issue_rate; + state_transition (state->dfa_state, 0); +} + +/* Advance simulation state STATE until instruction INSN can read + register REG. */ + +static void +mips_sim_wait_reg (struct mips_sim *state, rtx insn, rtx reg) +{ + unsigned int i; + + for (i = 0; i < HARD_REGNO_NREGS (REGNO (reg), GET_MODE (reg)); i++) + if (state->last_set[REGNO (reg) + i].insn != 0) + { + unsigned int t; + + t = state->last_set[REGNO (reg) + i].time; + t += insn_latency (state->last_set[REGNO (reg) + i].insn, insn); + while (state->time < t) + mips_sim_next_cycle (state); + } +} + +/* A for_each_rtx callback. If *X is a register, advance simulation state + DATA until mips_sim_insn can read the register's value. */ + +static int +mips_sim_wait_regs_2 (rtx *x, void *data) +{ + if (REG_P (*x)) + mips_sim_wait_reg (data, mips_sim_insn, *x); + return 0; +} + +/* Call mips_sim_wait_regs_2 (R, DATA) for each register R mentioned in *X. */ + +static void +mips_sim_wait_regs_1 (rtx *x, void *data) +{ + for_each_rtx (x, mips_sim_wait_regs_2, data); +} + +/* Advance simulation state STATE until all of INSN's register + dependencies are satisfied. 
*/ + +static void +mips_sim_wait_regs (struct mips_sim *state, rtx insn) +{ + mips_sim_insn = insn; + note_uses (&PATTERN (insn), mips_sim_wait_regs_1, state); +} + +/* Advance simulation state STATE until the units required by + instruction INSN are available. */ + +static void +mips_sim_wait_units (struct mips_sim *state, rtx insn) +{ + state_t tmp_state; + + tmp_state = alloca (state_size ()); + while (state->insns_left == 0 + || (memcpy (tmp_state, state->dfa_state, state_size ()), + state_transition (tmp_state, insn) >= 0)) + mips_sim_next_cycle (state); +} + +/* Advance simulation state STATE until INSN is ready to issue. */ + +static void +mips_sim_wait_insn (struct mips_sim *state, rtx insn) +{ + mips_sim_wait_regs (state, insn); + mips_sim_wait_units (state, insn); +} + +/* mips_sim_insn has just set X. Update the LAST_SET array + in simulation state DATA. */ + +static void +mips_sim_record_set (rtx x, rtx pat ATTRIBUTE_UNUSED, void *data) +{ + struct mips_sim *state; + unsigned int i; + + state = data; + if (REG_P (x)) + for (i = 0; i < HARD_REGNO_NREGS (REGNO (x), GET_MODE (x)); i++) + { + state->last_set[REGNO (x) + i].insn = mips_sim_insn; + state->last_set[REGNO (x) + i].time = state->time; + } +} + +/* Issue instruction INSN in scheduler state STATE. Assume that INSN + can issue immediately (i.e., that mips_sim_wait_insn has already + been called). */ + +static void +mips_sim_issue_insn (struct mips_sim *state, rtx insn) +{ + state_transition (state->dfa_state, insn); + state->insns_left--; + + mips_sim_insn = insn; + note_stores (PATTERN (insn), mips_sim_record_set, state); +} + +/* Simulate issuing a NOP in state STATE. */ + +static void +mips_sim_issue_nop (struct mips_sim *state) +{ + if (state->insns_left == 0) + mips_sim_next_cycle (state); + state->insns_left--; +} + +/* Update simulation state STATE so that it's ready to accept the instruction + after INSN. INSN should be part of the main rtl chain, not a member of a + SEQUENCE. 
*/ + +static void +mips_sim_finish_insn (struct mips_sim *state, rtx insn) +{ + /* If INSN is a jump with an implicit delay slot, simulate a nop. */ + if (JUMP_P (insn)) + mips_sim_issue_nop (state); + + switch (GET_CODE (SEQ_BEGIN (insn))) + { + case CODE_LABEL: + case CALL_INSN: + /* We can't predict the processor state after a call or label. */ + mips_sim_reset (state); + break; + + case JUMP_INSN: + /* The delay slots of branch likely instructions are only executed + when the branch is taken. Therefore, if the caller has simulated + the delay slot instruction, STATE does not really reflect the state + of the pipeline for the instruction after the delay slot. Also, + branch likely instructions tend to incur a penalty when not taken, + so there will probably be an extra delay between the branch and + the instruction after the delay slot. */ + if (INSN_ANNULLED_BRANCH_P (SEQ_BEGIN (insn))) + mips_sim_reset (state); + break; + + default: + break; + } +} + +/* The VR4130 pipeline issues aligned pairs of instructions together, + but it stalls the second instruction if it depends on the first. + In order to cut down the amount of logic required, this dependence + check is not based on a full instruction decode. Instead, any non-SPECIAL + instruction is assumed to modify the register specified by bits 20-16 + (which is usually the "rt" field). + + In beq, beql, bne and bnel instructions, the rt field is actually an + input, so we can end up with a false dependence between the branch + and its delay slot. If this situation occurs in instruction INSN, + try to avoid it by swapping rs and rt. 
*/ + +static void +vr4130_avoid_branch_rt_conflict (rtx insn) +{ + rtx first, second; + + first = SEQ_BEGIN (insn); + second = SEQ_END (insn); + if (GET_CODE (first) == JUMP_INSN + && GET_CODE (second) == INSN + && GET_CODE (PATTERN (first)) == SET + && GET_CODE (SET_DEST (PATTERN (first))) == PC + && GET_CODE (SET_SRC (PATTERN (first))) == IF_THEN_ELSE) + { + /* Check for the right kind of condition. */ + rtx cond = XEXP (SET_SRC (PATTERN (first)), 0); + if ((GET_CODE (cond) == EQ || GET_CODE (cond) == NE) + && REG_P (XEXP (cond, 0)) + && REG_P (XEXP (cond, 1)) + && reg_referenced_p (XEXP (cond, 1), PATTERN (second)) + && !reg_referenced_p (XEXP (cond, 0), PATTERN (second))) + { + /* SECOND mentions the rt register but not the rs register. */ + rtx tmp = XEXP (cond, 0); + XEXP (cond, 0) = XEXP (cond, 1); + XEXP (cond, 1) = tmp; + } + } +} + +/* Implement -mvr4130-align. Go through each basic block and simulate the + processor pipeline. If we find that a pair of instructions could execute + in parallel, and the first of those instructions is not 8-byte aligned, + insert a nop to make it aligned. */ +static void +vr4130_align_insns (void) +{ + struct mips_sim state; + rtx insn, subinsn, last, last2, next; + bool aligned_p; + + dfa_start (); + + /* LAST is the last instruction before INSN to have a nonzero length. + LAST2 is the last such instruction before LAST. */ + last = 0; + last2 = 0; + + /* ALIGNED_P is true if INSN is known to be at an aligned address. */ + aligned_p = true; + + mips_sim_init (&state, alloca (state_size ())); + for (insn = get_insns (); insn != 0; insn = next) + { + unsigned int length; + + next = NEXT_INSN (insn); + + /* See the comment above vr4130_avoid_branch_rt_conflict for details. + This isn't really related to the alignment pass, but we do it on + the fly to avoid a separate instruction walk. 
*/ + vr4130_avoid_branch_rt_conflict (insn); + + if (USEFUL_INSN_P (insn)) + FOR_EACH_SUBINSN (subinsn, insn) + { + mips_sim_wait_insn (&state, subinsn); + + /* If we want this instruction to issue in parallel with the + previous one, make sure that the previous instruction is + aligned. There are several reasons why this isn't worthwhile + when the second instruction is a call: + + - Calls are less likely to be performance critical, + - There's a good chance that the delay slot can execute + in parallel with the call. + - The return address would then be unaligned. + + In general, if we're going to insert a nop between instructions + X and Y, it's better to insert it immediately after X. That + way, if the nop makes Y aligned, it will also align any labels + between X and Y. */ + if (state.insns_left != state.issue_rate + && GET_CODE (subinsn) != CALL_INSN) + { + if (subinsn == SEQ_BEGIN (insn) && aligned_p) + { + /* SUBINSN is the first instruction in INSN and INSN is + aligned. We want to align the previous instruction + instead, so insert a nop between LAST2 and LAST. + + Note that LAST could be either a single instruction + or a branch with a delay slot. In the latter case, + LAST, like INSN, is already aligned, but the delay + slot must have some extra delay that stops it from + issuing at the same time as the branch. We therefore + insert a nop before the branch in order to align its + delay slot. */ + emit_insn_after (gen_nop (), last2); + aligned_p = false; + } + else if (subinsn != SEQ_BEGIN (insn) && !aligned_p) + { + /* SUBINSN is the delay slot of INSN, but INSN is + currently unaligned. Insert a nop between + LAST and INSN to align it. */ + emit_insn_after (gen_nop (), last); + aligned_p = true; + } + } + mips_sim_issue_insn (&state, subinsn); + } + mips_sim_finish_insn (&state, insn); + + /* Update LAST, LAST2 and ALIGNED_P for the next instruction. 
*/ + length = get_attr_length (insn); + if (length > 0) + { + /* If the instruction is an asm statement or multi-instruction + mips.md pattern, the length is only an estimate. Insert an + 8 byte alignment after it so that the following instructions + can be handled correctly. */ + if (GET_CODE (SEQ_BEGIN (insn)) == INSN + && (recog_memoized (insn) < 0 || length >= 8)) + { + next = emit_insn_after (gen_align (GEN_INT (3)), insn); + next = NEXT_INSN (next); + mips_sim_next_cycle (&state); + aligned_p = true; + } + else if (length & 4) + aligned_p = !aligned_p; + last2 = last; + last = insn; + } + /* See whether INSN is an aligned label. */ + if (LABEL_P (insn) && label_to_alignment (insn) >= 3) + aligned_p = true; + } + dfa_finish (); +} + /* Subroutine of mips_reorg. If there is a hazard between INSN and a previous instruction, avoid it by inserting nops after instruction AFTER. @@ -8499,6 +8920,8 @@ mips_reorg (void) if (mips_flag_delayed_branch) dbr_schedule (get_insns (), dump_file); mips_avoid_hazards (); + if (TUNE_MIPS4130 && TARGET_VR4130_ALIGN) + vr4130_align_insns (); } } @@ -9266,6 +9689,104 @@ mips_macc_chains_reorder (rtx *ready, int nready) } } +/* The last instruction to be scheduled. */ + +static rtx vr4130_last_insn; + +/* A note_stores callback used by vr4130_true_reg_dependence_p. DATA + points to an rtx that is initially an instruction. Nullify the rtx + if the instruction uses the value of register X. */ + +static void +vr4130_true_reg_dependence_p_1 (rtx x, rtx pat ATTRIBUTE_UNUSED, void *data) +{ + rtx *insn_ptr = data; + if (REG_P (x) + && *insn_ptr != 0 + && reg_referenced_p (x, PATTERN (*insn_ptr))) + *insn_ptr = 0; +} + +/* Return true if there is true register dependence between vr4130_last_insn + and INSN. */ + +static bool +vr4130_true_reg_dependence_p (rtx insn) +{ + note_stores (PATTERN (vr4130_last_insn), + vr4130_true_reg_dependence_p_1, &insn); + return insn == 0; +} + +/* A TUNE_MIPS4130 helper function. 
Given that INSN1 is at the head of + the ready queue and that INSN2 is the instruction after it, return + true if it is worth promoting INSN2 ahead of INSN1. Look for cases + in which INSN1 and INSN2 can probably issue in parallel, but for + which (INSN2, INSN1) should be less sensitive to instruction + alignment than (INSN1, INSN2). See 4130.md for more details. */ + +static bool +vr4130_swap_insns_p (rtx insn1, rtx insn2) +{ + rtx dep; + + /* Check for the following case: + + 1) there is some other instruction X with an anti dependence on INSN1; + 2) X has a higher priority than INSN2; and + 3) X is an arithmetic instruction (and thus has no unit restrictions). + + If INSN1 is the last instruction blocking X, it would be better to + choose (INSN1, X) over (INSN2, INSN1). */ + for (dep = INSN_DEPEND (insn1); dep != 0; dep = XEXP (dep, 1)) + if (REG_NOTE_KIND (dep) == REG_DEP_ANTI + && INSN_PRIORITY (XEXP (dep, 0)) > INSN_PRIORITY (insn2) + && recog_memoized (XEXP (dep, 0)) >= 0 + && get_attr_vr4130_class (XEXP (dep, 0)) == VR4130_CLASS_ALU) + return false; + + if (vr4130_last_insn != 0 + && recog_memoized (insn1) >= 0 + && recog_memoized (insn2) >= 0) + { + /* See whether INSN1 and INSN2 use different execution units, + or if they are both ALU-type instructions. If so, they can + probably execute in parallel. */ + enum attr_vr4130_class class1 = get_attr_vr4130_class (insn1); + enum attr_vr4130_class class2 = get_attr_vr4130_class (insn2); + if (class1 != class2 || class1 == VR4130_CLASS_ALU) + { + /* If only one of the instructions has a dependence on + vr4130_last_insn, prefer to schedule the other one first. */ + bool dep1 = vr4130_true_reg_dependence_p (insn1); + bool dep2 = vr4130_true_reg_dependence_p (insn2); + if (dep1 != dep2) + return dep1; + + /* Prefer to schedule INSN2 ahead of INSN1 if vr4130_last_insn + is not an ALU-type instruction and if INSN1 uses the same + execution unit. 
(Note that if this condition holds, we already + know that INSN2 uses a different execution unit.) */ + if (class1 != VR4130_CLASS_ALU + && recog_memoized (vr4130_last_insn) >= 0 + && class1 == get_attr_vr4130_class (vr4130_last_insn)) + return true; + } + } + return false; +} + +/* A TUNE_MIPS4130 helper function. (READY, NREADY) describes a ready + queue with at least two instructions. Swap the first two if + vr4130_swap_insns_p says that it could be worthwhile. */ + +static void +vr4130_reorder (rtx *ready, int nready) +{ + if (vr4130_swap_insns_p (ready[nready - 1], ready[nready - 2])) + mips_promote_ready (ready, nready - 2, nready - 1); +} + /* Remove the instruction at index LOWER from ready queue READY and reinsert it in front of the instruction at index HIGHER. LOWER must be <= HIGHER. */ @@ -9295,6 +9816,13 @@ mips_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, if (*nreadyp > 0) mips_macc_chains_reorder (ready, *nreadyp); } + if (reload_completed && TUNE_MIPS4130 && !TARGET_VR4130_ALIGN) + { + if (cycle == 0) + vr4130_last_insn = 0; + if (*nreadyp > 1) + vr4130_reorder (ready, *nreadyp); + } return mips_issue_rate (); } @@ -9315,6 +9843,7 @@ mips_variable_issue (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, more--; if (!reload_completed && TUNE_MACC_CHAINS) mips_macc_chains_record (insn); + vr4130_last_insn = insn; break; } return more; @@ -9339,6 +9868,7 @@ mips_issue_rate (void) { switch (mips_tune) { + case PROCESSOR_R4130: case PROCESSOR_R5400: case PROCESSOR_R5500: case PROCESSOR_R7000: @@ -9368,6 +9898,7 @@ mips_use_dfa_pipeline_interface (void) { switch (mips_tune) { + case PROCESSOR_R4130: case PROCESSOR_R5400: case PROCESSOR_R5500: case PROCESSOR_R7000: diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 37111cad9a7..e23abf1658d 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -171,7 +171,7 @@ extern const struct mips_cpu_info *mips_tune_info; #define MASK_FIX_R4400 
0x01000000 /* Work around R4400 errata. */ #define MASK_FIX_SB1 0x02000000 /* Work around SB-1 errata. */ #define MASK_FIX_VR4120 0x04000000 /* Work around VR4120 errata. */ - +#define MASK_VR4130_ALIGN 0x08000000 /* Perform VR4130 alignment opts. */ #define MASK_FP_EXCEPTIONS 0x10000000 /* FP exceptions are enabled. */ /* Debug switches, not documented */ @@ -253,6 +253,7 @@ extern const struct mips_cpu_info *mips_tune_info; /* Work around R4400 errata. */ #define TARGET_FIX_R4400 (target_flags & MASK_FIX_R4400) #define TARGET_FIX_VR4120 (target_flags & MASK_FIX_VR4120) +#define TARGET_VR4130_ALIGN (target_flags & MASK_VR4130_ALIGN) #define TARGET_FP_EXCEPTIONS (target_flags & MASK_FP_EXCEPTIONS) @@ -332,6 +333,8 @@ extern const struct mips_cpu_info *mips_tune_info; #define TUNE_MIPS3000 (mips_tune == PROCESSOR_R3000) #define TUNE_MIPS3900 (mips_tune == PROCESSOR_R3900) #define TUNE_MIPS4000 (mips_tune == PROCESSOR_R4000) +#define TUNE_MIPS4120 (mips_tune == PROCESSOR_R4120) +#define TUNE_MIPS4130 (mips_tune == PROCESSOR_R4130) #define TUNE_MIPS5000 (mips_tune == PROCESSOR_R5000) #define TUNE_MIPS5400 (mips_tune == PROCESSOR_R5400) #define TUNE_MIPS5500 (mips_tune == PROCESSOR_R5500) @@ -371,7 +374,9 @@ extern const struct mips_cpu_info *mips_tune_info; Multiply-accumulate instructions are a bigger win for some targets than others, so this macro is defined on an opt-in basis. 
*/ -#define TUNE_MACC_CHAINS TUNE_MIPS5500 +#define TUNE_MACC_CHAINS (TUNE_MIPS5500 \ + || TUNE_MIPS4120 \ + || TUNE_MIPS4130) #define TARGET_OLDABI (mips_abi == ABI_32 || mips_abi == ABI_O64) #define TARGET_NEWABI (mips_abi == ABI_N32 || mips_abi == ABI_64) @@ -619,6 +624,10 @@ extern const struct mips_cpu_info *mips_tune_info; N_("Don't generate fused multiply/add instructions")}, \ {"fused-madd", -MASK_NO_FUSED_MADD, \ N_("Generate fused multiply/add instructions")}, \ + {"vr4130-align", MASK_VR4130_ALIGN, \ + N_("Perform VR4130-specific alignment optimizations")}, \ + {"no-vr4130-align", -MASK_VR4130_ALIGN, \ + N_("Don't perform VR4130-specific alignment optimizations")}, \ {"fix4300", MASK_4300_MUL_FIX, \ N_("Work around early 4300 hardware bug")}, \ {"no-fix4300", -MASK_4300_MUL_FIX, \ diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md index cdb50059704..e9682464a46 100644 --- a/gcc/config/mips/mips.md +++ b/gcc/config/mips/mips.md @@ -631,6 +631,7 @@ ;; Include scheduling descriptions. +(include "4130.md") (include "5400.md") (include "5500.md") (include "7000.md") @@ -1584,6 +1585,37 @@ (set_attr "mode" "SI") (set_attr "length" "8")]) +;; On the VR4120 and VR4130, it is better to use "mtlo $0; macc" instead +;; of "mult; mflo". They have the same latency, but the first form gives +;; us an extra cycle to compute the operands. 
+ +;; Operand 0: LO +;; Operand 1: GPR (1st multiplication operand) +;; Operand 2: GPR (2nd multiplication operand) +;; Operand 3: HI +;; Operand 4: GPR (destination) +(define_peephole2 + [(parallel + [(set (match_operand:SI 0 "register_operand" "") + (mult:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "register_operand" ""))) + (clobber (match_operand:SI 3 "register_operand" ""))]) + (set (match_operand:SI 4 "register_operand" "") + (unspec:SI [(match_dup 0) (match_dup 3)] UNSPEC_MFHILO))] + "ISA_HAS_MACC && !GENERATE_MULT3_SI" + [(set (match_dup 0) + (const_int 0)) + (parallel + [(set (match_dup 0) + (plus:SI (mult:SI (match_dup 1) + (match_dup 2)) + (match_dup 0))) + (set (match_dup 4) + (plus:SI (mult:SI (match_dup 1) + (match_dup 2)) + (match_dup 0))) + (clobber (match_dup 3))])]) + ;; Multiply-accumulate patterns ;; For processors that can copy the output to a general register: @@ -1673,7 +1705,10 @@ else if (TARGET_MIPS5500) return "madd\t%1,%2"; else - return "macc\t%.,%1,%2"; + /* The VR4130 assumes that there is a two-cycle latency between a macc + that "writes" to $0 and an instruction that reads from it. We avoid + this by assigning to $1 instead. */ + return "%[macc\t%@,%1,%2%]"; } [(set_attr "type" "imadd") (set_attr "mode" "SI")]) @@ -1697,6 +1732,31 @@ [(set_attr "type" "imadd") (set_attr "mode" "SI")]) +;; An msac-like instruction implemented using negation and a macc. 
+(define_insn_and_split "*msac_using_macc" + [(set (match_operand:SI 0 "register_operand" "=l,d") + (minus:SI (match_operand:SI 1 "register_operand" "0,l") + (mult:SI (match_operand:SI 2 "register_operand" "d,d") + (match_operand:SI 3 "register_operand" "d,d")))) + (clobber (match_scratch:SI 4 "=h,h")) + (clobber (match_scratch:SI 5 "=X,1")) + (clobber (match_scratch:SI 6 "=d,d"))] + "ISA_HAS_MACC && !ISA_HAS_MSAC" + "#" + "&& reload_completed" + [(set (match_dup 6) + (neg:SI (match_dup 3))) + (parallel + [(set (match_dup 0) + (plus:SI (mult:SI (match_dup 2) + (match_dup 6)) + (match_dup 1))) + (clobber (match_dup 4)) + (clobber (match_dup 5))])] + "" + [(set_attr "type" "imadd") + (set_attr "length" "8")]) + ;; Patterns generated by the define_peephole2 below. (define_insn "*macc2" @@ -2367,7 +2427,8 @@ else if (TARGET_MIPS5500) return "maddu\t%1,%2"; else - return "maccu\t%.,%1,%2"; + /* See comment in *macc. */ + return "%[maccu\t%@,%1,%2%]"; } [(set_attr "type" "imadd") (set_attr "mode" "SI")]) @@ -2387,7 +2448,8 @@ else if (TARGET_MIPS5500) return "madd\t%1,%2"; else - return "macc\t%.,%1,%2"; + /* See comment in *macc. */ + return "%[macc\t%@,%1,%2%]"; } [(set_attr "type" "imadd") (set_attr "mode" "SI")]) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 7bafb8cd023..ea27ed0cc3d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -483,7 +483,8 @@ in the following sections. -mfix-vr4120 -mno-fix-vr4120 -mfix-sb1 -mno-fix-sb1 @gol -mflush-func=@var{func} -mno-flush-func @gol -mbranch-likely -mno-branch-likely @gol --mfp-exceptions -mno-fp-exceptions} +-mfp-exceptions -mno-fp-exceptions @gol +-mvr4130-align -mno-vr4130-align} @emph{i386 and x86-64 Options} @gccoptlist{-mtune=@var{cpu-type} -march=@var{cpu-type} @gol @@ -8245,6 +8246,18 @@ enabled. For instance, on the SB-1, if FP exceptions are disabled, and we are emitting 64-bit code, then we can use both FP pipes. Otherwise, we can only use one FP pipe. 
+ +@item -mvr4130-align +@itemx -mno-vr4130-align +@opindex mvr4130-align +The VR4130 pipeline is two-way superscalar, but can only issue two +instructions together if the first one is 8-byte aligned. When this +option is enabled, GCC will align pairs of instructions that it +thinks should execute in parallel. + +This option only has an effect when optimizing for the VR4130. +It normally makes code faster, but at the expense of making it bigger. +It is enabled by default at optimization level @option{-O3}. @end table @node i386 and x86-64 Options -- 2.30.2