From: Eric Botcazou
Date: Thu, 16 Jul 2020 15:50:26 +0000 (+0200)
Subject: Reunify x86 stack checking implementation
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f569026aa3088aa895ea39618d2998333b08600b;p=gcc.git

Reunify x86 stack checking implementation

The stack clash protection mechanism in the x86 back-end was implemented
by largely duplicating the existing stack checking implementation. Now
the only significant difference between them is the probing window, which
is shifted by 1 probing interval (not 2 as documented in explow.c), but we
can certainly do 1 more probe for stack checking even if it is redundant
in almost all cases.

gcc/ChangeLog:
	* config/i386/i386.c (ix86_compute_frame_layout): Minor tweak.
	(ix86_adjust_stack_and_probe): Delete.
	(ix86_adjust_stack_and_probe_stack_clash): Rename to above and add
	PROTECTION_AREA parameter. If it is true, probe PROBE_INTERVAL plus
	a small dope beyond SIZE bytes.
	(ix86_emit_probe_stack_range): Use local variable.
	(ix86_expand_prologue): Adjust calls to ix86_adjust_stack_and_probe
	and tidy up the stack checking code.
	* explow.c (get_stack_check_protect): Fix head comment.
	(anti_adjust_stack_and_probe_stack_clash): Likewise.
	(allocate_dynamic_stack_space): Add comment.
	* tree-nested.c (lookup_field_for_decl): Set the DECL_IGNORED_P and
	TREE_NO_WARNING but not TREE_ADDRESSABLE flags on the field.
---

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5c373c091ce..31757b044c8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -6169,10 +6169,7 @@ ix86_compute_frame_layout (void)
     }
 
   frame->save_regs_using_mov
-    = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
-       /* If static stack checking is enabled and done with probes,
-	  the registers need to be saved before allocating the frame. */
-       && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
+    = TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue;
 
   /* Skip return address and error code in exception handler. */
   offset = INCOMING_FRAME_SP_OFFSET;
@@ -6329,6 +6326,9 @@ ix86_compute_frame_layout (void)
 
   if ((!to_allocate && frame->nregs <= 1)
       || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+      /* If static stack checking is enabled and done with probes,
+	 the registers need to be saved before allocating the frame. */
+      || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
       /* If stack clash probing needs a loop, then it needs a
	 scratch register. But the returned register is only guaranteed
	 to be safe to use after register saves are complete. So if
@@ -7122,17 +7122,20 @@ release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
 
 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
 
-   This differs from the next routine in that it tries hard to prevent
-   attacks that jump the stack guard. Thus it is never allowed to allocate
-   more than PROBE_INTERVAL bytes of stack space without a suitable
-   probe.
+   If INT_REGISTERS_SAVED is true, then integer registers have already been
+   pushed on the stack.
 
-   INT_REGISTERS_SAVED is true if integer registers have already been
-   pushed on the stack. */
+   If PROTECTION AREA is true, then probe PROBE_INTERVAL plus a small dope
+   beyond SIZE bytes.
+
+   This assumes no knowledge of the current probing state, i.e. it is never
+   allowed to allocate more than PROBE_INTERVAL bytes of stack space without
+   a suitable probe. */
 
 static void
-ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
-					 const bool int_registers_saved)
+ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
+			     const bool int_registers_saved,
+			     const bool protection_area)
 {
   struct machine_function *m = cfun->machine;
 
@@ -7194,10 +7197,17 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
       emit_insn (gen_blockage ());
     }
 
+  const HOST_WIDE_INT probe_interval = get_probe_interval ();
+  const int dope = 4 * UNITS_PER_WORD;
+
+  /* If there is protection area, take it into account in the size. */
+  if (protection_area)
+    size += probe_interval + dope;
+
   /* If we allocate less than the size of the guard statically,
      then no probing is necessary, but we do need to allocate
     the stack. */
-  if (size < (1 << param_stack_clash_protection_guard_size))
+  else if (size < (1 << param_stack_clash_protection_guard_size))
     {
       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				 GEN_INT (-size), -1,
@@ -7209,7 +7219,6 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
   /* We're allocating a large enough stack frame that we need to
      emit probes. Either emit them inline or in a loop depending
     on the size. */
-  HOST_WIDE_INT probe_interval = get_probe_interval ();
   if (size <= 4 * probe_interval)
     {
       HOST_WIDE_INT i;
@@ -7228,12 +7237,19 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
	}
 
       /* We need to allocate space for the residual, but we do not need
-	 to probe the residual. */
+	 to probe the residual... */
       HOST_WIDE_INT residual = (i - probe_interval - size);
       if (residual)
-	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
-				   GEN_INT (residual), -1,
-				   m->fs.cfa_reg == stack_pointer_rtx);
+	{
+	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				     GEN_INT (residual), -1,
+				     m->fs.cfa_reg == stack_pointer_rtx);
+
+	  /* ...except if there is a protection area to maintain. */
+	  if (protection_area)
+	    emit_stack_probe (stack_pointer_rtx);
+	}
+
       dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
     }
   else
@@ -7296,186 +7312,27 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
	 is equal to ROUNDED_SIZE. */
 
       if (size != rounded_size)
-	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
-				   GEN_INT (rounded_size - size), -1,
-				   m->fs.cfa_reg == stack_pointer_rtx);
-      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
-
-      /* This does not deallocate the space reserved for the scratch
-	 register. That will be deallocated in the epilogue. */
-      release_scratch_register_on_entry (&sr, size, false);
-    }
-
-  /* Make sure nothing is scheduled before we are done. */
-  emit_insn (gen_blockage ());
-}
-
-/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
-
-   INT_REGISTERS_SAVED is true if integer registers have already been
-   pushed on the stack. */
-
-static void
-ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
-			     const bool int_registers_saved)
-{
-  /* We skip the probe for the first interval + a small dope of 4 words and
-     probe that many bytes past the specified size to maintain a protection
-     area at the botton of the stack. */
-  const int dope = 4 * UNITS_PER_WORD;
-  rtx size_rtx = GEN_INT (size), last;
-
-  /* See if we have a constant small number of probes to generate. If so,
-     that's the easy case. The run-time loop is made up of 9 insns in the
-     generic case while the compile-time loop is made up of 3+2*(n-1) insns
-     for n # of intervals. */
-  if (size <= 4 * get_probe_interval ())
-    {
-      HOST_WIDE_INT i, adjust;
-      bool first_probe = true;
-
-      /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
-	 values of N from 1 until it exceeds SIZE. If only one probe is
-	 needed, this will not generate any code. Then adjust and probe
-	 to PROBE_INTERVAL + SIZE. */
-      for (i = get_probe_interval (); i < size; i += get_probe_interval ())
	{
-	  if (first_probe)
-	    {
-	      adjust = 2 * get_probe_interval () + dope;
-	      first_probe = false;
-	    }
-	  else
-	    adjust = get_probe_interval ();
-
-	  emit_insn (gen_rtx_SET (stack_pointer_rtx,
-				  plus_constant (Pmode, stack_pointer_rtx,
-						 -adjust)));
-	  emit_stack_probe (stack_pointer_rtx);
-	}
-
-      if (first_probe)
-	adjust = size + get_probe_interval () + dope;
-      else
-	adjust = size + get_probe_interval () - i;
-
-      emit_insn (gen_rtx_SET (stack_pointer_rtx,
-			      plus_constant (Pmode, stack_pointer_rtx,
-					     -adjust)));
-      emit_stack_probe (stack_pointer_rtx);
-
-      /* Adjust back to account for the additional first interval. */
-      last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
-				     plus_constant (Pmode, stack_pointer_rtx,
-						    (get_probe_interval ()
-						     + dope))));
-    }
-
-  /* Otherwise, do the same as above, but in a loop. Note that we must be
-     extra careful with variables wrapping around because we might be at
-     the very top (or the very bottom) of the address space and we have
-     to be able to handle this case properly; in particular, we use an
-     equality test for the loop condition. */
-  else
-    {
-      /* We expect the GP registers to be saved when probes are used
-	 as the probing sequences might need a scratch register and
-	 the routine to allocate one assumes the integer registers
-	 have already been saved. */
-      gcc_assert (int_registers_saved);
-
-      HOST_WIDE_INT rounded_size;
-      struct scratch_reg sr;
-
-      get_scratch_register_on_entry (&sr);
-
-      /* If we needed to save a register, then account for any space
-	 that was pushed (we are not going to pop the register when
-	 we do the restore). */
-      if (sr.saved)
-	size -= UNITS_PER_WORD;
-
-      /* Step 1: round SIZE to the previous multiple of the interval. */
-
-      rounded_size = ROUND_DOWN (size, get_probe_interval ());
-
-
-      /* Step 2: compute initial and final value of the loop counter. */
-
-      /* SP = SP_0 + PROBE_INTERVAL. */
-      emit_insn (gen_rtx_SET (stack_pointer_rtx,
-			      plus_constant (Pmode, stack_pointer_rtx,
-					     - (get_probe_interval () + dope))));
-
-      /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
-      if (rounded_size <= (HOST_WIDE_INT_1 << 31))
-	emit_insn (gen_rtx_SET (sr.reg,
-				plus_constant (Pmode, stack_pointer_rtx,
-					       -rounded_size)));
-      else
-	{
-	  emit_move_insn (sr.reg, GEN_INT (-rounded_size));
-	  emit_insn (gen_rtx_SET (sr.reg,
-				  gen_rtx_PLUS (Pmode, sr.reg,
-						stack_pointer_rtx)));
-	}
-
-
-      /* Step 3: the loop
-
-	 do
-	   {
-	     SP = SP + PROBE_INTERVAL
-	     probe at SP
-	   }
-	 while (SP != LAST_ADDR)
-
-	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
-	 values of N from 1 until it is equal to ROUNDED_SIZE. */
-
-      emit_insn (gen_adjust_stack_and_probe (Pmode, sr.reg, sr.reg, size_rtx));
-
-
-      /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
-	 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
+	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				   GEN_INT (rounded_size - size), -1,
+				   m->fs.cfa_reg == stack_pointer_rtx);
 
-      if (size != rounded_size)
-	{
-	  emit_insn (gen_rtx_SET (stack_pointer_rtx,
-				  plus_constant (Pmode, stack_pointer_rtx,
-						 rounded_size - size)));
-	  emit_stack_probe (stack_pointer_rtx);
+      if (protection_area)
+	emit_stack_probe (stack_pointer_rtx);
	}
 
-      /* Adjust back to account for the additional first interval. */
-      last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
-				     plus_constant (Pmode, stack_pointer_rtx,
-						    (get_probe_interval ()
-						     + dope))));
+      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
 
       /* This does not deallocate the space reserved for the scratch
	 register. That will be deallocated in the epilogue. */
       release_scratch_register_on_entry (&sr, size, false);
     }
 
-  /* Even if the stack pointer isn't the CFA register, we need to correctly
-     describe the adjustments made to it, in particular differentiate the
-     frame-related ones from the frame-unrelated ones. */
-  if (size > 0)
-    {
-      rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
-      XVECEXP (expr, 0, 0)
-	= gen_rtx_SET (stack_pointer_rtx,
-		       plus_constant (Pmode, stack_pointer_rtx, -size));
-      XVECEXP (expr, 0, 1)
-	= gen_rtx_SET (stack_pointer_rtx,
-		       plus_constant (Pmode, stack_pointer_rtx,
-				      get_probe_interval () + dope + size));
-      add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
-      RTX_FRAME_RELATED_P (last) = 1;
-
-      cfun->machine->fs.sp_offset += size;
-    }
+  /* Adjust back to account for the protection area. */
+  if (protection_area)
+    pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
			       GEN_INT (probe_interval + dope), -1,
			       m->fs.cfa_reg == stack_pointer_rtx);
 
   /* Make sure nothing is scheduled before we are done. */
   emit_insn (gen_blockage ());
@@ -7527,18 +7384,20 @@ static void
 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
			     const bool int_registers_saved)
 {
+  const HOST_WIDE_INT probe_interval = get_probe_interval ();
+
   /* See if we have a constant small number of probes to generate. If so,
      that's the easy case. The run-time loop is made up of 6 insns in the
      generic case while the compile-time loop is made up of n insns
     for n # of intervals. */
-  if (size <= 6 * get_probe_interval ())
+  if (size <= 6 * probe_interval)
     {
       HOST_WIDE_INT i;
 
       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
	 it exceeds SIZE. If only one probe is needed, this will not
	 generate any code. Then probe at FIRST + SIZE. */
-      for (i = get_probe_interval (); i < size; i += get_probe_interval ())
+      for (i = probe_interval; i < size; i += probe_interval)
	emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					 -(first + i)));
 
@@ -7567,7 +7426,7 @@ ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
 
       /* Step 1: round SIZE to the previous multiple of the interval. */
 
-      rounded_size = ROUND_DOWN (size, get_probe_interval ());
+      rounded_size = ROUND_DOWN (size, probe_interval);
 
 
       /* Step 2: compute initial and final value of the loop counter. */
@@ -8324,27 +8183,33 @@ ix86_expand_prologue (void)
       sse_registers_saved = true;
     }
 
+  /* If stack clash protection is requested, then probe the stack. */
+  if (allocate >= 0 && flag_stack_clash_protection)
+    {
+      ix86_adjust_stack_and_probe (allocate, int_registers_saved, false);
+      allocate = 0;
+    }
+
   /* The stack has already been decremented by the instruction calling us
      so probe if the size is non-negative to preserve the protection area. */
-  if (allocate >= 0
-      && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
-	  || flag_stack_clash_protection))
+  else if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
     {
-      if (flag_stack_clash_protection)
-	{
-	  ix86_adjust_stack_and_probe_stack_clash (allocate,
-						   int_registers_saved);
-	  allocate = 0;
-	}
-      else if (STACK_CHECK_MOVING_SP)
+      const HOST_WIDE_INT probe_interval = get_probe_interval ();
+
+      if (STACK_CHECK_MOVING_SP)
	{
-	  if (!(crtl->is_leaf && !cfun->calls_alloca
-		&& allocate <= get_probe_interval ()))
+	  if (crtl->is_leaf
+	      && !cfun->calls_alloca
+	      && allocate <= probe_interval)
+	    ;
+
+	  else
	    {
-	      ix86_adjust_stack_and_probe (allocate, int_registers_saved);
+	      ix86_adjust_stack_and_probe (allocate, int_registers_saved, true);
	      allocate = 0;
	    }
	}
+
       else
	{
	  HOST_WIDE_INT size = allocate;
@@ -8356,7 +8221,7 @@ ix86_expand_prologue (void)
	    {
	      if (crtl->is_leaf && !cfun->calls_alloca)
		{
-		  if (size > get_probe_interval ())
+		  if (size > probe_interval)
		    ix86_emit_probe_stack_range (0, size, int_registers_saved);
		}
	      else
@@ -8368,7 +8233,7 @@ ix86_expand_prologue (void)
	    {
	      if (crtl->is_leaf && !cfun->calls_alloca)
		{
-		  if (size > get_probe_interval ()
+		  if (size > probe_interval
		      && size > get_stack_check_protect ())
		    ix86_emit_probe_stack_range (get_stack_check_protect (), (size
diff --git a/gcc/explow.c b/gcc/explow.c
index 15c9cfb0318..0fbc6d25b81 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -1293,9 +1293,9 @@ get_dynamic_stack_size (rtx *psize, unsigned size_align,
 
 /* Return the number of bytes to "protect" on the stack for -fstack-check.
 
-   "protect" in the context of -fstack-check means how many bytes we
-   should always ensure are available on the stack. More importantly
-   this is how many bytes are skipped when probing the stack.
+   "protect" in the context of -fstack-check means how many bytes we need
+   to always ensure are available on the stack; as a consequence, this is
+   also how many bytes are first skipped when probing the stack.
 
   On some targets we want to reuse the -fstack-check prologue support
   to give a degree of protection against stack clashing style attacks.
@@ -1303,14 +1303,16 @@ get_dynamic_stack_size (rtx *psize, unsigned size_align,
   In that scenario we do not want to skip bytes before probing as that
   would render the stack clash protections useless.
 
-   So we never use STACK_CHECK_PROTECT directly. Instead we indirect though
-   this helper which allows us to provide different values for
-   -fstack-check and -fstack-clash-protection. */
+   So we never use STACK_CHECK_PROTECT directly. Instead we indirectly
+   use it through this helper, which allows to provide different values
+   for -fstack-check and -fstack-clash-protection. */
+
 HOST_WIDE_INT
 get_stack_check_protect (void)
 {
   if (flag_stack_clash_protection)
     return 0;
+
   return STACK_CHECK_PROTECT;
 }
 
@@ -1532,6 +1534,8 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
 
   saved_stack_pointer_delta = stack_pointer_delta;
 
+  /* If stack checking or stack clash protection is requested,
+     then probe the stack while allocating space from it. */
   if (flag_stack_check && STACK_CHECK_MOVING_SP)
     anti_adjust_stack_and_probe (size, false);
   else if (flag_stack_clash_protection)
@@ -1940,8 +1944,8 @@ emit_stack_clash_protection_probe_loop_end (rtx loop_lab, rtx end_loop,
	probes were not emitted.
 
     2. It never skips probes, whereas anti_adjust_stack_and_probe will
-      skip probes on the first couple PROBE_INTERVALs on the assumption
-      they're done elsewhere.
+      skip the probe on the first PROBE_INTERVAL on the assumption it
+      was already done in the prologue and in previous allocations.
 
     3. It only allocates and probes SIZE bytes, it does not need to
	allocate/probe beyond that because this probing style does not
diff --git a/gcc/tree-nested.c b/gcc/tree-nested.c
index de168df70a1..4dc5533be84 100644
--- a/gcc/tree-nested.c
+++ b/gcc/tree-nested.c
@@ -305,13 +305,14 @@ lookup_field_for_decl (struct nesting_info *info, tree decl,
     }
   else
     {
-      TREE_TYPE (field) = TREE_TYPE (decl);
-      DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (decl);
-      SET_DECL_ALIGN (field, DECL_ALIGN (decl));
-      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (decl);
-      TREE_ADDRESSABLE (field) = TREE_ADDRESSABLE (decl);
-      DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (decl);
-      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (decl);
+      TREE_TYPE (field) = TREE_TYPE (decl);
+      DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (decl);
+      SET_DECL_ALIGN (field, DECL_ALIGN (decl));
+      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (decl);
+      DECL_IGNORED_P (field) = DECL_IGNORED_P (decl);
+      DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (decl);
+      TREE_NO_WARNING (field) = TREE_NO_WARNING (decl);
+      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (decl);
 
       /* Declare the transformation and adjust the original DECL. For a
	 variable or for a parameter when not optimizing, we make it point