Fix race exposed by gdb.threads/killed.exp

author Pedro Alves <palves@redhat.com>

Thu, 19 Mar 2015 15:12:33 +0000 (15:12 +0000)

committer Pedro Alves <palves@redhat.com>

Thu, 19 Mar 2015 17:07:38 +0000 (17:07 +0000)
author Pedro Alves <palves@redhat.com>
Thu, 19 Mar 2015 15:12:33 +0000 (15:12 +0000)
committer Pedro Alves <palves@redhat.com>
Thu, 19 Mar 2015 17:07:38 +0000 (17:07 +0000)
diff --git a/gdb/ChangeLog b/gdb/ChangeLog

index 7ae3c58347776ebd9c2fac1201ec637e884fb124..a8b8850f7f663721f91d8d424b63a175f7a34592 100644 (file)
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,3 +1,16 @@
+2015-03-19  Pedro Alves  <palves@redhat.com>
+
+       * linux-nat.c (linux_resume_one_lwp): Rename to ...
+       (linux_resume_one_lwp_throw): ... this.  Don't handle ESRCH here,
+       instead call perror_with_name.
+       (check_ptrace_stopped_lwp_gone): New function.
+       (linux_resume_one_lwp): Reimplement as wrapper around
+       linux_resume_one_lwp_throw that swallows errors if the LWP is
+       gone.
+       (resume_stopped_resumed_lwps): Try register reads in TRY/CATCH and
+       swallow errors if the LWP is gone.  Use
+       linux_resume_one_lwp_throw instead of linux_resume_one_lwp.
+
  2015-03-19  Pedro Alves  <palves@redhat.com>
  
         * linux-nat.c (status_callback): Return early if the LWP has no
diff --git a/gdb/gdbserver/ChangeLog b/gdb/gdbserver/ChangeLog

index 0383e67c59a3f1646bca74b3dfdc34fa5482bbed..cbd199b6c9f134f09a1ded70926646ebfea57756 100644 (file)
--- a/gdb/gdbserver/ChangeLog
+++ b/gdb/gdbserver/ChangeLog
@@ -1,3 +1,13 @@
+2015-03-19  Pedro Alves  <palves@redhat.com>
+
+       * linux-low.c (linux_resume_one_lwp): Rename to ...
+       (linux_resume_one_lwp_throw): ... this.  Don't handle ESRCH here,
+       instead call perror_with_name.
+       (check_ptrace_stopped_lwp_gone): New function.
+       (linux_resume_one_lwp): Reimplement as wrapper around
+       linux_resume_one_lwp_throw that swallows errors if the LWP is
+       gone.
+
  2015-03-19  Pedro Alves  <palves@redhat.com>
  
         * linux-low.c (count_events_callback, select_event_lwp_callback):
diff --git a/gdb/gdbserver/linux-low.c b/gdb/gdbserver/linux-low.c

index 2b988ec13611c0154bc575c16ab5f1bf4dcef04c..0c5411510f5f9bff8b8a28e20d1e278a531faafd 100644 (file)
--- a/gdb/gdbserver/linux-low.c
+++ b/gdb/gdbserver/linux-low.c
@@ -3379,13 +3379,12 @@ stop_all_lwps (int suspend, struct lwp_info *except)
      }
  }
  
-/* Resume execution of the inferior process.
-   If STEP is nonzero, single-step it.
-   If SIGNAL is nonzero, give it that signal.  */
+/* Resume execution of LWP.  If STEP is nonzero, single-step it.  If
+   SIGNAL is nonzero, give it that signal.  */
  
  static void
-linux_resume_one_lwp (struct lwp_info *lwp,
-                     int step, int signal, siginfo_t *info)
+linux_resume_one_lwp_throw (struct lwp_info *lwp,
+                           int step, int signal, siginfo_t *info)
  {
    struct thread_info *thread = get_lwp_thread (lwp);
    struct thread_info *saved_thread;
@@ -3566,8 +3565,6 @@ linux_resume_one_lwp (struct lwp_info *lwp,
  
    regcache_invalidate_thread (thread);
    errno = 0;
-  lwp->stopped = 0;
-  lwp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
    lwp->stepping = step;
    ptrace (step ? PTRACE_SINGLESTEP : PTRACE_CONT, lwpid_of (thread),
           (PTRACE_TYPE_ARG3) 0,
@@ -3577,19 +3574,68 @@ linux_resume_one_lwp (struct lwp_info *lwp,
  
    current_thread = saved_thread;
    if (errno)
+    perror_with_name ("resuming thread");
+
+  /* Successfully resumed.  Clear state that no longer makes sense,
+     and mark the LWP as running.  Must not do this before resuming
+     otherwise if that fails other code will be confused.  E.g., we'd
+     later try to stop the LWP and hang forever waiting for a stop
+     status.  Note that we must not throw after this is cleared,
+     otherwise check_ptrace_stopped_lwp_gone would get confused.  */
+  lwp->stopped = 0;
+  lwp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+}
+
+/* Called when we try to resume a stopped LWP and that errors out.  If
+   the LWP is no longer in ptrace-stopped state (meaning it's a zombie,
+   or about to become one), discard the error, clear any pending status
+   the LWP may have, and return true (we'll collect the exit status
+   soon enough).  Otherwise, return false.  */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+  struct thread_info *thread = get_lwp_thread (lp);
+
+  /* If we get an error after resuming the LWP successfully, we'd
+     confuse !T state for the LWP being gone.  */
+  gdb_assert (lp->stopped);
+
+  /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+     because even if ptrace failed with ESRCH, the tracee may be "not
+     yet fully dead", but already refusing ptrace requests.  In that
+     case the tracee has 'R (Running)' state for a little bit
+     (observed in Linux 3.18).  See also the note on ESRCH in the
+     ptrace(2) man page.  Instead, check whether the LWP has any state
+     other than ptrace-stopped.  */
+
+  /* Don't assume anything if /proc/PID/status can't be read.  */
+  if (linux_proc_pid_is_trace_stopped_nowarn (lwpid_of (thread)) == 0)
      {
-      /* ESRCH from ptrace either means that the thread was already
-        running (an error) or that it is gone (a race condition).  If
-        it's gone, we will get a notification the next time we wait,
-        so we can ignore the error.  We could differentiate these
-        two, but it's tricky without waiting; the thread still exists
-        as a zombie, so sending it signal 0 would succeed.  So just
-        ignore ESRCH.  */
-      if (errno == ESRCH)
-       return;
+      lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+      lp->status_pending_p = 0;
+      return 1;
+    }
+  return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+   disappears while we try to resume it.  */
  
-      perror_with_name ("ptrace");
+static void
+linux_resume_one_lwp (struct lwp_info *lwp,
+                     int step, int signal, siginfo_t *info)
+{
+  TRY
+    {
+      linux_resume_one_lwp_throw (lwp, step, signal, info);
+    }
+  CATCH (ex, RETURN_MASK_ERROR)
+    {
+      if (!check_ptrace_stopped_lwp_gone (lwp))
+       throw_exception (ex);
      }
+  END_CATCH
  }
  
  struct thread_resume_array
diff --git a/gdb/linux-nat.c b/gdb/linux-nat.c

index 40f1e1fce4056e4460d01f45e4dfec2359152822..8b620417a40d0b886ab56bf11daba98fdb7bb3e7 100644 (file)
--- a/gdb/linux-nat.c
+++ b/gdb/linux-nat.c
@@ -1503,7 +1503,8 @@ linux_nat_detach (struct target_ops *ops, const char *args, int from_tty)
     single-step it.  If SIGNAL is nonzero, give it that signal.  */
  
  static void
-linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+linux_resume_one_lwp_throw (struct lwp_info *lp, int step,
+                           enum gdb_signal signo)
  {
    lp->step = step;
  
@@ -1522,11 +1523,68 @@ linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
    if (linux_nat_prepare_to_resume != NULL)
      linux_nat_prepare_to_resume (lp);
    linux_ops->to_resume (linux_ops, lp->ptid, step, signo);
-  lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+
+  /* Successfully resumed.  Clear state that no longer makes sense,
+     and mark the LWP as running.  Must not do this before resuming
+     otherwise if that fails other code will be confused.  E.g., we'd
+     later try to stop the LWP and hang forever waiting for a stop
+     status.  Note that we must not throw after this is cleared,
+     otherwise check_ptrace_stopped_lwp_gone would get confused.  */
    lp->stopped = 0;
+  lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
    registers_changed_ptid (lp->ptid);
  }
  
+/* Called when we try to resume a stopped LWP and that errors out.  If
+   the LWP is no longer in ptrace-stopped state (meaning it's a zombie,
+   or about to become one), discard the error, clear any pending status
+   the LWP may have, and return true (we'll collect the exit status
+   soon enough).  Otherwise, return false.  */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+  /* If we get an error after resuming the LWP successfully, we'd
+     confuse !T state for the LWP being gone.  */
+  gdb_assert (lp->stopped);
+
+  /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+     because even if ptrace failed with ESRCH, the tracee may be "not
+     yet fully dead", but already refusing ptrace requests.  In that
+     case the tracee has 'R (Running)' state for a little bit
+     (observed in Linux 3.18).  See also the note on ESRCH in the
+     ptrace(2) man page.  Instead, check whether the LWP has any state
+     other than ptrace-stopped.  */
+
+  /* Don't assume anything if /proc/PID/status can't be read.  */
+  if (linux_proc_pid_is_trace_stopped_nowarn (ptid_get_lwp (lp->ptid)) == 0)
+    {
+      lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+      lp->status = 0;
+      lp->waitstatus.kind = TARGET_WAITKIND_IGNORE;
+      return 1;
+    }
+  return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+   disappears while we try to resume it.  */
+
+static void
+linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+{
+  TRY
+    {
+      linux_resume_one_lwp_throw (lp, step, signo);
+    }
+  CATCH (ex, RETURN_MASK_ERROR)
+    {
+      if (!check_ptrace_stopped_lwp_gone (lp))
+       throw_exception (ex);
+    }
+  END_CATCH
+}
+
  /* Resume LP.  */
  
  static void
@@ -3542,24 +3600,39 @@ resume_stopped_resumed_lwps (struct lwp_info *lp, void *data)
      {
        struct regcache *regcache = get_thread_regcache (lp->ptid);
        struct gdbarch *gdbarch = get_regcache_arch (regcache);
-      CORE_ADDR pc = regcache_read_pc (regcache);
  
-      /* Don't bother if there's a breakpoint at PC that we'd hit
-        immediately, and we're not waiting for this LWP.  */
-      if (!ptid_match (lp->ptid, *wait_ptid_p))
+      TRY
         {
-         if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
-           return 0;
-       }
+         CORE_ADDR pc = regcache_read_pc (regcache);
+         int leave_stopped = 0;
  
-      if (debug_linux_nat)
-       fprintf_unfiltered (gdb_stdlog,
-                           "RSRL: resuming stopped-resumed LWP %s at %s: step=%d\n",
-                           target_pid_to_str (lp->ptid),
-                           paddress (gdbarch, pc),
-                           lp->step);
+         /* Don't bother if there's a breakpoint at PC that we'd hit
+            immediately, and we're not waiting for this LWP.  */
+         if (!ptid_match (lp->ptid, *wait_ptid_p))
+           {
+             if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
+               leave_stopped = 1;
+           }
  
-      linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
+         if (!leave_stopped)
+           {
+             if (debug_linux_nat)
+               fprintf_unfiltered (gdb_stdlog,
+                                   "RSRL: resuming stopped-resumed LWP %s at "
+                                   "%s: step=%d\n",
+                                   target_pid_to_str (lp->ptid),
+                                   paddress (gdbarch, pc),
+                                   lp->step);
+
+             linux_resume_one_lwp_throw (lp, lp->step, GDB_SIGNAL_0);
+           }
+       }
+      CATCH (ex, RETURN_MASK_ERROR)
+       {
+         if (!check_ptrace_stopped_lwp_gone (lp))
+           throw_exception (ex);
+       }
+      END_CATCH
      }
  
    return 0;
diff --git a/gdb/nat/linux-procfs.c b/gdb/nat/linux-procfs.c

index f1493839f60254f1a96d2d3cb2784a38ecce9d41..7599b32ed3eade6072136e8cb99c9ec231802e3e 100644 (file)
--- a/gdb/nat/linux-procfs.c
+++ b/gdb/nat/linux-procfs.c
@@ -151,6 +151,15 @@ linux_proc_pid_is_stopped (pid_t pid)
    return linux_proc_pid_has_state (pid, "T (stopped)", 1);
  }
  
+/* Detect `T (tracing stop)' in `/proc/PID/status'.
+   Other states including `T (stopped)' are reported as false.  */
+
+int
+linux_proc_pid_is_trace_stopped_nowarn (pid_t pid)
+{
+  return linux_proc_pid_has_state (pid, "T (tracing stop)", 1);
+}
+
  /* Return non-zero if PID is a zombie.  If WARN, warn on failure to
     open the /proc file.  */
  
diff --git a/gdb/nat/linux-procfs.h b/gdb/nat/linux-procfs.h

index 979ae0dbd824765914708a6d5c314b7632140965..c4f57885b1d0da60e99d818a897d9414a874f3bf 100644 (file)
--- a/gdb/nat/linux-procfs.h
+++ b/gdb/nat/linux-procfs.h
@@ -36,6 +36,8 @@ extern pid_t linux_proc_get_tracerpid_nowarn (pid_t lwpid);
  
  extern int linux_proc_pid_is_stopped (pid_t pid);
  
+extern int linux_proc_pid_is_trace_stopped_nowarn (pid_t pid);
+
  /* Return non-zero if PID is a zombie.  Failure to open the
     /proc/pid/status file results in a warning.  */
author	Pedro Alves <palves@redhat.com>
	Thu, 19 Mar 2015 15:12:33 +0000 (15:12 +0000)
committer	Pedro Alves <palves@redhat.com>
	Thu, 19 Mar 2015 17:07:38 +0000 (17:07 +0000)
gdb/ChangeLog		patch \| blob \| history
gdb/gdbserver/ChangeLog		patch \| blob \| history
gdb/gdbserver/linux-low.c		patch \| blob \| history
gdb/linux-nat.c		patch \| blob \| history
gdb/nat/linux-procfs.c		patch \| blob \| history
gdb/nat/linux-procfs.h		patch \| blob \| history