From 823b018e5b1196d810790559357447948f644548 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 Mar 2011 10:37:01 +0100
Subject: job control: Small reorganization of wait_consider_task()

Move EXIT_DEAD test in wait_consider_task() above ptrace check.  As
ptraced tasks can't be EXIT_DEAD, this change doesn't cause any
behavior change.  This is to prepare for further changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..b4a935c72159 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1537,6 +1537,10 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
+	/* dead body doesn't have much to contribute */
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
 	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
@@ -1546,9 +1550,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
-
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
-- 
cgit v1.2.3


From 9b84cca2564b9a5b2d064fb44d2a55a5b44473a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 Mar 2011 10:37:01 +0100
Subject: job control: Fix ptracer wait(2) hang and explain notask_error
 clearing

wait(2) and friends allow access to stopped/continued states through
zombies, which is required as the states are process-wide and should
be accessible whether the leader task is alive or undead.
wait_consider_task() implements this by always clearing notask_error
and going through wait_task_stopped/continued() for unreaped zombies.

However, while ptraced, the stopped state is per-task and as such if
the ptracee became a zombie, there's no further stopped event to
listen to and wait(2) and friends should return -ECHILD on the tracee.

Fix it by clearing notask_error only if WCONTINUED | WEXITED is set
for ptraced zombies.  While at it, document why clearing notask_error
is safe for each case.

Test case follows.

  #include <stdio.h>
  #include <unistd.h>
  #include <pthread.h>
  #include <time.h>
  #include <sys/types.h>
  #include <sys/ptrace.h>
  #include <sys/wait.h>

  static void *nooper(void *arg)
  {
	  pause();
	  return NULL;
  }

  int main(void)
  {
	  const struct timespec ts1s = { .tv_sec = 1 };
	  pid_t tracee, tracer;
	  siginfo_t si;

	  tracee = fork();
	  if (tracee == 0) {
		  pthread_t thr;

		  pthread_create(&thr, NULL, nooper, NULL);
		  nanosleep(&ts1s, NULL);
		  printf("tracee exiting\n");
		  pthread_exit(NULL);	/* let subthread run */
	  }

	  tracer = fork();
	  if (tracer == 0) {
		  ptrace(PTRACE_ATTACH, tracee, NULL, NULL);
		  while (1) {
			  if (waitid(P_PID, tracee, &si, WSTOPPED) < 0) {
				  perror("waitid");
				  break;
			  }
			  ptrace(PTRACE_CONT, tracee, NULL,
				 (void *)(long)si.si_status);
		  }
		  return 0;
	  }

	  waitid(P_PID, tracer, &si, WEXITED);
	  kill(tracee, SIGKILL);
	  return 0;
  }

Before the patch, after the tracee becomes a zombie, the tracer's
waitid(WSTOPPED) never returns and the program doesn't terminate.

  tracee exiting
  ^C

After the patch, tracee exiting triggers waitid() to fail.

  tracee exiting
  waitid: No child processes

-v2: Oleg pointed out that exited in addition to continued can happen
     for ptraced dead group leader.  Clear notask_error for ptraced
     child on WEXITED too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index b4a935c72159..84d13d6bb30b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1550,17 +1550,41 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	/*
-	 * We don't reap group leaders with subthreads.
-	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
-		return wait_task_zombie(wo, p);
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
+		/* we don't reap group leaders with subthreads */
+		if (!delay_group_leader(p))
+			return wait_task_zombie(wo, p);
 
-	/*
-	 * It's stopped or running now, so it might
-	 * later continue, exit, or stop again.
-	 */
-	wo->notask_error = 0;
+		/*
+		 * Allow access to stopped/continued state via zombie by
+		 * falling through.  Clearing of notask_error is complex.
+		 *
+		 * When !@ptrace:
+		 *
+		 * If WEXITED is set, notask_error should naturally be
+		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
+		 * so, if there are live subthreads, there are events to
+		 * wait for.  If all subthreads are dead, it's still safe
+		 * to clear - this function will be called again in finite
+		 * amount time once all the subthreads are released and
+		 * will then return without clearing.
+		 *
+		 * When @ptrace:
+		 *
+		 * Stopped state is per-task and thus can't change once the
+		 * target task dies.  Only continued and exited can happen.
+		 * Clear notask_error if WCONTINUED | WEXITED.
+		 */
+		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+			wo->notask_error = 0;
+	} else {
+		/*
+		 * @p is alive and it's gonna stop, continue or exit, so
+		 * there always is something to wait for.
+		 */
+		wo->notask_error = 0;
+	}
 
 	if (task_stopped_code(p, ptrace))
 		return wait_task_stopped(wo, ptrace, p);
-- 
cgit v1.2.3


From 45cb24a1da53beb70f09efccc0373f6a47a9efe0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 Mar 2011 10:37:01 +0100
Subject: job control: Allow access to job control events through ptracees

Currently a real parent can't access job control stopped/continued
events through a ptraced child.  This utterly breaks job control when
the children are ptraced.

For example, if a program is run from an interactive shell and then
strace(1) attaches to it, pressing ^Z would send SIGTSTP and strace(1)
would notice it but the shell has no way to tell whether the child
entered job control stop and thus can't tell when to take over the
terminal - leading to awkward lone ^Z on the terminal.

Because the job control and ptrace stopped states are independent,
there is no reason to prevent real parents from accessing the stopped
state regardless of ptrace.  The continued state isn't separate but
ptracers don't have any use for them as ptracees can never resume
without explicit command from their ptracers, so as long as ptracers
don't consume it, it should be fine.

Although this is a behavior change, because the previous behavior is
utterly broken when viewed from real parents and the change is only
visible to real parents, I don't think it's necessary to make this
behavior optional.

One situation to be careful about is when a task from the real
parent's group is ptracing.  The parent group is the recipient of both
ptrace and job control stop events and one stop can be reported as
both job control and ptrace stops.  As this can break the current
ptrace users, suppress job control stopped events for these cases.

If a real parent ptracer wants to know about both job control and
ptrace stops, it can create a separate process to serve the role of
real parent.

Note that this only updates wait(2) side of things.  The real parent
can access the states via wait(2) but still is not properly notified
(woken up and delivered signal).  Test case polls wait(2) with WNOHANG
to work around.  Notification will be updated by future patches.

Test case follows.

  #include <stdio.h>
  #include <unistd.h>
  #include <time.h>
  #include <errno.h>
  #include <sys/types.h>
  #include <sys/ptrace.h>
  #include <sys/wait.h>

  int main(void)
  {
	  const struct timespec ts100ms = { .tv_nsec = 100000000 };
	  pid_t tracee, tracer;
	  siginfo_t si;
	  int i;

	  tracee = fork();
	  if (tracee == 0) {
		  while (1) {
			  printf("tracee: SIGSTOP\n");
			  raise(SIGSTOP);
			  nanosleep(&ts100ms, NULL);
			  printf("tracee: SIGCONT\n");
			  raise(SIGCONT);
			  nanosleep(&ts100ms, NULL);
		  }
	  }

	  waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT);

	  tracer = fork();
	  if (tracer == 0) {
		  nanosleep(&ts100ms, NULL);
		  ptrace(PTRACE_ATTACH, tracee, NULL, NULL);

		  for (i = 0; i < 11; i++) {
			  si.si_pid = 0;
			  waitid(P_PID, tracee, &si, WSTOPPED);
			  if (si.si_pid && si.si_code == CLD_TRAPPED)
				  ptrace(PTRACE_CONT, tracee, NULL,
					 (void *)(long)si.si_status);
		  }
		  printf("tracer: EXITING\n");
		  return 0;
	  }

	  while (1) {
		  si.si_pid = 0;
		  waitid(P_PID, tracee, &si,
			 WSTOPPED | WCONTINUED | WEXITED | WNOHANG);
		  if (si.si_pid)
			  printf("mommy : WAIT status=%02d code=%02d\n",
				 si.si_status, si.si_code);
		  nanosleep(&ts100ms, NULL);
	  }
	  return 0;
  }

Before the patch, while ptraced, the parent can't see any job control
events.

  tracee: SIGSTOP
  mommy : WAIT status=19 code=05
  tracee: SIGCONT
  tracee: SIGSTOP
  tracee: SIGCONT
  tracee: SIGSTOP
  tracee: SIGCONT
  tracee: SIGSTOP
  tracer: EXITING
  mommy : WAIT status=19 code=05
  ^C

After the patch,

  tracee: SIGSTOP
  mommy : WAIT status=19 code=05
  tracee: SIGCONT
  mommy : WAIT status=18 code=06
  tracee: SIGSTOP
  mommy : WAIT status=19 code=05
  tracee: SIGCONT
  mommy : WAIT status=18 code=06
  tracee: SIGSTOP
  mommy : WAIT status=19 code=05
  tracee: SIGCONT
  mommy : WAIT status=18 code=06
  tracee: SIGSTOP
  tracer: EXITING
  mommy : WAIT status=19 code=05
  ^C

-v2: Oleg pointed out that wait(2) should be suppressed for the real
     parent's group instead of only the real parent task itself.
     Updated accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 84d13d6bb30b..1a0f10f0a4db 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1541,17 +1541,19 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (p->exit_state == EXIT_DEAD)
 		return 0;
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
 		/*
-		 * This child is hidden by ptrace.
-		 * We aren't allowed to see it now, but eventually we will.
+		 * A zombie ptracee is only visible to its ptracer.
+		 * Notification and reaping will be cascaded to the real
+		 * parent when the ptracer detaches.
 		 */
-		wo->notask_error = 0;
-		return 0;
-	}
+		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+			/* it will become visible, clear notask_error */
+			wo->notask_error = 0;
+			return 0;
+		}
 
-	/* slay zombie? */
-	if (p->exit_state == EXIT_ZOMBIE) {
 		/* we don't reap group leaders with subthreads */
 		if (!delay_group_leader(p))
 			return wait_task_zombie(wo, p);
@@ -1579,6 +1581,20 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
 			wo->notask_error = 0;
 	} else {
+		/*
+		 * If @p is ptraced by a task in its real parent's group,
+		 * hide group stop/continued state when looking at @p as
+		 * the real parent; otherwise, a single stop can be
+		 * reported twice as group and ptrace stops.
+		 *
+		 * If a ptracer wants to distinguish the two events for its
+		 * own children, it should create a separate process which
+		 * takes the role of real parent.
+		 */
+		if (likely(!ptrace) && task_ptrace(p) &&
+		    same_thread_group(p->parent, p->real_parent))
+			return 0;
+
 		/*
 		 * @p is alive and it's gonna stop, continue or exit, so
 		 * there always is something to wait for.
@@ -1586,9 +1602,18 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		wo->notask_error = 0;
 	}
 
+	/*
+	 * Wait for stopped.  Depending on @ptrace, different stopped state
+	 * is used and the two don't interact with each other.
+	 */
 	if (task_stopped_code(p, ptrace))
 		return wait_task_stopped(wo, ptrace, p);
 
+	/*
+	 * Wait for continued.  There's only one continued state and the
+	 * ptracer can consume it which can confuse the real parent.  Don't
+	 * use WCONTINUED from ptracer.  You don't need or want it.
+	 */
 	return wait_task_continued(wo, p);
 }
 
-- 
cgit v1.2.3


From 19e274630c9e23a84d5940af83cf5db35103f968 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 12 May 2011 10:47:23 +0200
Subject: job control: reorganize wait_task_stopped()

wait_task_stopped() tested task_stopped_code() without acquiring
siglock and, if stop condition existed, called wait_task_stopped() and
directly returned the result.  This patch moves the initial
task_stopped_code() testing into wait_task_stopped() and make
wait_consider_task() fall through to wait_task_continue() on 0 return.

This is for the following two reasons.

* Because the initial task_stopped_code() test is done without
  acquiring siglock, it may race against SIGCONT generation.  The
  stopped condition might have been replaced by continued state by the
  time wait_task_stopped() acquired siglock.  This may lead to
  unexpected failure of WNOHANG waits.

  This reorganization addresses this single race case but there are
  other cases - TASK_RUNNING -> TASK_STOPPED transition and EXIT_*
  transitions.

* Scheduled ptrace updates require changes to the initial test which
  would fit better inside wait_task_stopped().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 5cbc83e83a5d..33837936b98c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1377,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
 	return NULL;
 }
 
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
- * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
- * the lock and this task is uninteresting.  If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero.  Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue.  Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
  */
 static int wait_task_stopped(struct wait_opts *wo,
 				int ptrace, struct task_struct *p)
@@ -1397,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo,
 	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 
+	if (!task_stopped_code(p, ptrace))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1607,8 +1622,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	 * Wait for stopped.  Depending on @ptrace, different stopped state
 	 * is used and the two don't interact with each other.
 	 */
-	if (task_stopped_code(p, ptrace))
-		return wait_task_stopped(wo, ptrace, p);
+	ret = wait_task_stopped(wo, ptrace, p);
+	if (ret)
+		return ret;
 
 	/*
 	 * Wait for continued.  There's only one continued state and the
-- 
cgit v1.2.3