Discussion:
[PATCH RFC] sched: Revert delayed_put_task_struct() and fix use after free
Kirill Tkhai
2014-10-15 12:31:40 UTC
This WARN_ON_ONCE() placed into __schedule() triggers a warning:

@@ -2852,6 +2852,7 @@ static void __sched __schedule(void)

if (likely(prev != next)) {
rq->nr_switches++;
+ WARN_ON_ONCE(atomic_read(&prev->usage) == 1);
rq->curr = next;
++*switch_count;

WARNING: CPU: 2 PID: 6497 at kernel/sched/core.c:2855 __schedule+0x656/0x8a0()
Modules linked in:
CPU: 2 PID: 6497 Comm: cat Not tainted 3.17.0+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
0000000000000009 ffff88022f50bdd8 ffffffff81518c78 0000000000000004
0000000000000000 ffff88022f50be18 ffffffff8104b1ac ffff88022f50be18
ffff880239912b40 ffff88022e5720d0 0000000000000002 0000000000000000
Call Trace:
[<ffffffff81518c78>] dump_stack+0x4f/0x7c
[<ffffffff8104b1ac>] warn_slowpath_common+0x7c/0xa0
[<ffffffff8104b275>] warn_slowpath_null+0x15/0x20
[<ffffffff8151bad6>] __schedule+0x656/0x8a0
[<ffffffff8151bd44>] schedule+0x24/0x70
[<ffffffff8104c7aa>] do_exit+0x72a/0xb40
[<ffffffff81071b31>] ? get_parent_ip+0x11/0x50
[<ffffffff8104da6a>] do_group_exit+0x3a/0xa0
[<ffffffff8104dadf>] SyS_exit_group+0xf/0x10
[<ffffffff8151fe92>] system_call_fastpath+0x12/0x17
---[ end trace d07155396c4faa0c ]---

This means the final put_task_struct() happens in violation of the RCU rules.
For the scheduler this may be a source of use-after-free.

task_numa_compare()                      schedule()
    rcu_read_lock()                          ...
    cur = ACCESS_ONCE(dst_rq->curr)          ...
    ...                                      rq->curr = next;
    ...                                      context_switch()
    ...                                        finish_task_switch()
    ...                                          put_task_struct()
    ...                                            __put_task_struct()
    ...                                              free_task_struct()
    task_numa_assign()                       ...
        get_task_struct()                    ...

If other subsystems hold a similar reference to a task, the problem is
possible there too.

Delayed put_task_struct() was introduced in commit 8c7904a00b06:
"task: RCU protect task->usage" on Fri Mar 31 02:31:37 2006.

It looks like it was safe to use it that way back then, but now it's not.
Something has changed (preemptible RCU?). Welcome to the analysis!

Signed-off-by: Kirill Tkhai <***@parallels.com>
---
include/linux/sched.h | 3 ++-
kernel/exit.c | 8 ++++----
kernel/fork.c | 1 -
3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e344bb..6bfc041 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1854,11 +1854,12 @@ extern void free_task(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)

extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_cb(struct rcu_head *rhp);

static inline void put_task_struct(struct task_struct *t)
{
if (atomic_dec_and_test(&t->usage))
- __put_task_struct(t);
+ call_rcu(&t->rcu, __put_task_struct_cb);
}

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019..326eae7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,15 +159,15 @@ static void __exit_signal(struct task_struct *tsk)
}
}

-static void delayed_put_task_struct(struct rcu_head *rhp)
+void __put_task_struct_cb(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
- put_task_struct(tsk);
+ __put_task_struct(tsk);
}
-
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);

void release_task(struct task_struct *p)
{
@@ -207,7 +207,7 @@ void release_task(struct task_struct *p)

write_unlock_irq(&tasklist_lock);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ put_task_struct(p);

p = leader;
if (unlikely(zap_leader))
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746..4d3ac3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -249,7 +249,6 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
-EXPORT_SYMBOL_GPL(__put_task_struct);

void __init __weak arch_task_cache_init(void) { }
Oleg Nesterov
2014-10-15 15:06:41 UTC
Post by Kirill Tkhai
@@ -2852,6 +2852,7 @@ static void __sched __schedule(void)
if (likely(prev != next)) {
rq->nr_switches++;
+ WARN_ON_ONCE(atomic_read(&prev->usage) == 1);
I think you know this, but let me clarify just in case: this WARN()
is wrong. prev->usage == 1 is fine if the task is doing its last schedule()
and it was already (auto)reaped.
Post by Kirill Tkhai
This means the final put_task_struct() happens in violation of the RCU rules.
Well, yes, it doesn't use delayed_put_task_struct(). But this should be fine,
this drops the extra reference created by dup_task_struct().

However,
Post by Kirill Tkhai
For the scheduler this may be a source of use-after-free.
task_numa_compare()                      schedule()
    rcu_read_lock()                          ...
    cur = ACCESS_ONCE(dst_rq->curr)          ...
    ...                                      rq->curr = next;
    ...                                      context_switch()
    ...                                        finish_task_switch()
    ...                                          put_task_struct()
    ...                                            __put_task_struct()
    ...                                              free_task_struct()
    task_numa_assign()                       ...
        get_task_struct()                    ...
Agreed. I don't understand this code (will try to take another look later),
but at first glance this looks wrong.

At least the code like

rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();

is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
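[ A minimal sketch of such a helper -- hypothetical here, nothing like it
  exists in the 3.17 tree -- built on atomic_inc_not_zero() so a reference
  is only taken if the count hasn't already dropped to zero:

	static inline struct task_struct *
	try_to_get_task_struct(struct task_struct *t)
	{
		/* refuse to "resurrect" a task whose final put already ran */
		if (atomic_inc_not_zero(&t->usage))
			return t;
		return NULL;
	}

  Note this alone only helps if the task_struct memory itself is
  guaranteed to stay valid while we poke at ->usage (e.g. via
  SLAB_DESTROY_BY_RCU), which is exactly where the discussion below
  ends up. ]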
Post by Kirill Tkhai
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1854,11 +1854,12 @@ extern void free_task(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_cb(struct rcu_head *rhp);
static inline void put_task_struct(struct task_struct *t)
{
if (atomic_dec_and_test(&t->usage))
- __put_task_struct(t);
+ call_rcu(&t->rcu, __put_task_struct_cb);
}
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019..326eae7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,15 +159,15 @@ static void __exit_signal(struct task_struct *tsk)
}
}
-static void delayed_put_task_struct(struct rcu_head *rhp)
+void __put_task_struct_cb(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
- put_task_struct(tsk);
+ __put_task_struct(tsk);
}
-
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
void release_task(struct task_struct *p)
{
@@ -207,7 +207,7 @@ void release_task(struct task_struct *p)
write_unlock_irq(&tasklist_lock);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ put_task_struct(p);
p = leader;
if (unlikely(zap_leader))
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746..4d3ac3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -249,7 +249,6 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
-EXPORT_SYMBOL_GPL(__put_task_struct);
void __init __weak arch_task_cache_init(void) { }
Hmm. I am not sure I understand how this patch can actually fix the problem.
It seems that it is still possible that get_task_struct() can be called after
call_rcu(__put_task_struct_cb)? But perhaps I misread this patch.

And I think it adds another problem. Suppose we have a zombie which already
called schedule() in TASK_DEAD state. IOW, its ->usage == 1, and its parent
will free this task when it calls sys_wait().

With this patch the code like

rcu_read_lock();
for_each_process(p) {
if (pred(p) {
get_task_struct(p);
return p;
}
}
rcu_read_unlock();

becomes unsafe: we can race with release_task(p), and get_task_struct() can
be called when p->usage is already 0, so this task_struct can be freed
once you drop rcu_read_lock().

Oleg.
Oleg Nesterov
2014-10-15 19:40:44 UTC
Post by Oleg Nesterov
Post by Kirill Tkhai
For the scheduler this may be a source of use-after-free.
task_numa_compare()                      schedule()
    rcu_read_lock()                          ...
    cur = ACCESS_ONCE(dst_rq->curr)          ...
    ...                                      rq->curr = next;
    ...                                      context_switch()
    ...                                        finish_task_switch()
    ...                                          put_task_struct()
    ...                                            __put_task_struct()
    ...                                              free_task_struct()
    task_numa_assign()                       ...
        get_task_struct()                    ...
Agreed. I don't understand this code (will try to take another look later),
but at first glance this looks wrong.
At least the code like
rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();
is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
Yes, but perhaps in this particular case another simple fix makes more
sense. The patch below needs a comment to explain that we check PF_EXITING
because:

1. It doesn't make sense to migrate the exiting task. Although perhaps
we could check ->mm == NULL instead.

But let me repeat that I do not understand this code, I am not sure
we can equally treat is_idle_task() and PF_EXITING here...

2. If PF_EXITING is not set (or ->mm != NULL) then delayed_put_task_struct()
won't be called until we drop rcu_read_lock(), and thus get_task_struct()
is safe.
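[ The ordering point 2 relies on, as a simplified sketch of the 3.17-era
  exit path (the exact call chains live in kernel/signal.c and
  kernel/exit.c):

	do_exit()
	    exit_signals(tsk);          /* sets PF_EXITING, early */
	    ...
	    exit_notify(tsk)            /* autoreap, or a later wait() -> */
	        release_task(tsk)
	            call_rcu(&tsk->rcu, delayed_put_task_struct);

  So a reader under rcu_read_lock() that observes PF_EXITING clear knows
  the call_rcu() above had not happened when the read-side section began,
  and the callback cannot run until the reader unlocks. ]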

And it seems that there is another problem: can't task_h_load(cur) race
with itself if 2 CPUs call task_numa_migrate() and inspect the same rq
in parallel? Again, I don't understand this code, but update_cfs_rq_h_load()
doesn't look "atomic". In fact I am not even sure about task_h_load(env->p);
p == current, but we do not disable preemption.

What do you think?

Oleg.

--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;

/*
Kirill Tkhai
2014-10-15 21:46:07 UTC
Yeah, you're right about the initial patch. Thanks for the explanation.
Post by Oleg Nesterov
Post by Oleg Nesterov
Post by Kirill Tkhai
For the scheduler this may be a source of use-after-free.
task_numa_compare()                      schedule()
    rcu_read_lock()                          ...
    cur = ACCESS_ONCE(dst_rq->curr)          ...
    ...                                      rq->curr = next;
    ...                                      context_switch()
    ...                                        finish_task_switch()
    ...                                          put_task_struct()
    ...                                            __put_task_struct()
    ...                                              free_task_struct()
    task_numa_assign()                       ...
        get_task_struct()                    ...
Agreed. I don't understand this code (will try to take another look later),
but at first glance this looks wrong.
At least the code like
rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();
is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
Yes, but perhaps in this particular case another simple fix makes more
sense. The patch below needs a comment to explain that we check PF_EXITING
1. It doesn't make sense to migrate the exiting task. Although perhaps
we could check ->mm == NULL instead.
But let me repeat that I do not understand this code, I am not sure
we can equally treat is_idle_task() and PF_EXITING here...
2. If PF_EXITING is not set (or ->mm != NULL) then delayed_put_task_struct()
won't be called until we drop rcu_read_lock(), and thus get_task_struct()
is safe.
Cool! Elegant fix. We set PF_EXITING in exit_signals(), which is earlier
than release_task() is called.

Shouldn't we use smp_rmb/smp_wmb here?
Post by Oleg Nesterov
And. it seems that there is another problem? Can't task_h_load(cur) race
with itself if 2 CPU's call task_numa_migrate() and inspect the same rq
in parallel? Again, I don't understand this code, but update_cfs_rq_h_load()
doesn't look "atomic". In fact I am not even sure about task_h_load(env->p),
p == current but we do not disable preemption.
What do you think?
We use it completely unlocked, so nothing good can come of that. Also, we
work with pointers.

As I understand it, in update_cfs_rq_h_load() we go from bottom to top,
and then from top to bottom. We set cfs_rq::h_load_next to be able
to do the top-to-bottom pass (the top is the root of the "tree").

Yeah, this "path" may be overwritten by a competitor. Also, the task may
change its cfs_rq.
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;
/*
Looks like we have to use the same fix for task_numa_group().

grp = rcu_dereference(tsk->numa_group);

Below we dereference grp->nr_tasks.

Also, the same applies in rt.c and deadline.c, but we do not take a second
reference there. A wrong pointer dereference is not possible there, so it's
not as bad.

Kirill
Kirill Tkhai
2014-10-15 22:02:38 UTC
Post by Kirill Tkhai
Yeah, you're right about the initial patch. Thanks for the explanation.
Post by Oleg Nesterov
Post by Oleg Nesterov
Post by Kirill Tkhai
For the scheduler this may be a source of use-after-free.
task_numa_compare()                      schedule()
    rcu_read_lock()                          ...
    cur = ACCESS_ONCE(dst_rq->curr)          ...
    ...                                      rq->curr = next;
    ...                                      context_switch()
    ...                                        finish_task_switch()
    ...                                          put_task_struct()
    ...                                            __put_task_struct()
    ...                                              free_task_struct()
    task_numa_assign()                       ...
        get_task_struct()                    ...
Agreed. I don't understand this code (will try to take another look later),
but at first glance this looks wrong.
At least the code like
rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();
is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
Yes, but perhaps in this particular case another simple fix makes more
sense. The patch below needs a comment to explain that we check PF_EXITING
1. It doesn't make sense to migrate the exiting task. Although perhaps
we could check ->mm == NULL instead.
But let me repeat that I do not understand this code, I am not sure
we can equally treat is_idle_task() and PF_EXITING here...
2. If PF_EXITING is not set (or ->mm != NULL) then delayed_put_task_struct()
won't be called until we drop rcu_read_lock(), and thus get_task_struct()
is safe.
Cool! Elegant fix. We set PF_EXITING in exit_signals(), which is earlier
than release_task() is called.
Shouldn't we use smp_rmb/smp_wmb here?
Post by Oleg Nesterov
And. it seems that there is another problem? Can't task_h_load(cur) race
with itself if 2 CPU's call task_numa_migrate() and inspect the same rq
in parallel? Again, I don't understand this code, but update_cfs_rq_h_load()
doesn't look "atomic". In fact I am not even sure about task_h_load(env->p),
p == current but we do not disable preemption.
What do you think?
We use it completely unlocked, so nothing good can come of that. Also, we
work with pointers.
As I understand it, in update_cfs_rq_h_load() we go from bottom to top,
and then from top to bottom. We set cfs_rq::h_load_next to be able
to do the top-to-bottom pass (the top is the root of the "tree").
Yeah, this "path" may be overwritten by a competitor. Also, the task may
change its cfs_rq.
Wrong, it's not a task... My brain is sleepy; better tomorrow.
Post by Kirill Tkhai
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;
/*
Looks like we have to use the same fix for task_numa_group().
grp = rcu_dereference(tsk->numa_group);
Below we dereference grp->nr_tasks.
Also, the same applies in rt.c and deadline.c, but we do not take a second
reference there. A wrong pointer dereference is not possible there, so it's
not as bad.
Kirill
Peter Zijlstra
2014-10-16 07:59:50 UTC
Post by Kirill Tkhai
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;
/*
Looks like, we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Kirill Tkhai
2014-10-16 08:16:44 UTC
On Thu, 16/10/2014 at 09:59 +0200, Peter Zijlstra wrote:
Post by Kirill Tkhai
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

 rcu_read_lock();
 cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 cur = NULL;

 /*

Looks like we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Isn't task_numa_group() fully preemptible?

It seems cpu_rq(cpu)->curr is not always equal to p.
Peter Zijlstra
2014-10-16 09:43:33 UTC
Post by Kirill Tkhai
On Thu, 16/10/2014 at 09:59 +0200, Peter Zijlstra wrote:
Post by Kirill Tkhai
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

 rcu_read_lock();
 cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 cur = NULL;

 /*

Looks like we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Post by Kirill Tkhai
Isn't task_numa_group() fully preemptible?
Not seeing how that is relevant.
Post by Kirill Tkhai
It seems cpu_rq(cpu)->curr is not always equal to p.
It should be, afaict:

task_numa_fault()
	p = current;

	task_numa_group(p, ..);

And like I said, idle tasks and PF_EXITING tasks should never get (numa)
faults, for they should never be touching userspace.
Kirill Tkhai
2014-10-16 09:50:02 UTC
On Thu, 16/10/2014 at 11:43 +0200, Peter Zijlstra wrote:
Post by Peter Zijlstra
Post by Kirill Tkhai
On Thu, 16/10/2014 at 09:59 +0200, Peter Zijlstra wrote:
Post by Kirill Tkhai
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

 rcu_read_lock();
 cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 cur = NULL;

 /*

Looks like we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Isn't task_numa_group() fully preemptible?
Not seeing how that is relevant.
Post by Kirill Tkhai
It seems cpu_rq(cpu)->curr is not always equal to p.
It should be, afaict:

task_numa_fault()
	p = current;

	task_numa_group(p, ..);

And like I said, idle tasks and PF_EXITING tasks should never get (numa)
faults, for they should never be touching userspace.
I mean p can be moved to another cpu.

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

tsk is not p (i.e. current) here.
Kirill Tkhai
2014-10-16 09:51:55 UTC
On Thu, 16/10/2014 at 13:50 +0400, Kirill Tkhai wrote:
Post by Kirill Tkhai
On Thu, 16/10/2014 at 11:43 +0200, Peter Zijlstra wrote:
Post by Peter Zijlstra
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

 rcu_read_lock();
 cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 cur = NULL;

 /*

Looks like we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
Post by Kirill Tkhai
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Isn't task_numa_group() fully preemptible?
Not seeing how that is relevant.
It seems cpu_rq(cpu)->curr is not always equal to p.
It should be, afaict:

task_numa_fault()
	p = current;

	task_numa_group(p, ..);

And like I said, idle tasks and PF_EXITING tasks should never get (numa)
faults, for they should never be touching userspace.
I mean p can be moved to another cpu.

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

tsk is not p (i.e. current) here.

Maybe I understand it wrong and preemption is disabled during a memory fault?
Kirill Tkhai
2014-10-16 10:04:53 UTC
On Thu, 16/10/2014 at 13:51 +0400, Kirill Tkhai wrote:
Post by Kirill Tkhai
On Thu, 16/10/2014 at 13:50 +0400, Kirill Tkhai wrote:
Post by Kirill Tkhai
On Thu, 16/10/2014 at 11:43 +0200, Peter Zijlstra wrote:
Post by Peter Zijlstra
Post by Oleg Nesterov
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas

 rcu_read_lock();
 cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 cur = NULL;

 /*

Looks like we have to use the same fix for task_numa_group().
Don't think so, task_numa_group() is only called from task_numa_fault(),
which runs on 'current', and neither idle nor PF_EXITING tasks should be
faulting.
Isn't task_numa_group() fully preemptible?
Not seeing how that is relevant.
It seems cpu_rq(cpu)->curr is not always equal to p.
It should be, afaict:

task_numa_fault()
	p = current;

	task_numa_group(p, ..);

And like I said, idle tasks and PF_EXITING tasks should never get (numa)
faults, for they should never be touching userspace.
I mean p can be moved to another cpu.

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

tsk is not p (i.e. current) here.
Maybe I understand it wrong and preemption is disabled during a memory fault?
Ah, I found pagefault_disable(). No questions.
Oleg Nesterov
2014-10-17 21:34:40 UTC
Post by Kirill Tkhai
Cool! Elegant fix. We set PF_EXITING in exit_signals(), which is earlier
than release_task() is called.
OK, thanks, I am sending the patch...
Post by Kirill Tkhai
Shouldn't we use smp_rmb/smp_wmb here?
No, we don't need to. call_rcu(delayed_put_task_struct) itself implies the
barrier on all CPUs. IOW, by the time RCU actually calls
delayed_put_task_struct(), every CPU must see all memory changes which were
done before call_rcu() was called. And OTOH, all rcu-read-lock critical
sections which could miss PF_EXITING must already be finished.
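[ An illustrative sketch of the guarantee Oleg describes (not verbatim
  kernel code):

	CPU 0 (exiting task)                 CPU 1 (task_numa_compare)
	tsk->flags |= PF_EXITING;            rcu_read_lock();
	...                                  cur = ACCESS_ONCE(dst_rq->curr);
	release_task(tsk):                   if (!(cur->flags & PF_EXITING))
	    call_rcu(&tsk->rcu, ...);                get_task_struct(cur);
	<grace period: waits for CPU 1's     rcu_read_unlock();
	 rcu_read_unlock() and orders all
	 earlier stores before the callback>
	delayed_put_task_struct(tsk);

  Either CPU 1's read-side section began before the call_rcu(), in which
  case the callback waits for it, or it began after, in which case the
  grace-period machinery guarantees it observes the PF_EXITING store. ]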

Oleg.
Peter Zijlstra
2014-10-16 07:56:50 UTC
Post by Oleg Nesterov
What do you think?
Oleg.
--- x/kernel/sched/fair.c
+++ x/kernel/sched/fair.c
@@ -1165,7 +1165,7 @@ static void task_numa_compare(struct tas
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;
/*
That makes sense, is_idle_task() is indeed the right function there, and
PF_EXITING avoids doing work where it doesn't make sense.
Peter Zijlstra
2014-10-16 08:01:06 UTC
Post by Oleg Nesterov
At least the code like
rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();
is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
There is an rcu_read_lock() around it through task_numa_compare().
Oleg Nesterov
2014-10-16 22:05:40 UTC
Post by Peter Zijlstra
Post by Oleg Nesterov
At least the code like
rcu_read_lock();
get_task_struct(foreign_rq->curr);
rcu_read_unlock();
is certainly wrong. And _probably_ the problem should be fixed here. Perhaps
we can add try_to_get_task_struct() which does atomic_inc_not_zero() ...
There is an rcu_read_lock() around it through task_numa_compare().
Yes, and the code above has rcu_read_lock() too. But it doesn't help
as Kirill pointed out.

Sorry, didn't have time today to read other emails in this thread,
will do tomorrow and (probably) send the patch which adds PF_EXITING
check.

Oleg.
Oleg Nesterov
2014-10-17 21:36:41 UTC
The lockless get_task_struct(tsk) is only safe if tsk == current
and didn't pass exit_notify(), or if this tsk was found on a rcu
protected list (say, for_each_process() or find_task_by_vpid()).
IOW, it is only safe if release_task() was not called before we
take rcu_read_lock(), in this case we can rely on the fact that
delayed_put_task_struct() can not drop the (potentially) last reference
until rcu_read_unlock().

And as Kirill pointed out task_numa_compare()->task_numa_assign()
path does get_task_struct(dst_rq->curr) and this is not safe. The
task_struct itself can't go away, but rcu_read_lock() can't save
us from the final put_task_struct() in finish_task_switch(); this
reference goes away without rcu gp.

Reported-by: Kirill Tkhai <***@parallels.com>
Signed-off-by: Oleg Nesterov <***@redhat.com>
---
kernel/sched/fair.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0090e8c..52049b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,

rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe; note that rcu_read_lock() can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;

/*
--
1.5.5.1
Kirill Tkhai
2014-10-18 08:15:01 UTC
The lockless get_task_struct(tsk) is only safe if tsk == current
and didn't pass exit_notify(), or if this tsk was found on a rcu
protected list (say, for_each_process() or find_task_by_vpid()).
IOW, it is only safe if release_task() was not called before we
take rcu_read_lock(), in this case we can rely on the fact that
delayed_put_task_struct() can not drop the (potentially) last reference
until rcu_read_unlock().
And as Kirill pointed out task_numa_compare()->task_numa_assign()
path does get_task_struct(dst_rq->curr) and this is not safe. The
task_struct itself can't go away, but rcu_read_lock() can't save
us from the final put_task_struct() in finish_task_switch(); this
reference goes away without rcu gp.
---
 kernel/sched/fair.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0090e8c..52049b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 		cur = NULL;
 	/*
Oleg, I've looked once again, and now it doesn't look good to me.
Where is the guarantee this memory hasn't been allocated again?
If it has, PF_EXITING is not a flag of the task we are interested
in; it may not even be a task's memory at all.

rcu_read_lock()                    ...                  ...
cur = ACCESS_ONCE(dst_rq->curr);   ...                  ...
<interrupt>                        rq->curr = next;     ...
<interrupt>                        put_prev_task()      ...
<interrupt>                        __put_prev_task      ...
<interrupt>                        kmem_cache_free()    ...
<interrupt>                        ...                  <allocated again>
<interrupt>                        ...                  memset(, 0, )
<interrupt>                        ...                  ...
if (cur->flags & PF_EXITING)       ...                  ...
    <no>                           ...                  ...
get_task_struct()                  ...                  ...
Kirill
Kirill Tkhai
2014-10-18 08:33:27 UTC
Post by Kirill Tkhai
Post by Oleg Nesterov
The lockless get_task_struct(tsk) is only safe if tsk == current
and didn't pass exit_notify(), or if this tsk was found on a rcu
protected list (say, for_each_process() or find_task_by_vpid()).
IOW, it is only safe if release_task() was not called before we
take rcu_read_lock(), in this case we can rely on the fact that
delayed_put_task_struct() can not drop the (potentially) last reference
until rcu_read_unlock().
And as Kirill pointed out task_numa_compare()->task_numa_assign()
path does get_task_struct(dst_rq->curr) and this is not safe. The
task_struct itself can't go away, but rcu_read_lock() can't save
us from the final put_task_struct() in finish_task_switch(); this
reference goes away without rcu gp.
---
 kernel/sched/fair.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0090e8c..52049b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 		cur = NULL;
 	/*
Post by Kirill Tkhai
Oleg, I've looked once again, and now it doesn't look good to me.
Where is the guarantee this memory hasn't been allocated again?
If it has, PF_EXITING is not a flag of the task we are interested
in; it may not even be a task's memory at all.

rcu_read_lock()                    ...                  ...
cur = ACCESS_ONCE(dst_rq->curr);   ...                  ...
<interrupt>                        rq->curr = next;     ...
<interrupt>                        put_prev_task()      ...
<interrupt>                        __put_prev_task      ...
<interrupt>                        kmem_cache_free()    ...
<interrupt>                        ...                  <allocated again>
<interrupt>                        ...                  memset(, 0, )
<interrupt>                        ...                  ...
if (cur->flags & PF_EXITING)       ...                  ...
    <no>                           ...                  ...
get_task_struct()                  ...                  ...

How about this?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78280c..d46427e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,21 @@ static void task_numa_compare(struct task_numa_env *env,
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
+		cur = NULL;
+	/*
+	 * Check once again to be sure curr is still on dst_rq. Even if
+	 * it points on a new task, which is using the memory of freed
+	 * cur, it's OK, because we've locked RCU before
+	 * delayed_put_task_struct() callback is called to put its struct.
+	 */
+	if (cur != ACCESS_ONCE(dst_rq->curr))
 		cur = NULL;
 
 	/*
Peter Zijlstra
2014-10-18 19:36:12 UTC
Post by Kirill Tkhai
How about this?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78280c..d46427e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,21 @@ static void task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe; note that rcu_read_lock() can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
+ cur = NULL;
+ /*
+ * Check once again to be sure curr is still on dst_rq. Even if
+ * it points on a new task, which is using the memory of freed
+ * cur, it's OK, because we've locked RCU before
+ * delayed_put_task_struct() callback is called to put its struct.
+ */
+ if (cur != ACCESS_ONCE(dst_rq->curr))
cur = NULL;
/*
So you worry about the refcount doing 0->1 ? In which case the above is
still wrong and we should be using atomic_inc_not_zero() in order to
acquire the reference count.
Oleg Nesterov
2014-10-18 21:18:40 UTC
Post by Peter Zijlstra
So you worry about the refcount doing 0->1 ? In which case the above is
still wrong and we should be using atomic_inc_not_zero() in order to
acquire the reference count.
It is actually worse, please see my reply to Kirill. We simply can't
dereference foreign_rq->curr locklessly.

Again, task_struct is only protected by RCU if it was found on a RCU
protected list. rq->curr is not protected by rcu. Perhaps we have to
change this... but this will be a bit unfortunate.
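[ A hypothetical sketch of what "rcu-protecting" rq->curr could mean --
  this is not 3.17 code -- publish and read ->curr as an RCU pointer:

	/* __schedule(), at the context switch: */
	rcu_assign_pointer(rq->curr, next);

	/* remote readers, e.g. task_numa_compare(): */
	cur = rcu_dereference(dst_rq->curr);

  and make the last put_task_struct() always go through an RCU callback,
  which is what the RFC patch at the top of this thread tried to do, at
  the cost of an extra grace period per task. ]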

Oleg.
Kirill Tkhai
2014-10-19 08:20:59 UTC
Post by Peter Zijlstra
Post by Kirill Tkhai
How about this?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78280c..d46427e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,21 @@ static void task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe; note that rcu_read_lock() can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
+ cur = NULL;
+ /*
+ * Check once again to be sure curr is still on dst_rq. Even if
+ * it points on a new task, which is using the memory of freed
+ * cur, it's OK, because we've locked RCU before
+ * delayed_put_task_struct() callback is called to put its struct.
+ */
+ if (cur != ACCESS_ONCE(dst_rq->curr))
cur = NULL;
/*
So you worry about the refcount doing 0->1 ? In which case the above is
still wrong and we should be using atomic_inc_not_zero() in order to
acquire the reference count.
We can't use atomic_inc_not_zero(). The problem is that cur is pointing
to memory which may not even be a task_struct anymore. No guarantees at all.
Oleg Nesterov
2014-10-18 20:56:14 UTC
Post by Kirill Tkhai
Post by Oleg Nesterov
...
The task_struct itself can't go away,
...
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 		cur = NULL;
 	/*
Oleg, I've looked once again, and now it doesn't look good to me.
Ah. Thanks a lot Kirill for correcting me!

I was looking at this rcu_read_lock() and I didn't even try to think
what it can actually protect. Nothing.
Post by Kirill Tkhai
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,21 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
+		cur = NULL;
+	/*
+	 * Check once again to be sure curr is still on dst_rq. Even if
+	 * it points on a new task, which is using the memory of freed
+	 * cur, it's OK, because we've locked RCU before
+	 * delayed_put_task_struct() callback is called to put its struct.
+	 */
+	if (cur != ACCESS_ONCE(dst_rq->curr))
No, I don't think this can work. Let's look at the current code:

	rcu_read_lock();
	cur = ACCESS_ONCE(dst_rq->curr);
	if (cur->pid == 0) /* idle */

And any dereference, even reading ->pid is not safe. This memory can be
freed, unmapped, reused, etc.

Looks like task_numa_compare() needs to take dst_rq->lock and get the
reference first.
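[ A minimal sketch of that lock-based variant, assuming the 3.17 rq
  layout and that irqs are enabled in this context:

	raw_spin_lock_irq(&dst_rq->lock);
	cur = dst_rq->curr;
	/*
	 * While we hold dst_rq->lock, ->curr cannot pass through its
	 * final schedule(), so the reference from dup_task_struct()
	 * is still held and get_task_struct() is safe.
	 */
	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
		cur = NULL;
	else
		get_task_struct(cur);
	raw_spin_unlock_irq(&dst_rq->lock);

  The price is taking a remote rq lock on every comparison. ]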

Or, perhaps, we need to change the rules to ensure that any "task_struct *"
pointer is rcu-safe. Perhaps we have more similar problems... I'd like to
avoid this if possible.

Hmm. I'll try to think more.

Thanks!

Oleg.
Kirill Tkhai
2014-10-18 23:13:31 UTC
Post by Kirill Tkhai
...
The task_struct itself can't go away,
...
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,
Post by Kirill Tkhai
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (is_idle_task(cur) || (cur->flags & PF_EXITING))
 		cur = NULL;
 	/*
Oleg, I've looked once again, and now it doesn't look good to me.
Ah. Thanks a lot Kirill for correcting me!
I was looking at this rcu_read_lock() and I didn't even try to think
what it can actually protect. Nothing.
<snip>
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
 	if (cur->pid == 0) /* idle */
And any dereference, even reading ->pid, is not safe. This memory can be
freed, unmapped, reused, etc.
Looks like task_numa_compare() needs to take dst_rq->lock and get the
reference first.
Yeah, the detection of idle is not safe. If we reorder the checks, almost all
problems will be gone. All except unmapping. JFI, is it possible with
such kernel structures as task_struct? I.e. do memory caches use highmem
internally?
Thanks!

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78280c..114ec33 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,30 @@ static void task_numa_compare(struct task_numa_env *env,
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (cur->flags & PF_EXITING)
+		cur = NULL;
+	smp_rmb(); /* Pairs with dst_rq->lock unlocking which implies smp_wmb */
+	/*
+	 * Check once again to be sure curr is still on dst_rq. Three situations
+	 * are possible here:
+	 * 1) cur has gone and been freed, and dst_rq->curr is pointing on other
+	 *    memory. In this case the check will fail;
+	 * 2) cur is pointing to a new task, which is using the memory of just
+	 *    freed cur (and it is the new dst_rq->curr). It's OK, because we've
+	 *    locked RCU before the new task has even been created
+	 *    (so delayed_put_task_struct() hasn't been called);
+	 * 3) we've taken a not exiting task (the likely case). No need to worry.
+	 */
+	if (cur != ACCESS_ONCE(dst_rq->curr))
+		cur = NULL;
+
+	if (is_idle_task(cur))
 		cur = NULL;
 
 	/*
Post by Oleg Nesterov
Or, perhaps, we need to change the rules to ensure that any "task_struct *"
pointer is rcu-safe. Perhaps we have more similar problems... I'd like to
avoid this if possible.
The RT tree has:

https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch

But a different problem was being solved there.
Post by Oleg Nesterov
Hmm. I'll try to think more.
Thanks!
Kirill
Oleg Nesterov
2014-10-19 19:24:37 UTC
Post by Kirill Tkhai
Post by Oleg Nesterov
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
 	if (cur->pid == 0) /* idle */
And any dereference, even reading ->pid, is not safe. This memory can be
freed, unmapped, reused, etc.
Looks like task_numa_compare() needs to take dst_rq->lock and get the
reference first.
Yeah, the detection of idle is not safe. If we reorder the checks, almost all
problems will be gone. All except unmapping. JFI, is it possible with
such kernel structures as task_struct?
Yes, if DEBUG_PAGEALLOC. See kernel_map_pages() in arch/x86/mm/pageattr.c;
kernel_map_pages(enable => false) clears PAGE_PRESENT if slab returns the
pages to the system.
Post by Kirill Tkhai
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,30 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (cur->flags & PF_EXITING)
+		cur = NULL;
so this needs probe_kernel_read(&cur->flags).
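[ Sketched out, with the 3.17-era probe_kernel_read() from mm/maccess.c,
  which returns -EFAULT instead of faulting if the page was unmapped
  (e.g. under DEBUG_PAGEALLOC):

	unsigned int flags;

	/* cur may already be freed and even unmapped; copy the field
	 * out safely instead of dereferencing it directly */
	if (probe_kernel_read(&flags, &cur->flags, sizeof(flags)) ||
	    (flags & PF_EXITING))
		cur = NULL;
  ]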
Post by Kirill Tkhai
+	if (cur != ACCESS_ONCE(dst_rq->curr))
+		cur = NULL;
Yes, if this task_struct was freed in between, we do not care if this memory
was reused (except PF_EXITING can be a false positive). If it was freed and
now the same memory is ->curr again, we know that delayed_put_task_struct()
can't be called until we drop the rcu lock, even if PF_EXITING is already set
again.

I won't argue, but you need to convince Peter to accept this hack ;)
Post by Kirill Tkhai
Post by Oleg Nesterov
Or, perhaps, we need to change the rules to ensure that any "task_struct *"
pointer is rcu-safe. Perhaps we have more similar problems... I'd like to
avoid this if possible.
https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch
Yes, and this obviously implies more rcu callbacks in flight, and another
gp before __put_task_struct(). But maybe we will need to do this anyway...

Oleg.
Oleg Nesterov
2014-10-19 19:37:44 UTC
Post by Oleg Nesterov
Post by Oleg Nesterov
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,30 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (cur->flags & PF_EXITING)
+		cur = NULL;
so this needs probe_kernel_read(&cur->flags).
Post by Oleg Nesterov
+	if (cur != ACCESS_ONCE(dst_rq->curr))
+		cur = NULL;
Yes, if this task_struct was freed in between, we do not care if this memory
was reused (except PF_EXITING can be a false positive). If it was freed and
now the same memory is ->curr again, we know that delayed_put_task_struct()
can't be called until we drop the rcu lock, even if PF_EXITING is already set
again.
I won't argue, but you need to convince Peter to accept this hack ;)
Post by Oleg Nesterov
Or, perhaps, we need to change the rules to ensure that any "task_struct *"
pointer is rcu-safe. Perhaps we have more similar problems... I'd like to
avoid this if possible.
https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch
Yes, and this obviously implies more rcu callbacks in flight, and another
gp before __put_task_struct(). But maybe we will need to do this anyway...

Forgot to mention... Or we can make task_struct_cachep SLAB_DESTROY_BY_RCU;
in this case ->curr (or any other "task_struct *" pointer) can not go away
under rcu_read_lock(). task_numa_compare() still needs the PF_EXITING check,
but we do not need to recheck ->curr or use probe_kernel_read().

Oleg.
Oleg Nesterov
2014-10-19 19:43:14 UTC
Forgot to mention... Or we can make task_struct_cachep SLAB_DESTROY_BY_RCU,
in this case ->curr (or any other "task_struct *" pointer) can not go away
under rcu_read_lock(). task_numa_compare() still needs the PF_EXITING check,
but we do not need to recheck ->curr or probe_kernel_read().
Damn, please ignore ;) we still need to recheck ->curr.

Oleg.
Kirill Tkhai
2014-10-20 09:03:14 UTC
On Sun, 19/10/2014 at 21:43 +0200, Oleg Nesterov wrote:
Post by Oleg Nesterov
Forgot to mention... Or we can make task_struct_cachep SLAB_DESTROY_BY_RCU;
in this case ->curr (or any other "task_struct *" pointer) can not go away
under rcu_read_lock(). task_numa_compare() still needs the PF_EXITING check,
but we do not need to recheck ->curr or use probe_kernel_read().

Damn, please ignore ;) we still need to recheck ->curr.
Yeah, this bug is like assembling a puzzle :)
Peter Zijlstra
2014-10-20 09:13:11 UTC
OK, I think I'm finally awake enough to see what you're all talking
about :-)
Post by Kirill Tkhai
https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch
(answering the other email asking about this)

RT does this because we call put_task_struct() with preempt disabled and
on RT the memory allocator has sleeping locks.
Yes, and this obviously implies more rcu callbacks in flight, and another
gp before __put_task_struct(). But maybe we will need to do this anyway...
Forgot to mention... Or we can make task_struct_cachep SLAB_DESTROY_BY_RCU;
in this case ->curr (or any other "task_struct *" pointer) can not go away
under rcu_read_lock(). task_numa_compare() still needs the PF_EXITING check,
but we do not need to recheck ->curr or use probe_kernel_read().
I think I would prefer SLAB_DESTROY_BY_RCU for this, because as you
pointed out, I'm not sure mainline would like the extra callbacks.
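[ For context, roughly what that would look like in fork_init() -- the
  surrounding flags are an approximation of the 3.17 tree:

	task_struct_cachep =
		kmem_cache_create("task_struct", sizeof(struct task_struct),
				  ARCH_MIN_TASKALIGN,
				  SLAB_PANIC | SLAB_NOTRACK | SLAB_DESTROY_BY_RCU,
				  NULL);

  With SLAB_DESTROY_BY_RCU the pages backing task_structs are only
  returned to the page allocator after a grace period, so a pointer read
  under rcu_read_lock() stays mapped; the object can still be freed and
  *reused*, which is why the ->curr recheck is still needed. ]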
Kirill Tkhai
2014-10-20 10:36:13 UTC
On Mon, 20/10/2014 at 11:13 +0200, Peter Zijlstra wrote:
Post by Peter Zijlstra
OK, I think I'm finally awake enough to see what you're all talking
about :-)
Post by Kirill Tkhai
https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch

(answering the other email asking about this)

RT does this because we call put_task_struct() with preempt disabled and
on RT the memory allocator has sleeping locks.
Now it's clear to me. I thought it was because task_struct freeing is slow.
Thanks!
Post by Peter Zijlstra
Yes, and this obviously implies more rcu callbacks in flight, and another
gp before __put_task_struct(). But maybe we will need to do this anyway...

Forgot to mention... Or we can make task_struct_cachep SLAB_DESTROY_BY_RCU;
in this case ->curr (or any other "task_struct *" pointer) can not go away
under rcu_read_lock(). task_numa_compare() still needs the PF_EXITING check,
but we do not need to recheck ->curr or use probe_kernel_read().

I think I would prefer SLAB_DESTROY_BY_RCU for this, because as you
pointed out, I'm not sure mainline would like the extra callbacks.
I've sent one more patch with this:

"[PATCH v3] sched/numa: fix unsafe get_task_struct() in
task_numa_assign()"

Kirill

Kirill Tkhai
2014-10-20 09:00:20 UTC
On Sun, 19/10/2014 at 21:24 +0200, Oleg Nesterov wrote:
Post by Oleg Nesterov
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
 	if (cur->pid == 0) /* idle */
And any dereference, even reading ->pid, is not safe. This memory can be
freed, unmapped, reused, etc.
Looks like task_numa_compare() needs to take dst_rq->lock and get the
reference first.
Yeah, the detection of idle is not safe. If we reorder the checks, almost all
problems will be gone. All except unmapping. JFI, is it possible with
such kernel structures as task_struct?

Yes, if DEBUG_PAGEALLOC. See kernel_map_pages() in arch/x86/mm/pageattr.c;
kernel_map_pages(enable => false) clears PAGE_PRESENT if slab returns the
pages to the system.

Thanks, Oleg!

Post by Oleg Nesterov
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,30 @@ static void task_numa_compare(struct task_numa_env *env,
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	/*
+	 * No need to move the exiting task, and this ensures that ->curr
+	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
+	 * is safe; note that rcu_read_lock() can't protect from the final
+	 * put_task_struct() after the last schedule().
+	 */
+	if (cur->flags & PF_EXITING)
+		cur = NULL;

so this needs probe_kernel_read(&cur->flags).

+	if (cur != ACCESS_ONCE(dst_rq->curr))
+		cur = NULL;

Yes, if this task_struct was freed in between, we do not care if this memory
was reused (except PF_EXITING can be a false positive). If it was freed and
now the same memory is ->curr again, we know that delayed_put_task_struct()
can't be called until we drop the rcu lock, even if PF_EXITING is already set
again.

I won't argue, but you need to convince Peter to accept this hack ;)

Just sent a new version with all of your suggestions :) Thanks!

Post by Oleg Nesterov
Or, perhaps, we need to change the rules to ensure that any "task_struct *"
pointer is rcu-safe. Perhaps we have more similar problems... I'd like to
avoid this if possible.
https://git.kernel.org/cgit/linux/kernel/git/paulg/3.10-rt-patches.git/tree/patches/sched-delay-put-task.patch

Yes, and this obviously implies more rcu callbacks in flight, and another
gp before __put_task_struct(). But maybe we will need to do this anyway...

Kirill
Peter Zijlstra
2014-10-19 21:38:21 UTC
+ smp_rmb(); /* Pairs with dst_rq->lock unlocking which implies smp_wmb */
RELEASE does not imply a WMB.
Kirill Tkhai
2014-10-20 08:56:32 UTC
On Sun, 19/10/2014 at 23:38 +0200, Peter Zijlstra wrote:
Post by Peter Zijlstra
+ smp_rmb(); /* Pairs with dst_rq->lock unlocking which implies smp_wmb */

RELEASE does not imply a WMB.
Thanks; please see, I've sent a new version.
Kirill Tkhai
2014-10-18 09:16:43 UTC
And an smp_rmb() between the ifs, which pairs with the rq unlocking.
Post by Kirill Tkhai
Post by Kirill Tkhai
Post by Oleg Nesterov
The lockless get_task_struct(tsk) is only safe if tsk == current
and didn't pass exit_notify(), or if this tsk was found on a rcu
protected list (say, for_each_process() or find_task_by_vpid()).
IOW, it is only safe if release_task() was not called before we
take rcu_read_lock(), in this case we can rely on the fact that
delayed_put_task_struct() can not drop the (potentially) last reference
until rcu_read_unlock().
And as Kirill pointed out task_numa_compare()->task_numa_assign()
path does get_task_struct(dst_rq->curr) and this is not safe. The
task_struct itself can't go away, but rcu_read_lock() can't save
us from the final put_task_struct() in finish_task_switch(); this
reference goes away without rcu gp.
---
kernel/sched/fair.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0090e8c..52049b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1158,7 +1158,13 @@ static void task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe; note that rcu_read_lock() can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
cur = NULL;
/*
Oleg, I've looked once again, and now it doesn't look good to me.
Where is the guarantee this memory hasn't been allocated again?
If it has, PF_EXITING is not a flag of the task we are interested
in; it may not even be a task's memory at all.
rcu_read_lock()                    ...                  ...
cur = ACCESS_ONCE(dst_rq->curr);   ...                  ...
<interrupt>                        rq->curr = next;     ...
<interrupt>                        put_prev_task()      ...
<interrupt>                        __put_prev_task      ...
<interrupt>                        kmem_cache_free()    ...
<interrupt>                        ...                  <allocated again>
<interrupt>                        ...                  memset(, 0, )
<interrupt>                        ...                  ...
if (cur->flags & PF_EXITING)       ...                  ...
    <no>                           ...                  ...
get_task_struct()                  ...                  ...
How about this?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78280c..d46427e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1165,7 +1165,21 @@ static void task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe; note that rcu_read_lock() can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if (is_idle_task(cur) || (cur->flags & PF_EXITING))
+ cur = NULL;
+ /*
+ * Check once again to be sure curr is still on dst_rq. Even if
+ * it points on a new task, which is using the memory of freed
+ * cur, it's OK, because we've locked RCU before
+ * delayed_put_task_struct() callback is called to put its struct.
+ */
+ if (cur != ACCESS_ONCE(dst_rq->curr))
cur = NULL;
/*