cfs_bandwidth下的distribute_cfs_runtime hard lockup
内核发生了hard LOCKUP然后panic了,代码版本是linux-3.10.0-514.16.1.el7.x86_64
[4474426.249700] NMI watchdog: Watchdog detected hard LOCKUP on cpu 50
crash下的bt信息如下:
```
    [exception RIP: tg_unthrottle_up+24]
    RIP: ffffffff810c9658  RSP: ffff882f7fc83dc8  RFLAGS: 00000046
    RAX: ffff885d4767d800  RBX: ffff885f7e4d6c40  RCX: ffff8830767f2930
    RDX: 000000000000005b  RSI: ffff885f7e4d6c40  RDI: ffff8830767f2800
    RBP: ffff882f7fc83dc8   R8: ffff885f697c7900   R9: 0000000000000001
    R10: 0000000000000000  R11: 0000000000000000  R12: ffff8830764e5400
    R13: ffffffff810c9640  R14: 0000000000000000  R15: ffff8830767f2800
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
#12 [ffff882f7fc83dc8] tg_unthrottle_up at ffffffff810c9658
#13 [ffff882f7fc83dd0] walk_tg_tree_from at ffffffff810c17db
#14 [ffff882f7fc83e20] unthrottle_cfs_rq at ffffffff810d1675
#15 [ffff882f7fc83e58] distribute_cfs_runtime at ffffffff810d18e2
#16 [ffff882f7fc83ea0] sched_cfs_period_timer at ffffffff810d1a7f
#17 [ffff882f7fc83ed8] __hrtimer_run_queues at ffffffff810b4d72
#18 [ffff882f7fc83f30] hrtimer_interrupt at ffffffff810b5310
#19 [ffff882f7fc83f80] local_apic_timer_interrupt at ffffffff81050fd7
#20 [ffff882f7fc83f98] smp_apic_timer_interrupt at ffffffff8169978f
#21 [ffff882f7fc83fb0] apic_timer_interrupt at ffffffff81697cdd
--- <IRQ stack> ---
```
hard LOCKUP的原理比较简单:CPU在关中断的情况下执行时间过长(本例中就是栈里的这些函数),NMI watchdog就会报hard LOCKUP。超时阈值以及是否触发panic由以下开关决定:
```shell
[oen@han]# cat /proc/sys/kernel/hardlockup_panic
1
[oen@han]# cat /proc/sys/kernel/watchdog_thresh
10
```
从栈上可以看出,整个函数栈都是cfs_bandwidth的东西,cfs_bandwidth就是控制cfs调度带宽的,先详细看一下cfs_bandwidth:
/*
 * CFS bandwidth pool. A cfs_rq reaches its pool through
 * tg_cfs_bandwidth(cfs_rq->tg) and borrows runtime from it.
 */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	/* protects the pool; taken around runtime and throttled-list updates */
	raw_spinlock_t lock;
	/*
	 * Length of one control period. init_cfs_bandwidth() sets it with
	 * ns_to_ktime(default_cfs_period()) and the refill path reads it with
	 * ktime_to_ns(), so the type is ktime_t (not time_t).
	 * Article note: the default is 0.1s and the value can be read from
	 * /sys/fs/cgroup/cpu/cpu.cfs_period_us.
	 */
	ktime_t period;
	u64 quota,	/* runtime quota granted per period */
	    runtime;	/* runtime still available in the current period */
	s64 hierarchal_quota;
	/* expiry stamp that cfs_rqs copy when they borrow runtime */
	u64 runtime_expires;
	int idle, timer_active;
	/* periodic timers; callbacks are installed by init_cfs_bandwidth() */
	struct hrtimer period_timer, slack_timer;
	/* throttled cfs_rqs, linked through cfs_rq->throttled_list */
	struct list_head throttled_cfs_rq;
	/* statistics */
	int nr_periods, nr_throttled;
	u64 throttled_time;
#endif
};
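先说一下大致上的运行逻辑,可以分成三步:第一,cfs_rq的可运行时间用完后被限制(throttle);第二,scheduler_tick周期性地更新cfs_rq的运行时间,不够时向cfs_bandwidth借;第三,周期定时器在每个period重新给cfs_bandwidth充值,并把时间分发给被限制的cfs_rq。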
看第一点,cfs_rq运行时间被限制是throttle_cfs_rq完成的,它的调用关系是:
```
                                                                 +-> unthrottle_cfs_rq
                                                                 |
                 +-> check_enqueue_throttle +-> enqueue_entity --+
                 |                                               |
                 |                                               +-> enqueue_task_fair
throttle_cfs_rq -+
                 |
                 +-> check_cfs_rq_runtime +-> put_prev_entity +-> put_prev_task_fair
```
(箭头方向:被调用者 -> 调用者)
在throttle_cfs_rq和check_enqueue_throttle中可以看到,当cfs_rq->runtime_remaining不大于0时,cfs_rq就会被限制。
在throttle_cfs_rq中,干了如下一些事情,注意cfs_rq将自己添加到cfs_b->throttled_cfs_rq。
/* Excerpt from throttle_cfs_rq(): mark the cfs_rq throttled and queue it on its bandwidth pool. */
cfs_rq->throttled = 1;
/* remember the rq clock value at which throttling started */
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
/* append to the tail of cfs_b->throttled_cfs_rq, the list distribute_cfs_runtime() walks */
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/* start the bandwidth timer if it is not marked active */
if (!cfs_b->timer_active)
__start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
再看第二步:scheduler_tick负责更新调度时间,具体调用路径如下:
scheduler_tick -> task_tick_fair -> entity_tick -> update_curr -> account_cfs_rq_runtime -> __account_cfs_rq_runtime
在__account_cfs_rq_runtime下,
/*
* Charge delta_exec against cfs_rq's local runtime and, once that runtime is
* used up, try to borrow more via assign_cfs_rq_runtime().
*/
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* deduct the time just consumed from this cfs_rq's remaining runtime */
cfs_rq->runtime_remaining -= delta_exec;
expire_cfs_rq_runtime(cfs_rq);
/* the cfs_rq still has runtime left: nothing more to do */
if (likely(cfs_rq->runtime_remaining > 0))
return;
/*
* Borrow time from cfs_bandwidth. If that fails, this cfs_rq's runtime is
* limited and the currently running task gets rescheduled away.
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
resched_task(rq_of(cfs_rq)->curr);
}
assign_cfs_rq_runtime负责让cfs_rq从cfs_bandwidth里面借时间,sched_cfs_bandwidth_slice的值默认是5ms
/*
* Top up cfs_rq->runtime_remaining from its bandwidth pool.
* Returns non-zero when runtime_remaining is positive after the top-up,
* zero otherwise.
*/
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
/* runtime_remaining is not positive here; min_amount is the time to borrow: one slice plus the deficit */
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
/* no quota limit: grant the full requested amount */
amount = min_amount;
else {
/*
* If the bandwidth pool has become inactive, then at least one
* period must have elapsed since the last consumption.
* Refresh the global state and ensure bandwidth timer becomes
* active.
*/
if (!cfs_b->timer_active) {
__refill_cfs_bandwidth_runtime(cfs_b);
__start_cfs_bandwidth(cfs_b, false);
}
if (cfs_b->runtime > 0) {
/* with a finite quota, take at most everything left in the pool */
amount = min(cfs_b->runtime, min_amount);
/* debit the pool and clear its idle flag */
cfs_b->runtime -= amount;
cfs_b->idle = 0;
}
}
/* sample the pool's expiry while still holding cfs_b->lock */
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
/* credit the borrowed amount to the cfs_rq (amount stays 0 if the pool was empty) */
cfs_rq->runtime_remaining += amount;
/*
* we may have advanced our local expiration to account for allowed
* spread between our sched_clock and the one on which runtime was
* issued.
*/
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
cfs_rq->runtime_expires = expires;
return cfs_rq->runtime_remaining > 0;
}
注意min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining:走到这里时runtime_remaining不大于0,所以借取量等于先补齐欠下的时间,再额外多给一个slice(默认5ms),这5ms相当于是白送的。
再看第三步, init_cfs_bandwidth搞了两个定时器,sched_cfs_period_timer和sched_cfs_slack_timer
/*
* Initialise a bandwidth pool: empty runtime, unlimited quota, default
* period, an empty throttled list and the period/slack hrtimers.
*/
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
/* unlimited by default */
cfs_b->quota = RUNTIME_INF;
/* default period (100ms according to the article) */
cfs_b->period = ns_to_ktime(default_cfs_period());
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
/* period timer: callback is sched_cfs_period_timer */
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
/* slack timer: callback is sched_cfs_slack_timer */
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
}
sched_cfs_period_timer到期后的主要工作由do_sched_cfs_period_timer完成,具体内容如下:
/*
* Per-period work for a bandwidth pool: refill the pool and hand the fresh
* runtime to throttled cfs_rqs first.
* Returns 1 after marking the timer inactive (unlimited quota, or idle with
* nothing throttled), 0 otherwise.
* NOTE(review): the unlock/relock pair in the loop below implies the caller
* enters with cfs_b->lock held -- confirm against the caller.
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_deactivate;
/* does cfs_b currently have any throttled cfs_rq? */
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
/* period counter */
cfs_b->nr_periods += overrun;
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
*/
if (cfs_b->idle && !throttled)
goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
* status as actually running, so that other cpus doing
* __start_cfs_bandwidth will stop trying to cancel us.
*/
cfs_b->timer_active = 1;
/*
* Article note: __refill_cfs_bandwidth_runtime() re-arms the pool, which is
* the essence of step three:
* cfs_b->runtime = cfs_b->quota;
* cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
*/
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
return 0;
}
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
/*
* There are throttled entities so we must first use the new bandwidth
* to unthrottle them before making it generally available. This
* ensures that all existing debts will be paid before a new cfs_rq is
* allowed to run.
*/
/* take a local copy of the pool and zero the shared value while distributing */
runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
cfs_b->runtime = 0;
/*
* Actively hand runtime out to the throttled rqs so that they come back
* to life as soon as possible. The loop only ends when the local copy
* is exhausted or the throttled list is seen empty.
*/
while (throttled && runtime > 0) {
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/* return (any) remaining runtime */
cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
* insufficient to cover the existing bandwidth deficit. (Forcing the
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
return 0;
out_deactivate:
cfs_b->timer_active = 0;
return 1;
}
回来接着看问题,问题就出在do_sched_cfs_period_timer函数下,来自zhoujian的结论,内核patch如下:
第一处是do_sched_cfs_period_timer下的while循环:while (throttled && runtime > 0){}。在distribute_cfs_runtime中,`runtime = -cfs_rq->runtime_remaining + 1;` 这一句在唤醒(unthrottle)cfs_rq时只比它欠下的时间多给了1ns,不能再吝啬了,这1ns几乎立刻就会被用完;而外层do_sched_cfs_period_timer里的runtime是从cfs_b->runtime拷贝出来的本地变量(拷贝之后cfs_b->runtime被清零),并发代码对runtime的消耗在这个本地变量上体现不出来,导致while循环次数非常多。
/*
* Walk cfs_b->throttled_cfs_rq and give each still-throttled cfs_rq enough
* runtime to become positive, until `remaining` is used up or the walk ends.
* `expires` is stamped on every cfs_rq that receives runtime.
* Returns the runtime that was not handed out.
*/
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
u64 runtime = remaining;
/* the list is traversed under RCU only; cfs_b->lock is not taken in this function */
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock(&rq->lock);
/* skip entries that are no longer throttled */
if (!cfs_rq_throttled(cfs_rq))
goto next;
/* grant the deficit plus 1ns, i.e. the minimum that makes runtime_remaining positive */
runtime = -cfs_rq->runtime_remaining + 1;
/* never hand out more than is left */
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
unthrottle_cfs_rq(cfs_rq);
next:
raw_spin_unlock(&rq->lock);
/* stop as soon as everything has been distributed */
if (!remaining)
break;
}
rcu_read_unlock();
return remaining;
}
第二处是distribute_cfs_runtime下的遍历循环:list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) {}。原理比较简单:当前CPU在throttled_cfs_rq链表上处理(unthrottle)cfs_rq的速度,赶不上其他CPU把重新被throttle的cfs_rq加入该链表尾部的速度,遍历就迟迟走不到链表末尾。
bsegall的patch实际上两个方面都考虑了,那么哪个才可能性大呢,第一个循环需要苛刻的偶然性,而第二个则不需要,最重要的是通过稳定的重现方法,证明了第二个概率大。
最终结论:
在CentOS内核3.10.0-585.el7版本之后该问题解决:
[kernel] sched: Fix potential near-infinite distribute_cfs_runtime() loop (Lauro Ramos Venancio) [1399391]
kpatch热补丁:不要用原版patch,做出的热补丁不解决问题,需要魔改。
cfs_bandwidth下的distribute_cfs_runtime hard lockup来自于OENHAN
链接为:https://oenhan.com/cfs-bandwidth/