一次 irq_fpu_usable 内核报错排查总结

2024-01-01 22:35:54

开发了一个内核ko模块async_memory_reclaim_for_cold_file_area(内核版本5.14.0-284.11.1)，使用kprobe技术捕捉内核copy_page_to_iter()函数，在里边执行自定义的hot_file_update_file_status()函数，统计文件页page的访问信息，源码简要如下：

static void kprobe_handler_post(struct kprobe *p, struct pt_regs *regs,
??????? unsigned long flags)
{
??? struct page *page = (struct page *)(regs->di);
??? if(page){
??????? hot_file_update_file_status(page);
??? }
}
static struct kprobe kp_read_cache_func = {
??? //buffer io read读取文件页page数据执行到 copy_folio_to_iter()
??? .symbol_name??? = "copy_page_to_iter",
};
kp_read_cache_func.post_handler = kprobe_handler_post;
//注册kprobe函数
ret = register_kprobe(&kp_read_cache_func);

但是偶发触发内核异常告警，如下：

[ 2606.173350] WARNING: CPU: 3 PID: 3347 at arch/x86/kernel/fpu/core.c:60 irq_fpu_usable+0x39/0x50
[ 2606.173420]? fuse [last unloaded: async_memory_reclaim_for_cold_file_area_debug]
[ 2606.173423] CPU: 3 PID: 3347 Comm: bash Kdump: loaded Tainted: G?????????? OE??? -------- h---? 5.14.0-284.11.1.el9_2.x86_64 #1
[ 2606.173425] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[ 2606.173426] RIP: 0010:irq_fpu_usable+0x39/0x50
[ 2606.173432] RSP: 0018:ffffb6fe009076e8 EFLAGS: 00010006
[ 2606.173434] RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
[ 2606.173435] RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
[ 2606.173435] RBP: ffffffffa30c6770 R08: 0000000000000004 R09: 0000000000000004
[ 2606.173436] R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
[ 2606.173437] R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
[ 2606.173438] FS:? 00007f6c674d4740(0000) GS:ffff971e39ec0000(0000) knlGS:0000000000000000
[ 2606.173461] CS:? 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2606.173462] CR2: 0000559a22f5dad0 CR3: 000000000c066004 CR4: 00000000003706e0
[ 2606.173475]? blake2s_compress+0x20/0xa0
[ 2606.173480]? blake2s_final+0x41/0x80
[ 2606.173483]? extract_entropy.constprop.0+0x94/0x250
[ 2606.173493]? crng_make_state+0x129/0x180
[ 2606.173495]? _get_random_bytes.part.0+0x4c/0x190
[ 2606.173497]? ? __alloc_pages+0xe6/0x230
[ 2606.173499]? get_random_u32+0x59/0x90
[ 2606.173501]? allocate_slab+0x14a/0x460
[ 2606.173505]? ___slab_alloc+0x44f/0x570
[ 2606.173509]? ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173513]? ? find_busiest_group+0x65/0x190
[ 2606.173516]? ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173519]? kmem_cache_alloc+0x294/0x300
[ 2606.173522]? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173526]? ? 0xffffffffc0572011
[ 2606.173528]? kprobe_post_process+0x25/0x70
[ 2606.173531]? ? 0xffffffffc0572012
[ 2606.173532]? kprobe_int3_handler+0x154/0x180
[ 2606.173534]? do_int3+0x3b/0x80
[ 2606.173536]? exc_int3+0x81/0xc0
[ 2606.173546] RSP: 0018:ffffb6fe00907bf0 EFLAGS: 00000246
[ 2606.173548] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff971e10b97500
[ 2606.173549] R10: ffffb6fe00907d98 R11: ffffffffffffffff R12: ffffb6fe00907d38
[ 2606.173550] R13: ffffb6fe00907d60 R14: 0000000000000100 R15: 0000000000000000
[ 2606.173553]? ? 0xffffffffc0572012
[ 2606.173554]? filemap_read+0x18a/0x320
...........................

显然是irq_fpu_usable()函数里触发的问题。为了查清楚根因，/proc/sys/kernel/panic_on_warn置1。下次触发这个内核告警后，触发了内核crash，并保存了vmcoe。通过vmcore看到出问题时完整函数流程：

crash> bt
PID: 33???? TASK: ffff971d01358000? CPU: 3??? COMMAND: "bash"
?#0 [ffffb6fe00907400] machine_kexec at ffffffffa126c237
?#1 [ffffb6fe00907458] __crash_kexec at ffffffffa13c3c9a
?#2 [ffffb6fe00907518] panic at ffffffffa1cd7dc6
?#3 [ffffb6fe00907598] __warn.cold at ffffffffa1cd7feb
?#4 [ffffb6fe009075d0] report_bug at ffffffffa17ac20e
?#5 [ffffb6fe00907608] handle_bug at ffffffffa1d254bc
?#6 [ffffb6fe00907618] exc_invalid_op at ffffffffa1d25684
?#7 [ffffb6fe00907630] asm_exc_invalid_op at ffffffffa1e00af6
??? [exception RIP: irq_fpu_usable+57]
??? RIP: ffffffffa1232d29? RSP: ffffb6fe009076e8? RFLAGS: 00010006
??? RAX: ffffffffa30c6794? RBX: ffffffffa30c6740? RCX: 0000000000000024
??? RDX: 0000000080110002? RSI: ffffffffa30c6770? RDI: ffffffffa30c6740
??? RBP: ffffffffa30c6770?? R8: 0000000000000004?? R9: 0000000000000004
??? R10: ffffffffa30c6794? R11: 00000000000319c0? R12: 0000000000000001
??? R13: ffffffffa30c6740? R14: 0000000000000024? R15: fffffc6140046780
??? ORIG_RAX: ffffffffffffffff? CS: 0010? SS: 0018
?#8 [ffffb6fe009076e8] blake2s_compress at ffffffffa12e5ae0
?#9 [ffffb6fe00907718] blake2s_final at ffffffffa176e391
#10 [ffffb6fe00907738] extract_entropy.constprop.0 at ffffffffa18e2904
#11 [ffffb6fe009077d8] crng_reseed.constprop.0 at ffffffffa18e2b25
#12 [ffffb6fe00907818] crng_make_state at ffffffffa18e2dd9
#13 [ffffb6fe00907848] _get_random_bytes at ffffffffa18e2e7c
#14 [ffffb6fe009078f8] get_random_u32 at ffffffffa18e3129
#15 [ffffb6fe00907918] allocate_slab at ffffffffa1577bea
#16 [ffffb6fe00907960] ___slab_alloc at ffffffffa157b17f
#17 [ffffb6fe00907a00] kmem_cache_alloc at ffffffffa157d194
#18 [ffffb6fe00907a48] hot_file_update_file_status at ffffffffc0dd9d24 [async_memory_reclaim_for_cold_file_area]
#19 [ffffb6fe00907aa8] kprobe_post_process at ffffffffa126ec65
#20 [ffffb6fe00907ab8] kprobe_int3_handler at ffffffffa126ef44
#21 [ffffb6fe00907ae8] do_int3 at ffffffffa1225b2b
#22 [ffffb6fe00907af8] exc_int3 at ffffffffa1d25f51
#23 [ffffb6fe00907b10] asm_exc_int3 at ffffffffa1e00b35
#24 [ffffb6fe00907b98] copy_page_to_iter at ffffffffa1764d32
#25 [ffffb6fe00907bf8] filemap_read at ffffffffa14d7b9a
..................

触发问题的函数流程是，kprobe捕捉内核函数copy_page_to_iter。在读文件时执行到该函数时，产生int3中断，在中断服务函数里最后执行自定义的hot_file_update_file_status()函数，在该函数里执行kmem_cache_alloc()分配一个自定义slab数据结构。而在分配slab时，执行allocate_slab->shuffle_freelist->get_random_int分配一个随机数，源码如下：

static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
??? void *start;
??? void *cur;
??? void *next;
??? unsigned long idx, pos, page_limit, freelist_count;
??? if (page->objects < 2 || !s->random_seq)
??????? return false;
??? freelist_count = oo_objects(s->oo);
??? //分配随机数，这里触发内核告警
??? pos = get_random_int() % freelist_count;
??? ............
??? next = next_freelist_entry(s, page, &pos, start, page_limit,freelist_count);
??? set_freepointer(s, cur, next);
??? ............
}

为什么分配slab时要分配随机数呢？查资料说是为了使分配的slab obj地址随机，不容易被攻击，安全考虑。

把这个过程的函数流程再整理一下hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable。再看下最后的几个关键blake2s_compress函数调用：

void blake2s_compress(struct blake2s_state *state, const u8 *block,
?????????? ???size_t nblocks, const u32 inc)
{
??? kernel_fpu_begin();
..................
??? kernel_fpu_end();
}
static inline void kernel_fpu_begin(void)
{
??? kernel_fpu_begin_mask(KFPU_MXCSR);
}
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
??? preempt_disable();
??? //这里触发内核告警
??? WARN_ON_FPU(!irq_fpu_usable());
??? WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
??? this_cpu_write(in_kernel_fpu, true);
??? ..................
}

是irq_fpu_usable()函数返回false触发的crash，看下这个函数的源码

bool irq_fpu_usable(void)
{
??? if (WARN_ON_ONCE(in_nmi()))
??????? return false;
??? if (this_cpu_read(in_kernel_fpu))
??????? return false;
??? if (!in_hardirq())
??????? return true;
??? return !softirq_count();
}

显然，in_nmi()、this_cpu_read(in_kernel_fpu)、softirq_count()大于0都可能会导致该函数返回false，得一一排查。

crash时的cpu是cpu3，很快查找cpu3的in_kernel_fpu这个per cpu 变量是0，

crash> in_kernel_fpu
PER-CPU DATA TYPE:
? bool in_kernel_fpu;
PER-CPU ADDRESSES:
? [0]: ffff971e39e18ed8
? [1]: ffff971e39e58ed8
? [2]: ffff971e39e98ed8
? //cpu3的in_kernel_fpu 变量地址
? [3]: ffff971e39ed8ed8
crash> rd? 0xffff971e39ed8ed8
ffff971e39ed8ed8:? 0000000000000000??????????????????? ........? ??

in_nmi() 和 softirq_count() 得查看cpu3的preempt_count变量，是0x 80110003，如下：

crash> __preempt_count
PER-CPU DATA TYPE:
? int __preempt_count;
PER-CPU ADDRESSES:
? [0]: ffff971e39e18f00
? [1]: ffff971e39e58f00
? [2]: ffff971e39e98f00
? //cpu3的preempt_count变量地址
? [3]: ffff971e39ed8f00
crash> rd -x? ffff971e39ed8f00
ffff971e39ed8f00:? 0000000080110003?

然后查看in_nmi()和softirq_count()，全都来自preempt_count()函数。

#define PREEMPT_NEED_RESCHED??? 0x80000000
static __always_inline int preempt_count(void)
{
??? return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}
#define PREEMPT_BITS??? 8
#define PREEMPT_SHIFT?? 0
#define SOFTIRQ_BITS? ??8
#define SOFTIRQ_SHIFT?? (PREEMPT_SHIFT + PREEMPT_BITS)//8
#define __IRQ_MASK(x)?? ((1UL << (x))-1)
#define HARDIRQ_SHIFT?? (SOFTIRQ_SHIFT + SOFTIRQ_BITS)//16
#define HARDIRQ_BITS??? 4
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?// ((1 << 4) - 1) << 16? 0xF << 16
#define HARDIRQ_MASK??? (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit16~bit19
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//((1 << 8) - 1) << 8??? 0x7F << 8
#define SOFTIRQ_MASK??? (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit8~bit15
# define softirq_count()??? (preempt_count() & SOFTIRQ_MASK)
#define NMI_SHIFT?? (HARDIRQ_SHIFT + HARDIRQ_BITS)//20
#define NMI_BITS??? 4
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? //((1 << 4) - 1) << 20?? 0xF << 20
#define NMI_MASK??? (__IRQ_MASK(NMI_BITS)???? << NMI_SHIFT)
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit20~bit23
#define nmi_count() (preempt_count() & NMI_MASK)
#define in_nmi()??????? (nmi_count())

cpu3的preempt_count()值是 0x80110003 , bit8~bit15 是0，bit16~bit19是1，bit20~bit23是1。即 in_nmi() 是1，in_hardirq()是1，softirq_count()是0。显然，问题根源竟然是 in_nmi() 是非0导致的。这个就是NMI计数，非0说明当前处于NMI中断。这怎么可能呢？怎么会处于NMI中断呢？什么是NMI中断，这个一般是当发生hard lock、cpu硬件级别错误时才会发生的不可屏蔽中断。可是这个问题场景只是我利用kprobe捕捉内核函数，然后产生int3中断，在中断服务函数里最后执行自定义的hot_file_update_file_status()函数，在该函数里执行kmem_cache_alloc()分配一个slab数据结构而已，这个普通场景怎么会触发NMI中断呢？

这个虚拟机是rocky 9.2 ，之前就出现过莫名奇妙的问题，怀疑内存数据有问题，难道这次又是内存数据有问题？当遇到莫名其妙的问题，就容易怀疑到内存有问题导致数据跳变、内存越界等导致？哪里会有这么多神奇的行为，还是先耐着性子看下源码吧。于是发现in_nmi()大于0是执行了 __nmi_enter()导致的，如下：

#define __nmi_enter()?????????????????????? \
??? do {??????????????????????????? \
??????? lockdep_off();????????????????? \
??????? arch_nmi_enter();?????????????? \
??????? BUG_ON(in_nmi() == NMI_MASK);?????????? \
??????? __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);?? \
??? } while (0)

再进一步，发现int3中断里竟然在irqentry_nmi_enter()函数里执行了__nmi_enter，细节如下：

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
??? irqentry_state_t irq_state;
??? irq_state.lockdep = lockdep_hardirqs_enabled();
??? __nmi_enter();//令 in_nmi 加1
??? lockdep_hardirqs_off(CALLER_ADDR0);
??? lockdep_hardirq_enter();
??? rcu_nmi_enter();
??? instrumentation_begin();
??? trace_hardirqs_off_finish();
??? ftrace_nmi_enter();
??? instrumentation_end();
??? return irq_state;
}
DEFINE_IDTENTRY_RAW(exc_int3)
{
??? if (poke_int3_handler(regs))
??????? return;
? ?if (user_mode(regs)) {
? ? ? ?……………
??? } else {
??????? //令 in_nmi 加1
??????? irqentry_state_t irq_state = irqentry_nmi_enter(regs);
??????? instrumentation_begin();
??????? //执行kprobe 的函数
??????? if (!do_int3(regs))
??????????? die("int3", regs, 0);
??????? instrumentation_end();
??????? //令 in_nmi 减1
??????? irqentry_nmi_exit(regs, irq_state);
??? }
}
static bool do_int3(struct pt_regs *regs)
{
??? int res;
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
??? if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
???????????? SIGTRAP) == NOTIFY_STOP)
??????? return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
??? //执行kprobe 的函数
??? if (kprobe_int3_handler(regs))
??????? return true;
#endif
??? res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
??? return res == NOTIFY_STOP;
}

简单说：从源码角度来说，每次kprobe触发的int3中断，在exc_int3()中执行kprobe hot_file_update_file_status函数前，都会先执行 irqentry_nmi_enter->__nmi_enter 令NMI中断加1。这样看到的，每次执行 hot_file_update_file_status->kmem_cache_alloc 函数分配slab 前，都会出发内核告警irq_fpu_usable。可实际测试是随机的？这是怎么回事？

我认为 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable，这个流程并不是每次都执行到最后的 irq_fpu_usable ，中间有很多条件判断，但是一旦执行到irq_fpu_usable ，必然会触发内核告警。

简单说，kprobe的函数里，不能执行 kmem_cache_alloc()分配slab 内存。因为kprobe的init3中断会令NMI计数加1，然后执行kmem_cache_alloc()分配slab有概率执行到irq_fpu_usable，因为NMI计数大于0而触发内核告警。

怎么解决呢？

1:在执行 hot_file_update_file_status->kmem_cache_alloc 前，自己的源码里强制令nmi计数减1，但怕对系统会有不良影响。
2:自己实现一套简易的slab源码，不执行 get_random_int() 函数就行。

但是这个两个方案改动太大，研究源码后，其实有个很简单的方法，不用动内核。

禁止掉 allocate_slab random 功能，就不会执行 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int 函数，具体改动如下：

struct hot_cold_file_global
{
??? struct kmem_cache *file_stat_cachep;
}
struct hot_cold_file_global hot_cold_file_global_info;
static void (*cache_random_seq_destroy)(struct kmem_cache *cachep);
//获取内核cache_random_seq_destroy函数指针
cache_random_seq_destroy = (void *)kallsyms_lookup_name_async("cache_random_seq_destroy");
cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
//禁止分配 file_stat_cachep? 这个slab时使用随机数功能，分配slab时就不会执行到shuffle_freelist->get_random_int函数
cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);

当执行cache_random_seq_destroy()函数后，就令slab的random_seq为NULL

void cache_random_seq_destroy(struct kmem_cache *cachep)
{
??? kfree(cachep->random_seq);
??? cachep->random_seq = NULL;
}

然后将来分配slab，执行到allocate_slab->shuffle_freelist时，直接返回false，不会再执行到get_random_int()函数。

static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
??? void *start;
??? void *cur;
??? void *next;
??? unsigned long idx, pos, page_limit, freelist_count;
? ?? //直接return false
??? if (page->objects < 2 || !s->random_seq)
??????? return false;
??? freelist_count = oo_objects(s->oo);
??? //分配随机数，这里触发crash
??? pos = get_random_int() % freelist_count;
??? ............
}

文章来源:https://blog.csdn.net/hu1610552336/article/details/135315876
本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若内容造成侵权/违法违规/事实不符，请联系我的编程经验分享网邮箱：veading@qq.com进行投诉反馈，一经查实，立即删除！