内核block层开发时遇到的各种变量同步问题总结
本文是我在开发一个内核模块(统计进程级的IO派发延迟)时,遇到的一系列并发问题总结。这个内核模块的详细功能在《一次无语的内核调试经历(内核卡死、内核内存越界、spin lock锁异常)》开头第1节有详细介绍,希望读者先看下,本文不再介绍。
这个内核模块的基本功能是:在IO请求(简称为rq或者req)插入IO队列时执行的blk_mq_sched_request_inserted函数中记录rq插入IO队列的时间点,在IO请求派发函数blk_mq_dispatch_rq_list中记录rq真正派发给磁盘驱动的时间点,在IO请求传输完成时执行的blk_account_io_done函数中计算IO请求在磁盘驱动层的传输耗时。最初的源码如下:
- void blk_mq_sched_request_inserted(struct request *rq)
- {
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable){
- ??????? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??????? struct process_io_info *p_process_io_info_tmp = NULL;
- ??????? ………………….
- ? ? ? ?//为每个rq分配一个process_io_info
- ??????? p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
- ??????? memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
- ??????? //为每个派发IO的进程分配一个process_io_stat
- ??????? p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
- ??????? memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat))
- ??????? //记录rq所属进程pid及名字
- ?????? ?p_process_io_info_tmp->pid = current->pid;
- ??????? strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
- ???????
- ??????? p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
- ??????? smp_mb();
- ??????? //记录rq插入IO队列的时间点
- ??????? p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
- ??????? p_process_rq_stat_tmp->rq = rq;
- ??????? rq->p_process_rq_stat = p_process_rq_stat_tmp;
- ??
- ???????? spin_lock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));??????? list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head));
- ??????? spin_unlock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
- ??????? return;
- ??? }
- }
- bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- ???????????????? bool got_budget)
- {
- ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??? struct process_io_info *p_process_io_info_tmp = NULL;
- ??? ...............
- ??? ret = q->mq_ops->queue_rq(hctx, &bd);
- ??? ...............
- ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
- ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
- ??????? //记录rq真正派发给磁盘驱动的时间点
- ??????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
- ??? }? ??
- }
- void blk_account_io_done(struct request *req, u64 now)
- {
- ??? .......................
- ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
- ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
- ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
- ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
- ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
- ???????????
- ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
- ?????????? ?p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
- ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
- ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
- ??????????? }
- ??????????? req->p_process_rq_stat = NULL;
- ??? }
- ??? .......................
- }
在blk_mq_dispatch_rq_list函数执行q->mq_ops->queue_rq真正派发rq到磁盘驱动后,记录rq的真实派发时间:
- bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- ???????????????? bool got_budget)
- {
- ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??? struct process_io_info *p_process_io_info_tmp = NULL;
- ??? ...............
- ??? ret = q->mq_ops->queue_rq(hctx, &bd);
- ??? ...............
- ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
- ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
- ??????
- ????????? ??//p_process_rq_stat_tmp->rq_real_issue_time 非0说明是无效的
- ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
- ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????????? //计算rq的真实派发时间
- ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
- ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????? }
- ??? }
- }
在print_process_io_info函数打印IO延迟参数,但遇到如下异常打印:
- 1:打印的 max_real_dc_time 非常大
- kworker/3:0 6202 rq_count:1 io_size:0M max_id_time:9us max_dc_time:693us max_idc_time:702us max_real_dc_time:1575904706us max_hctx_list_rq:0 rq_inflght_issue:0_1 rq_inflght_done:0_1? avg_id_time:9us avg_dc_time:693us avg_idc_time:702us
- 2:打印的 max_real_dc_time 是负数
- fio 6386 rq_count:1083 io_size:4M max_id_time:61199us max_dc_time:3106us max_idc_time:62114us max_real_dc_time:-2075967423us max_hctx_list_rq:0 rq_inflght_issue:219_28 rq_inflght_done:221_31? avg_id_time:9774us avg_dc_time:1391us avg_idc_time:11165us
为什么会出现以上问题呢?在把IO请求插入IO运行队列时会对struct process_rq_stat *p_process_rq_stat_tmp清0。执行到blk_mq_dispatch_rq_list()里的if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info),正常rq这个IO请求还没派发完成,此时执行blk_account_io_done()里的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time计算的real_dc_time是准确的。
但是也可能已经派发完成并执行了blk_account_io_done()里计算real_dc_time的代码,此时p_process_rq_stat_tmp->rq_real_issue_time还没在blk_mq_dispatch_rq_list函数中赋值,rq_real_issue_time还是初值0。此时计算的real_dc_time等于当前时间减0,也就是ktime_to_us(ktime_get())返回的当前时间值本身,是有问题的,就会出现“max_real_dc_time:1575904706us”这么大的值。并且这个rq很快又被新的进程分配,并分配新的rq->p_process_rq_stat。接着才执行到blk_mq_dispatch_rq_list函数这里,错误执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())。而这个rq已经属于新的进程,只是插入IO队列,还没有派发!简单说rq->p_process_rq_stat已经是新分配的了。等这个rq接下来真的被派发,执行到blk_mq_dispatch_rq_list函数就会发现p_process_rq_stat_tmp->rq_real_issue_time不是0。这种情况就需要blk_mq_dispatch_rq_list函数中在ret = q->mq_ops->queue_rq(hctx, &bd)派发rq给磁盘驱动前后,判断rq->p_process_rq_stat所属的进程是不是变了。
那为什么max_real_dc_time会是负数呢?执行到blk_mq_dispatch_rq_list函数派发IO后返回但还没执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())时,rq正好派发完成执行blk_account_io_done()函数,rq->p_process_rq_stat还没清NULL,因为blk_mq_dispatch_rq_list函数里的if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat)成立,里边的p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())和 blk_account_io_done()里的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get())-p_process_rq_stat_tmp->rq_real_issue_time就会同时执行,谁前谁后不一定。
这样就可能blk_account_io_done()里先ktime_to_us(ktime_get()),然后blk_mq_dispatch_rq_list函数执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())对p_process_rq_stat_tmp->rq_real_issue_time赋值,此时blk_account_io_done函数执行的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time就可能是负数。这种情况需要加锁保护,绝对保证两个函数对rq_real_issue_time的使用或赋值,同时只有一个进程在进行。
根据以上两种情况,这样修改源码,并添加一些调试信息,下文用到:
- bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- ???????????????? bool got_budget)
- {
- ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??? struct process_io_info *p_process_io_info_tmp = NULL;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
- ??????? //先保存rq所属进程PID
- ??????? rq_pid = rq->p_process_rq_stat->p_process_io_info->pid;
- ??????? printk("1:%s %s %d\n",__func__,current->comm,current->pid);
- ??? }
- ??? ...............
- ??? ret = q->mq_ops->queue_rq(hctx, &bd);
- ??? ...............
- ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
- ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
- ??????? printk("2:%s %s %d\n",__func__,current->comm,current->pid);
- ??????? //派发rq前后rq所属进程必须是同一个
- ??????? if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)){
- ??????????? //p_process_rq_stat_tmp->rq_real_issue_time 非0说明是无效的,舍弃
- ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
- ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
- ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????? }
- ???? ???????else
- ??????????? {
- ??????????????? printk(KERN_DEBUG"%s rq_real_issue_time:%llu rq_issue_time:%llu rq_inset_time:%llu p_process_io_info_tmp:%p\n",__func__,p_process_rq_stat_tmp->rq_real_issue_time,p_process_rq_stat
- _tmp->rq_issue_time,p_process_rq_stat_tmp->rq_inset_time,p_process_io_info_tmp);
- ??????????? }
- ??????? }
- ??? }
- }
- void blk_account_io_done(struct request *req, u64 now)
- {
- ??? .......................
- ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
- ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
- ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
- ??????????? printk("%s rq:0x%llx process_rq_stat:0x%llx p_process_io_info_tmp:0x%llx pid:%d\n",__func__,(u64)req,(u64)(req->p_process_rq_stat),(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid);
- ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
- ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
- ???????????
- ??????????? spin_lock(&(p_process_io_info_tmp->io_data_lock));
- ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
- ??????????? p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
- ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
- ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
- ??????????? }
- ??????????? rq->p_process_rq_stat = NULL;
- ??????????? spin_unlock(&(p_process_io_info_tmp->io_data_lock));
- ??? }
- ??? .......................
- }
blk_mq_sched_request_inserted函数源码再贴下:
- void blk_mq_sched_request_inserted(struct request *rq)
- {
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable){
- ??????? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??????? struct process_io_info *p_process_io_info_tmp = NULL;
- ???????
- ??????? p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
- ??????? memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
- ??????? p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
- ??????? memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat))
- ??????? p_process_io_info_tmp->pid = current->pid;
- ??????? strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
- ???????
- ??????? p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
- ??????? smp_mb();
- ??????? p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
- ??????? p_process_rq_stat_tmp->rq = rq;
- ??????? rq->p_process_rq_stat = p_process_rq_stat_tmp;
- ??
- ??????? spin_lock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
- ??????? /*为什么不直接把rq添加到process_io_insert_head链表,而是把rq->p_process_rq_stat添加到process_io_insert_head链表。这是因为可能在IO传输完成执行blk_account_io_done()后可能会释放掉rq。然后print_process_io_info()中从process_io_insert_head遍历到这个被释放的rq,使用rq->p_process_rq_stat->rq_inset_time就有问题了,因为rq已经失效了*/
- ??????? list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head));
- ??????? spin_unlock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
- ???????
- ??????? printk("%s rq:0x%llx process_rq_stat:0x%llx rq_inset_time:%lld? p_process_io_info_tmp:0x%llx pid:%d rq_real_issue_time:%lld\n",__func__,(u64)rq,(u64)(rq->p_process_rq_stat),p_proce
- ss_rq_stat_tmp->rq_inset_time,(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid,p_process_rq_stat_tmp->rq_real_issue_time);
- ??????? return;
- ??? }
- }
blk_mq_sched_request_inserted函数有两点需要注意:
- 1:在 p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp 和 p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get())之间加了smp_mb()内存屏障,这是为了保证两个赋值的先后顺序。p_process_rq_stat_tmp->p_process_io_info表示该rq已经与进程绑定了,有了这个赋值才能使用p_process_rq_stat_tmp->rq_inset_time、p_process_rq_stat_tmp->rq等信息。
- 2:用的list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head))而不是list_add(&rq->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head)),原因是:IO传输完成执行blk_account_io_done()后可能会释放掉rq。然后print_process_io_info()中从process_io_insert_head遍历到这个被释放的rq,使用rq->p_process_rq_stat->rq_inset_time就有问题了,因为rq已经失效了。rq的分配和释放我不能控制,但是rq->p_process_rq_stat的分配和释放我可以控制!在blk_account_io_done函数中,先使用rq->p_process_rq_stat信息,再释放,防止丢失IO采集信息。
继续测试,又来了新的问题:压测时blk_mq_dispatch_rq_list函数会有打印如下:
[ 5071.063712] blk_mq_dispatch_rq_list rq:0xffff9e0ca34be4d0 process_rq_stat:0xffff9e0caba03b40 rq_real_issue_time:5070726879 p_process_io_info_tmp:0xffff9e0c77b086e8 pid:7129
就是说,blk_mq_dispatch_rq_list函数中if(p_process_rq_stat_tmp->rq_real_issue_time == 0)总是不成立,而走else分支。这是不合理的,我已经做了各种防护,为什么p_process_rq_stat_tmp->rq_real_issue_time不是0呢?不是0就说明p_process_rq_stat_tmp->rq_real_issue_time已经被其他进程赋值过了!这个很不合理,但是奇葩情况见多了,解决这种奇葩的问题,就要用最简单的方法。于是在IO请求插入IO算法队列的blk_mq_sched_request_inserted函数、IO派发给磁盘驱动执行的blk_mq_start_request和blk_mq_dispatch_rq_list函数、IO请求传输完成执行的blk_account_io_done函数,添加printk打印rq、p_process_rq_stat_tmp、p_process_rq_stat_tmp->rq_real_issue_time等信息。这些调试信息,前文列举源码时已经添加,这里只需再列下blk_mq_start_request的:
- void blk_mq_start_request(struct request *rq)? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
- {
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
- ??????? struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
- ??????? struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
- ???????
- ??????? p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??
- ??????? p_process_rq_stat_tmp->id_time = p_process_rq_stat_tmp->rq_issue_time - p_process_rq_stat_tmp->rq_inset_time;
- ??????? printk("%s %s %d rq:0x%llx process_rq_stat:0x%llx rq_issue_time:%lld p_process_io_info_tmp:0x%llx pid:%d? rq_real_issue_time:%lld\n",__func__,current->comm,current->pid,(u64)rq,(u64)(rq->p_process_rq_stat),p_process_rq_stat_tmp->rq_issue_time,(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid,p_process_rq_stat_tmp->rq_real_issue_time);
- ??? }
- }
实际调试下来,这个并发问题简直离谱到家了!问题的根源还是在blk_mq_dispatch_rq_list函数,再看下它的源码:
- bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- ???????????????? bool got_budget)
- {
- ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
- ??? struct process_io_info *p_process_io_info_tmp = NULL;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
- ??????? //先保存rq所属进程PID
- ??????? rq_pid = rq->p_process_rq_stat->p_process_io_info->pid;
- ??????? printk("1:%s %s %d\n",__func__,current->comm,current->pid);
- ??? }
- ...............
- ? ?//派发rq到磁盘驱动
- ??? ret = q->mq_ops->queue_rq(hctx, &bd);
- ??? ...............
- ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
- ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
- ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
- ??????? printk("2:%s %s %d\n",__func__,current->comm,current->pid);
- ??????? //派发rq前后rq所属进程必须是同一个
- ??????? if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)){
- ??????????? //p_process_rq_stat_tmp->rq_real_issue_time 非0说明是无效的,舍弃
- ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
- ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
- ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????? }
- ??????????? else
- ??????????? {
- ??????????????? printk(KERN_DEBUG"%s rq_real_issue_time:%llu rq_issue_time:%llu rq_inset_time:%llu p_process_io_info_tmp:%p\n",__func__,p_process_rq_stat_tmp->rq_real_issue_time,p_process_rq_stat
- _tmp->rq_issue_time,p_process_rq_stat_tmp->rq_inset_time,p_process_io_info_tmp);
- ??????????? }
- ??????? }
- ??? }
- }
blk_mq_dispatch_rq_list函数中,首先rq_pid = rq->p_process_rq_stat->p_process_io_info->pid记录这个rq所属的进程pid,然后执行q->mq_ops->queue_rq(hctx, &bd)派发rq到磁盘驱动。接着执行if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid))判断rq所属进程是否变了,没有的话才会执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())对 p_process_rq_stat_tmp->rq_real_issue_time赋值rq派发后的时间。
问题的关键点就是:在q->mq_ops->queue_rq(hctx, &bd) 和 if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)) 之间极短的时间内,这个rq派发完成了,中断里执行blk_account_io_done释放掉了这个rq及它的process_io_info。然后又被同一个rq_pid进程立即分配了同一个rq,然后传输这个rq。在把rq插入到IO队列执行blk_mq_sched_request_inserted函数时,又为这个rq分配了同一个process_io_info!虽然难以置信,但这种情况是完全可能成立的!你要考虑到,软中断是可以打断当前进程的。并且这是虚拟机测试环境,更容易发生。
同一个进程,同一个rq,同一个process_io_info!因此老的还在blk_mq_dispatch_rq_list函数里的进程,执行到q->mq_ops->queue_rq(hctx, &bd) 后,执行到 if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid))时,这个if就是成立的,但是这个rq及process_io_info却被其他进程分配走了,对当前进程来说是无效的。其他进程完全可以也在执行blk_mq_dispatch_rq_list函数,并对p_process_rq_stat_tmp->rq_real_issue_time赋值。因此这个当前进程就会发现if(p_process_rq_stat_tmp->rq_real_issue_time == 0)竟然不成立。
遇到离谱的事,先不要动不动就怀疑内存有问题、内存越界!要先静下心来把可能的情况耐心复盘一下,往往此时就能发现契机!并且,有些问题可能在物理机上测试很难发生,但是在虚拟机环境却很容易发生,还是要在尽可能多的环境测试!
最后,再说下其他变量同步问题:
blk_mq_sched_request_inserted函数中为派发IO的进程分配process_rq_stat和process_io_info结构:每次传输IO请求分配一个process_rq_stat结构,每个派发IO的进程则分配一个process_io_info结构。process_io_info与进程绑定,process_rq_stat与IO请求rq绑定。IO请求派发给磁盘驱动执行的blk_mq_start_request函数、IO请求传输完成执行的blk_account_io_done函数中,计算IO请求传输的id和dc耗时,并保存到process_rq_stat和process_io_info结构体中。
print_process_io_info函数每隔1s采集一次所有派发IO进程的process_io_info信息,获取每个进程传输IO的最大id、dc、传输数据量、iops、在磁盘驱动最大IO数、在IO算法队列最大延迟等IO数据,然后printk打印出来,最后对process_io_info清0。如果此时process_io_info绑定的进程也在对process_io_info结构体成员赋值,那二者就存在数据同步问题。这种情况只能使用spin lock解决,保证同时只有一个进程在使用或者修改process_io_info结构体信息。源码如下:
- void blk_account_io_done(struct request *req, u64 now)
- {
- ??? .......................
- ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
- ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
- ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
- ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
- ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
- ??????????? ?//加锁防护
- ??????????? spin_lock(&(p_process_io_info_tmp->io_data_lock));
- ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
- ??????????? p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
- ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
- ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
- ??????????? }
- ??????????? rq->p_process_rq_stat = NULL;
- ??????????? spin_unlock(&(p_process_io_info_tmp->io_data_lock));
- ??? }
- ??? .......................
- }
- ……………….
- void print_process_io_info(struct process_io_control *p_process_io_tmp)
- {
- ??? struct process_io_info *p_process_io_info_tmp = NULL;
- ???
- ??? list_for_each_entry_rcu(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){
- ??????? if(p_process_io_info_tmp->complete_rq_count != 0){
- ??????????? //加锁防护
- ??????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????? .............
- ??????????? max_id_time = p_process_io_info_tmp->max_id_time;
- ??????????? max_dc_time = p_process_io_info_tmp->max_dc_time;
- ??????????? max_idc_time = p_process_io_info_tmp->max_idc_time;
- ???????????
- ??????????? p_process_io_info_tmp->max_id_time = 0;
- ??????????? p_process_io_info_tmp->max_dc_time = 0;
- ??????????? p_process_io_info_tmp->max_idc_time = 0;
- ??????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
- ??????????? printk打印 max_id_time、max_dc_time、max_idc_time 信息
- ??????? }else{
- ??????????? spin_lock_irq(&(p_process_io_tmp->process_lock_list));
- ??????????? list_del_rcu(&p_process_io_info_tmp->process_io_info_list);
- ?????????? ?spin_unlock_irq(&(p_process_io_tmp->process_lock_list));
- ??????? }
- ??? }
- }
明显,io_data_lock锁保证了blk_account_io_done和print_process_io_info函数对process_io_info结构体成员的读取、赋值、清0都是独占的,不存在数据同步问题。
还有一个问题是,如果多个地方都需要使用spin lock锁,最好定义多个spin lock锁,防止单个spin lock的临界区代码过多。比如blk_mq_sched_request_inserted函数和print_process_io_info函数中,使用单独的process_io_insert_lock锁保护process_io_insert_head链表:把process_rq_stat添加到该链表、从链表删除、遍历链表时,同一时刻只有一个进程在操作,避免出现数据同步问题。代码前文已经列出。
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!