Linode Xen 下 grsecurity >= 4.3 崩溃问题
自从Linux 4.3开始,在Linode上使用PaX/grsecurity时,内核会在被pv-grub加载执行后立即崩溃。由于崩溃发生在启动后的极早期,没有任何可以用来调试的日志;同时公司也不是盖子开的,没有办法得到母机上有意义的调试信息。这导致盖子的VPS内核从去年12月开始被锁定在4.2.7。由于不知什么时候产生了Linode东京机房会在2016年6月从Xen迁移到KVM的错觉,也就一直没有花精力去尝试调试这个问题。
然而今年Linode周年庆时硬件全部翻倍,惟独东京机房除外。而根据官方最新的说法,新机房乐观估计要第四季度上线。解决内核问题就不得不提上了盖子的日程,首先是手工修复了不少CVE高危漏洞,随后又祭出diff折腾半天,内核始终会在启动后立刻死亡。而由于grsecurity并不提供git源,所以git bisect也是不可能的,唯一可用的工具只有Linux 4.2.7/补丁文件,与Linux 4.3.3/补丁文件。
在阅读代码差异时,一个很大的挑战是如何区分上游内核的修改与下游PaX/grsecurity补丁的修改。直接比较补丁文件会导致代码上下文丢失,让代码的意图不可理解。最后盖子打算编写一个工具,自动比较并去除在上游中出现的代码段,以便仅仅对PaX/grsecurity的代码进行比较,就连名字都想好了,就叫metadiff,但一直没有动手。
直到上个月和Shawn聊天时,提到了自己装个Xen也不是不可行;于是周六终于动手在VirtualBox虚拟机里装了个Debian+Xen,又在Xen里启动了一个虚拟机,果然很快就得到了内核崩溃的traceback。
rip:ffffffff8100b2b0pmu_msr_read+0x10 flags:00000282isnz rsp:ffffffff81aeff30 rax:8000000000000000rcx:0000000000000001rdx:ffffffff81aeffcc rbx:00000000c0000080rsi:ffffffff81aeffa0rdi:00000000c0000080 rbp:ffffffff81aeffa0r8:0000000000000001r9:00000000ffffffff r10:ffffffff81cf9000r11:0000000000000000r12:ffffffff81aeffcc r13:ffffffff81aeffc4r14:ffffffff81aeffc0r15:6f73b764afec1c9d cs:e033ss:e02bds:0000es:0000 fs:0000@0000000000000000 gs:0000@0000000000000000/0000000000000000 Code(instraddrffffffff8100b2b0) 000000000041544989d4554889f55389fb4883ec10<65>488b0425280000004889 Stack: 000000000000000100000000000000000000000000000000ffffffff8100b2b0 000000010000e0300000000000010082ffffffff81aeff70000000000000e02b 0000000000000000000000000000000000000000c0000080ffffffff81aeffcc ffffffff81aeffc8ffffffff810041c8ffffffff81aeffc8ffffffff81aeffcc CallTrace: [<ffffffff8100b2b0>]pmu_msr_read+0x10<-- [<ffffffff8100b2b0>]pmu_msr_read+0x10 [<ffffffff810041c8>]xen_read_msr_safe+0x18 [<ffffffff81be93eb>]xen_start_kernel+0x1b9
哦?可见内核在xen_start_kernel不久就崩溃了,这是 /* First C function to be called on Xen boot */,在如此早期就崩溃,什么错误日志都看不到也就不奇怪了。来看看xen_read_msr_safe和pmu_msr_read在4.2和4.3之间有什么改变:
---../../4.2.7/linux-4.2.7/arch/x86/xen/enlighten.c2016-09-1100:44:12.010022936+0800 +++arch/x86/xen/enlighten.c2015-12-1513:41:43.000000000+0800 @@-1030,6+1034,9@@staticu64xen_read_msr_safe(unsignedin { u64val; +if(pmu_msr_read(msr,&val,err)) +returnval; + val=native_read_msr_safe(msr,err); switch(msr){ caseMSR_IA32_APICBASE: @@-1074,9+1081,11@@staticintxen_write_msr_safe(unsignedi /*Fastsyscallsetupisalldoneinhypercalls,so theseareallignored.Stubthemoutheretostop Xenconsolenoise.*/ +break; default: -ret=native_write_msr_safe(msr,low,high); +if(!pmu_msr_write(msr,low,high,&ret)) +ret=native_write_msr_safe(msr,low,high); } returnret;
可见pmu_msr_read完全是个新东西,使用git blame继续追查。
xen/PMU: Initialization code for Xen PMU 65d0cf0be79feebeb19e7626fd3ed41ae73f642d
xen/PMU: Describe vendor-specific PMU registers e27b72df01109c689062caeba1defa013b759e0e
xen/PMU: Intercept PMU-related MSR and APIC accesses 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e
xen/PMU: PMU emulation code bf6dfb154d935725c9a2005033ca33017b9df439
发现PMU是Xen在4.3进入主线内核的新特性,于是解决方法就很简单了,把bf6dfb和6b08cd都撤销就好,接下来的事情就让PaX Team和spender去追查吧。最后的补丁是:
diff-uprNlinux-4.7.3-hardened/arch/x86/xen/apic.clinux-4.7.3-hardened.good/arch/x86/xen/apic.c ---linux-4.7.3-hardened/arch/x86/xen/apic.c2016-07-2419:23:50.000000000+0000 +++linux-4.7.3-hardened.good/arch/x86/xen/apic.c2016-09-1020:05:21.450647009+0000 @@-7,7+7,6@@ #include<xen/xen.h> #include<xen/interface/physdev.h> #include"xen-ops.h" -#include"pmu.h" #include"smp.h" staticunsignedintxen_io_apic_read(unsignedapic,unsignedreg) @@-73,10+72,8@@staticu32xen_apic_read(u32reg) staticvoidxen_apic_write(u32reg,u32val) { -if(reg==APIC_LVTPC){ -(void)pmu_apic_update(reg); +if(reg==APIC_LVTPC) return; -} /*Warntoseeifthere'sanystrayreferences*/ WARN(1,"register:%x,value:%x\n",reg,val); diff-uprNlinux-4.7.3-hardened/arch/x86/xen/enlighten.clinux-4.7.3-hardened.good/arch/x86/xen/enlighten.c ---linux-4.7.3-hardened/arch/x86/xen/enlighten.c2016-09-1019:59:29.237313676+0000 +++linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c2016-09-1020:06:49.683980342+0000 @@-1031,9+1031,6@@staticu64xen_read_msr_safe(unsignedin { u64val; -if(pmu_msr_read(msr,&val,err)) -returnval; - val=native_read_msr_safe(msr,err); switch(msr){ caseMSR_IA32_APICBASE: @@-1081,13+1078,17@@staticintxen_write_msr_safe(unsignedi break; default: -if(!pmu_msr_write(msr,low,high,&ret)) -ret=native_write_msr_safe(msr,low,high); +ret=native_write_msr_safe(msr,low,high); } returnret; } +unsignedlonglongxen_read_pmc(intcounter) +{ +return0; +} + staticu64xen_read_msr(unsignedintmsr) { /* diff-uprNlinux-4.7.3-hardened/arch/x86/xen/pmu.clinux-4.7.3-hardened.good/arch/x86/xen/pmu.c ---linux-4.7.3-hardened/arch/x86/xen/pmu.c2016-07-2419:23:50.000000000+0000 +++linux-4.7.3-hardened.good/arch/x86/xen/pmu.c2016-09-1020:05:21.450647009+0000 @@-13,20+13,11@@ /*x86_pmu.handle_irqdefinition*/ #include"../events/perf_event.h" -#defineXENPMU_IRQ_PROCESSING1 -structxenpmu{ -/*Sharedpagebetweenhypervisoranddomain*/ -structxen_pmu_data*xenpmu_data; -uint8_tflags; -}; -staticDEFINE_PER_CPU(structxenpmu,xenpmu_shared); 
-#defineget_xenpmu_data()(this_cpu_ptr(&xenpmu_shared)->xenpmu_data) -#defineget_xenpmu_flags()(this_cpu_ptr(&xenpmu_shared)->flags) - -/*MacroforcomputingaddressofaPMUMSRbank*/ -#definefield_offset(ctxt,field)((void*)((uintptr_t)ctxt+\ -(uintptr_t)ctxt->field)) +/*Sharedpagebetweenhypervisoranddomain*/ +staticDEFINE_PER_CPU(structxen_pmu_data*,xenpmu_shared); +#defineget_xenpmu_data()per_cpu(xenpmu_shared,smp_processor_id()) + /*AMDPMU*/ #defineF15H_NUM_COUNTERS6 @@-60,8+51,6@@static__read_mostlyintamd_num_counter /*Aliasregisters(0x4c1)forfull-widthwritestoPMCs*/ #defineMSR_PMC_ALIAS_MASK(~(MSR_IA32_PERFCTR0^MSR_IA32_PMC0)) -#defineINTEL_PMC_TYPE_SHIFT30 - static__read_mostlyintintel_num_arch_counters,intel_num_fixed_counters; @@-178,232+167,6@@staticintis_intel_pmu_msr(u32msr_inde } } -staticboolxen_intel_pmu_emulate(unsignedintmsr,u64*val,inttype, -intindex,boolis_read) -{ -uint64_t*reg=NULL; -structxen_pmu_intel_ctxt*ctxt; -uint64_t*fix_counters; -structxen_pmu_cntr_pair*arch_cntr_pair; -structxen_pmu_data*xenpmu_data=get_xenpmu_data(); -uint8_txenpmu_flags=get_xenpmu_flags(); - - -if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)) -returnfalse; - -ctxt=&xenpmu_data->pmu.c.intel; - -switch(msr){ -caseMSR_CORE_PERF_GLOBAL_OVF_CTRL: -reg=&ctxt->global_ovf_ctrl; -break; -caseMSR_CORE_PERF_GLOBAL_STATUS: -reg=&ctxt->global_status; -break; -caseMSR_CORE_PERF_GLOBAL_CTRL: -reg=&ctxt->global_ctrl; -break; -caseMSR_CORE_PERF_FIXED_CTR_CTRL: -reg=&ctxt->fixed_ctrl; -break; -default: -switch(type){ -caseMSR_TYPE_COUNTER: -fix_counters=field_offset(ctxt,fixed_counters); -reg=&fix_counters[index]; -break; -caseMSR_TYPE_ARCH_COUNTER: -arch_cntr_pair=field_offset(ctxt,arch_counters); -reg=&arch_cntr_pair[index].counter; -break; -caseMSR_TYPE_ARCH_CTRL: -arch_cntr_pair=field_offset(ctxt,arch_counters); -reg=&arch_cntr_pair[index].control; -break; -default: -returnfalse; -} -} - -if(reg){ -if(is_read) -*val=*reg; -else{ -*reg=*val; - 
-if(msr==MSR_CORE_PERF_GLOBAL_OVF_CTRL) -ctxt->global_status&=(~(*val)); -} -returntrue; -} - -returnfalse; -} - -staticboolxen_amd_pmu_emulate(unsignedintmsr,u64*val,boolis_read) -{ -uint64_t*reg=NULL; -inti,off=0; -structxen_pmu_amd_ctxt*ctxt; -uint64_t*counter_regs,*ctrl_regs; -structxen_pmu_data*xenpmu_data=get_xenpmu_data(); -uint8_txenpmu_flags=get_xenpmu_flags(); - -if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)) -returnfalse; - -if(k7_counters_mirrored&& -((msr>=MSR_K7_EVNTSEL0)&&(msr<=MSR_K7_PERFCTR3))) -msr=get_fam15h_addr(msr); - -ctxt=&xenpmu_data->pmu.c.amd; -for(i=0;i<amd_num_counters;i++){ -if(msr==amd_ctrls_base+off){ -ctrl_regs=field_offset(ctxt,ctrls); -reg=&ctrl_regs[i]; -break; -}elseif(msr==amd_counters_base+off){ -counter_regs=field_offset(ctxt,counters); -reg=&counter_regs[i]; -break; -} -off+=amd_msr_step; -} - -if(reg){ -if(is_read) -*val=*reg; -else -*reg=*val; - -returntrue; -} -returnfalse; -} - -boolpmu_msr_read(unsignedintmsr,uint64_t*val,int*err) -{ -if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD){ -if(is_amd_pmu_msr(msr)){ -if(!xen_amd_pmu_emulate(msr,val,1)) -*val=native_read_msr_safe(msr,err); -returntrue; -} -}else{ -inttype,index; - -if(is_intel_pmu_msr(msr,&type,&index)){ -if(!xen_intel_pmu_emulate(msr,val,type,index,1)) -*val=native_read_msr_safe(msr,err); -returntrue; -} -} - -returnfalse; -} - -boolpmu_msr_write(unsignedintmsr,uint32_tlow,uint32_thigh,int*err) -{ -uint64_tval=((uint64_t)high<<32)|low; - -if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD){ -if(is_amd_pmu_msr(msr)){ -if(!xen_amd_pmu_emulate(msr,&val,0)) -*err=native_write_msr_safe(msr,low,high); -returntrue; -} -}else{ -inttype,index; - -if(is_intel_pmu_msr(msr,&type,&index)){ -if(!xen_intel_pmu_emulate(msr,&val,type,index,0)) -*err=native_write_msr_safe(msr,low,high); -returntrue; -} -} - -returnfalse; -} - -staticunsignedlonglongxen_amd_read_pmc(intcounter) -{ -structxen_pmu_amd_ctxt*ctxt; -uint64_t*counter_regs; 
-structxen_pmu_data*xenpmu_data=get_xenpmu_data(); -uint8_txenpmu_flags=get_xenpmu_flags(); - -if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)){ -uint32_tmsr; -interr; - -msr=amd_counters_base+(counter*amd_msr_step); -returnnative_read_msr_safe(msr,&err); -} - -ctxt=&xenpmu_data->pmu.c.amd; -counter_regs=field_offset(ctxt,counters); -returncounter_regs[counter]; -} - -staticunsignedlonglongxen_intel_read_pmc(intcounter) -{ -structxen_pmu_intel_ctxt*ctxt; -uint64_t*fixed_counters; -structxen_pmu_cntr_pair*arch_cntr_pair; -structxen_pmu_data*xenpmu_data=get_xenpmu_data(); -uint8_txenpmu_flags=get_xenpmu_flags(); - -if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)){ -uint32_tmsr; -interr; - -if(counter&(1<<INTEL_PMC_TYPE_SHIFT)) -msr=MSR_CORE_PERF_FIXED_CTR0+(counter&0xffff); -else -msr=MSR_IA32_PERFCTR0+counter; - -returnnative_read_msr_safe(msr,&err); -} - -ctxt=&xenpmu_data->pmu.c.intel; -if(counter&(1<<INTEL_PMC_TYPE_SHIFT)){ -fixed_counters=field_offset(ctxt,fixed_counters); -returnfixed_counters[counter&0xffff]; -} - -arch_cntr_pair=field_offset(ctxt,arch_counters); -returnarch_cntr_pair[counter].counter; -} - -unsignedlonglongxen_read_pmc(intcounter) -{ -if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD) -returnxen_amd_read_pmc(counter); -else -returnxen_intel_read_pmc(counter); -} - -intpmu_apic_update(uint32_tval) -{ -intret; -structxen_pmu_data*xenpmu_data=get_xenpmu_data(); - -if(!xenpmu_data){ -pr_warn_once("%s:pmudatanotinitialized\n",__func__); -return-EINVAL; -} - -xenpmu_data->pmu.l.lapic_lvtpc=val; - -if(get_xenpmu_flags()&XENPMU_IRQ_PROCESSING) -return0; - -ret=HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set,NULL); - -returnret; -} - /*perfcallbacks*/ staticintxen_is_in_guest(void) { @@-476,37+239,26@@staticvoidxen_convert_regs(conststruc irqreturn_txen_pmu_irq_handler(intirq,void*dev_id) { -interr,ret=IRQ_NONE; +intret=IRQ_NONE; structpt_regsregs; conststructxen_pmu_data*xenpmu_data=get_xenpmu_data(); -uint8_txenpmu_flags=get_xenpmu_flags(); 
if(!xenpmu_data){ pr_warn_once("%s:pmudatanotinitialized\n",__func__); returnret; } -this_cpu_ptr(&xenpmu_shared)->flags= -xenpmu_flags|XENPMU_IRQ_PROCESSING; xen_convert_regs(&xenpmu_data->pmu.r.regs,&regs, xenpmu_data->pmu.pmu_flags); if(x86_pmu.handle_irq(&regs)) ret=IRQ_HANDLED; -/*WriteoutcachedcontexttoHW*/ -err=HYPERVISOR_xenpmu_op(XENPMU_flush,NULL); -this_cpu_ptr(&xenpmu_shared)->flags=xenpmu_flags; -if(err){ -pr_warn_once("%s:failedhypercall,err:%d\n",__func__,err); -returnIRQ_NONE; -} - returnret; } boolis_xen_pmu(intcpu) { -return(get_xenpmu_data()!=NULL); +return(per_cpu(xenpmu_shared,cpu)!=NULL); } voidxen_pmu_init(intcpu) @@-536,8+288,7@@voidxen_pmu_init(intcpu) if(err) gotofail; -per_cpu(xenpmu_shared,cpu).xenpmu_data=xenpmu_data; -per_cpu(xenpmu_shared,cpu).flags=0; +per_cpu(xenpmu_shared,cpu)=xenpmu_data; if(cpu==0){ perf_register_guest_info_callbacks(&xen_guest_cbs); @@-565,6+316,6@@voidxen_pmu_finish(intcpu) (void)HYPERVISOR_xenpmu_op(XENPMU_finish,&xp); -free_pages((unsignedlong)per_cpu(xenpmu_shared,cpu).xenpmu_data,0); -per_cpu(xenpmu_shared,cpu).xenpmu_data=NULL; +free_pages((unsignedlong)per_cpu(xenpmu_shared,cpu),0); +per_cpu(xenpmu_shared,cpu)=NULL; } diff-uprNlinux-4.7.3-hardened/arch/x86/xen/pmu.hlinux-4.7.3-hardened.good/arch/x86/xen/pmu.h ---linux-4.7.3-hardened/arch/x86/xen/pmu.h2016-07-2419:23:50.000000000+0000 +++linux-4.7.3-hardened.good/arch/x86/xen/pmu.h2016-09-1020:05:21.453980342+0000 @@-7,9+7,5@@irqreturn_txen_pmu_irq_handler(intirq, voidxen_pmu_init(intcpu); voidxen_pmu_finish(intcpu); boolis_xen_pmu(intcpu); -boolpmu_msr_read(unsignedintmsr,uint64_t*val,int*err); -boolpmu_msr_write(unsignedintmsr,uint32_tlow,uint32_thigh,int*err); -intpmu_apic_update(uint32_treg); -unsignedlonglongxen_read_pmc(intcounter); #endif/*__XEN_PMU_H*/
打好补丁再编译内核,被智子锁定版本的内核果然升级成功了。
$ uname -r
4.7.3-hardened
更新:官方已在grsecurity-3.1-4.7.4-201609152234.patch中修复问题,不再需要此workaround。