Linode Xen 下 grsecurity >= 4.3 崩溃问题
自从Linux 4.3开始,在Linode上使用PaX/grsecurity时,内核会在被pv-grub执行后立即崩溃。由于崩溃发生在启动后的极早期,没有任何可以用来调试的日志,同时公司也不是盖子开的,也没有办法得到母机上有意义的调试信息。这导致了盖子的VPS内核从去年12月开始被锁定在4.2.7。由于不知什么时候产生了Linode东京机房会在2016年6月从Xen迁移到KVM的错觉,也没有花精力去尝试调试这个问题。
然而今年Linode周年庆时硬件全部翻倍,惟独东京机房除外。而根据官方最新的说法,新机房乐观估计要第四季度上线。解决内核问题就不得不提上了盖子的日程,首先是手工修复了不少CVE高危漏洞,随后又祭出diff折腾半天,内核始终会在启动后立刻死亡。而由于grsecurity并不提供git源,所以git bisect也是不可能的,唯一可用的工具只有Linux 4.2.7/补丁文件,与Linux 4.3.3/补丁文件。
在阅读代码差异时,一个很大的挑战是如何区分上游内核的修改与下游PaX/grsecurity补丁的修改。直接比较补丁文件会导致代码上下文丢失,让代码的意图不可理解。最后盖子打算编写一个名为metadiff的工具,自动比较并去除在上游中出现的代码段,以便仅仅对PaX/grsecurity的代码进行比较,就连名字都想好了就叫metadiff,但一直没有动手。
直到上个月和Shawn聊天时,提到了自己装个Xen也不是不可行;于是周六终于动手在VirtualBox虚拟机里装了个Debian+Xen,又在Xen里启动了一个虚拟机,果然很快就得到了内核崩溃的traceback。
rip:ffffffff8100b2b0pmu_msr_read+0x10 flags:00000282isnz rsp:ffffffff81aeff30 rax:8000000000000000rcx:0000000000000001rdx:ffffffff81aeffcc rbx:00000000c0000080rsi:ffffffff81aeffa0rdi:00000000c0000080 rbp:ffffffff81aeffa0r8:0000000000000001r9:00000000ffffffff r10:ffffffff81cf9000r11:0000000000000000r12:ffffffff81aeffcc r13:ffffffff81aeffc4r14:ffffffff81aeffc0r15:6f73b764afec1c9d cs:e033ss:e02bds:0000es:0000 fs:0000@0000000000000000 gs:0000@0000000000000000/0000000000000000 Code(instraddrffffffff8100b2b0) 000000000041544989d4554889f55389fb4883ec10<65>488b0425280000004889 Stack: 000000000000000100000000000000000000000000000000ffffffff8100b2b0 000000010000e0300000000000010082ffffffff81aeff70000000000000e02b 0000000000000000000000000000000000000000c0000080ffffffff81aeffcc ffffffff81aeffc8ffffffff810041c8ffffffff81aeffc8ffffffff81aeffcc CallTrace: [<ffffffff8100b2b0>]pmu_msr_read+0x10<-- [<ffffffff8100b2b0>]pmu_msr_read+0x10 [<ffffffff810041c8>]xen_read_msr_safe+0x18 [<ffffffff81be93eb>]xen_start_kernel+0x1b9
哦?可见内核在xen_start_kernel不久就崩溃了,这是 /* First C function to be called on Xen boot */,在如此早期就崩溃,什么错误日志都看不到也就不奇怪了。来看看xen_read_msr_safe和pmu_msr_read在4.2和4.3之间有什么改变:
---../../4.2.7/linux-4.2.7/arch/x86/xen/enlighten.c2016-09-1100:44:12.010022936+0800
+++arch/x86/xen/enlighten.c2015-12-1513:41:43.000000000+0800
@@-1030,6+1034,9@@staticu64xen_read_msr_safe(unsignedin
{
u64val;
+if(pmu_msr_read(msr,&val,err))
+returnval;
+
val=native_read_msr_safe(msr,err);
switch(msr){
caseMSR_IA32_APICBASE:
@@-1074,9+1081,11@@staticintxen_write_msr_safe(unsignedi
/*Fastsyscallsetupisalldoneinhypercalls,so
theseareallignored.Stubthemoutheretostop
Xenconsolenoise.*/
+break;
default:
-ret=native_write_msr_safe(msr,low,high);
+if(!pmu_msr_write(msr,low,high,&ret))
+ret=native_write_msr_safe(msr,low,high);
}
returnret;
可见pmu_msr_read完全是个新东西,使用git blame继续追查。
xen/PMU: Initialization code for Xen PMU 65d0cf0be79feebeb19e7626fd3ed41ae73f642d
xen/PMU: Describe vendor-specific PMU registers e27b72df01109c689062caeba1defa013b759e0e
xen/PMU: Intercept PMU-related MSR and APIC accesses 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e
xen/PMU: PMU emulation code bf6dfb154d935725c9a2005033ca33017b9df439
发现PMU是Xen在4.3进入主线内核的新特性,于是解决方法就很简单了,把bf6dfb和6b08cd都撤销就好,接下来的事情就让PaX Team和spender去追查吧。最后的补丁是:
diff-uprNlinux-4.7.3-hardened/arch/x86/xen/apic.clinux-4.7.3-hardened.good/arch/x86/xen/apic.c
---linux-4.7.3-hardened/arch/x86/xen/apic.c2016-07-2419:23:50.000000000+0000
+++linux-4.7.3-hardened.good/arch/x86/xen/apic.c2016-09-1020:05:21.450647009+0000
@@-7,7+7,6@@
#include<xen/xen.h>
#include<xen/interface/physdev.h>
#include"xen-ops.h"
-#include"pmu.h"
#include"smp.h"
staticunsignedintxen_io_apic_read(unsignedapic,unsignedreg)
@@-73,10+72,8@@staticu32xen_apic_read(u32reg)
staticvoidxen_apic_write(u32reg,u32val)
{
-if(reg==APIC_LVTPC){
-(void)pmu_apic_update(reg);
+if(reg==APIC_LVTPC)
return;
-}
/*Warntoseeifthere'sanystrayreferences*/
WARN(1,"register:%x,value:%x\n",reg,val);
diff-uprNlinux-4.7.3-hardened/arch/x86/xen/enlighten.clinux-4.7.3-hardened.good/arch/x86/xen/enlighten.c
---linux-4.7.3-hardened/arch/x86/xen/enlighten.c2016-09-1019:59:29.237313676+0000
+++linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c2016-09-1020:06:49.683980342+0000
@@-1031,9+1031,6@@staticu64xen_read_msr_safe(unsignedin
{
u64val;
-if(pmu_msr_read(msr,&val,err))
-returnval;
-
val=native_read_msr_safe(msr,err);
switch(msr){
caseMSR_IA32_APICBASE:
@@-1081,13+1078,17@@staticintxen_write_msr_safe(unsignedi
break;
default:
-if(!pmu_msr_write(msr,low,high,&ret))
-ret=native_write_msr_safe(msr,low,high);
+ret=native_write_msr_safe(msr,low,high);
}
returnret;
}
+unsignedlonglongxen_read_pmc(intcounter)
+{
+return0;
+}
+
staticu64xen_read_msr(unsignedintmsr)
{
/*
diff-uprNlinux-4.7.3-hardened/arch/x86/xen/pmu.clinux-4.7.3-hardened.good/arch/x86/xen/pmu.c
---linux-4.7.3-hardened/arch/x86/xen/pmu.c2016-07-2419:23:50.000000000+0000
+++linux-4.7.3-hardened.good/arch/x86/xen/pmu.c2016-09-1020:05:21.450647009+0000
@@-13,20+13,11@@
/*x86_pmu.handle_irqdefinition*/
#include"../events/perf_event.h"
-#defineXENPMU_IRQ_PROCESSING1
-structxenpmu{
-/*Sharedpagebetweenhypervisoranddomain*/
-structxen_pmu_data*xenpmu_data;
-uint8_tflags;
-};
-staticDEFINE_PER_CPU(structxenpmu,xenpmu_shared);
-#defineget_xenpmu_data()(this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
-#defineget_xenpmu_flags()(this_cpu_ptr(&xenpmu_shared)->flags)
-
-/*MacroforcomputingaddressofaPMUMSRbank*/
-#definefield_offset(ctxt,field)((void*)((uintptr_t)ctxt+\
-(uintptr_t)ctxt->field))
+/*Sharedpagebetweenhypervisoranddomain*/
+staticDEFINE_PER_CPU(structxen_pmu_data*,xenpmu_shared);
+#defineget_xenpmu_data()per_cpu(xenpmu_shared,smp_processor_id())
+
/*AMDPMU*/
#defineF15H_NUM_COUNTERS6
@@-60,8+51,6@@static__read_mostlyintamd_num_counter
/*Aliasregisters(0x4c1)forfull-widthwritestoPMCs*/
#defineMSR_PMC_ALIAS_MASK(~(MSR_IA32_PERFCTR0^MSR_IA32_PMC0))
-#defineINTEL_PMC_TYPE_SHIFT30
-
static__read_mostlyintintel_num_arch_counters,intel_num_fixed_counters;
@@-178,232+167,6@@staticintis_intel_pmu_msr(u32msr_inde
}
}
-staticboolxen_intel_pmu_emulate(unsignedintmsr,u64*val,inttype,
-intindex,boolis_read)
-{
-uint64_t*reg=NULL;
-structxen_pmu_intel_ctxt*ctxt;
-uint64_t*fix_counters;
-structxen_pmu_cntr_pair*arch_cntr_pair;
-structxen_pmu_data*xenpmu_data=get_xenpmu_data();
-uint8_txenpmu_flags=get_xenpmu_flags();
-
-
-if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING))
-returnfalse;
-
-ctxt=&xenpmu_data->pmu.c.intel;
-
-switch(msr){
-caseMSR_CORE_PERF_GLOBAL_OVF_CTRL:
-reg=&ctxt->global_ovf_ctrl;
-break;
-caseMSR_CORE_PERF_GLOBAL_STATUS:
-reg=&ctxt->global_status;
-break;
-caseMSR_CORE_PERF_GLOBAL_CTRL:
-reg=&ctxt->global_ctrl;
-break;
-caseMSR_CORE_PERF_FIXED_CTR_CTRL:
-reg=&ctxt->fixed_ctrl;
-break;
-default:
-switch(type){
-caseMSR_TYPE_COUNTER:
-fix_counters=field_offset(ctxt,fixed_counters);
-reg=&fix_counters[index];
-break;
-caseMSR_TYPE_ARCH_COUNTER:
-arch_cntr_pair=field_offset(ctxt,arch_counters);
-reg=&arch_cntr_pair[index].counter;
-break;
-caseMSR_TYPE_ARCH_CTRL:
-arch_cntr_pair=field_offset(ctxt,arch_counters);
-reg=&arch_cntr_pair[index].control;
-break;
-default:
-returnfalse;
-}
-}
-
-if(reg){
-if(is_read)
-*val=*reg;
-else{
-*reg=*val;
-
-if(msr==MSR_CORE_PERF_GLOBAL_OVF_CTRL)
-ctxt->global_status&=(~(*val));
-}
-returntrue;
-}
-
-returnfalse;
-}
-
-staticboolxen_amd_pmu_emulate(unsignedintmsr,u64*val,boolis_read)
-{
-uint64_t*reg=NULL;
-inti,off=0;
-structxen_pmu_amd_ctxt*ctxt;
-uint64_t*counter_regs,*ctrl_regs;
-structxen_pmu_data*xenpmu_data=get_xenpmu_data();
-uint8_txenpmu_flags=get_xenpmu_flags();
-
-if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING))
-returnfalse;
-
-if(k7_counters_mirrored&&
-((msr>=MSR_K7_EVNTSEL0)&&(msr<=MSR_K7_PERFCTR3)))
-msr=get_fam15h_addr(msr);
-
-ctxt=&xenpmu_data->pmu.c.amd;
-for(i=0;i<amd_num_counters;i++){
-if(msr==amd_ctrls_base+off){
-ctrl_regs=field_offset(ctxt,ctrls);
-reg=&ctrl_regs[i];
-break;
-}elseif(msr==amd_counters_base+off){
-counter_regs=field_offset(ctxt,counters);
-reg=&counter_regs[i];
-break;
-}
-off+=amd_msr_step;
-}
-
-if(reg){
-if(is_read)
-*val=*reg;
-else
-*reg=*val;
-
-returntrue;
-}
-returnfalse;
-}
-
-boolpmu_msr_read(unsignedintmsr,uint64_t*val,int*err)
-{
-if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD){
-if(is_amd_pmu_msr(msr)){
-if(!xen_amd_pmu_emulate(msr,val,1))
-*val=native_read_msr_safe(msr,err);
-returntrue;
-}
-}else{
-inttype,index;
-
-if(is_intel_pmu_msr(msr,&type,&index)){
-if(!xen_intel_pmu_emulate(msr,val,type,index,1))
-*val=native_read_msr_safe(msr,err);
-returntrue;
-}
-}
-
-returnfalse;
-}
-
-boolpmu_msr_write(unsignedintmsr,uint32_tlow,uint32_thigh,int*err)
-{
-uint64_tval=((uint64_t)high<<32)|low;
-
-if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD){
-if(is_amd_pmu_msr(msr)){
-if(!xen_amd_pmu_emulate(msr,&val,0))
-*err=native_write_msr_safe(msr,low,high);
-returntrue;
-}
-}else{
-inttype,index;
-
-if(is_intel_pmu_msr(msr,&type,&index)){
-if(!xen_intel_pmu_emulate(msr,&val,type,index,0))
-*err=native_write_msr_safe(msr,low,high);
-returntrue;
-}
-}
-
-returnfalse;
-}
-
-staticunsignedlonglongxen_amd_read_pmc(intcounter)
-{
-structxen_pmu_amd_ctxt*ctxt;
-uint64_t*counter_regs;
-structxen_pmu_data*xenpmu_data=get_xenpmu_data();
-uint8_txenpmu_flags=get_xenpmu_flags();
-
-if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)){
-uint32_tmsr;
-interr;
-
-msr=amd_counters_base+(counter*amd_msr_step);
-returnnative_read_msr_safe(msr,&err);
-}
-
-ctxt=&xenpmu_data->pmu.c.amd;
-counter_regs=field_offset(ctxt,counters);
-returncounter_regs[counter];
-}
-
-staticunsignedlonglongxen_intel_read_pmc(intcounter)
-{
-structxen_pmu_intel_ctxt*ctxt;
-uint64_t*fixed_counters;
-structxen_pmu_cntr_pair*arch_cntr_pair;
-structxen_pmu_data*xenpmu_data=get_xenpmu_data();
-uint8_txenpmu_flags=get_xenpmu_flags();
-
-if(!xenpmu_data||!(xenpmu_flags&XENPMU_IRQ_PROCESSING)){
-uint32_tmsr;
-interr;
-
-if(counter&(1<<INTEL_PMC_TYPE_SHIFT))
-msr=MSR_CORE_PERF_FIXED_CTR0+(counter&0xffff);
-else
-msr=MSR_IA32_PERFCTR0+counter;
-
-returnnative_read_msr_safe(msr,&err);
-}
-
-ctxt=&xenpmu_data->pmu.c.intel;
-if(counter&(1<<INTEL_PMC_TYPE_SHIFT)){
-fixed_counters=field_offset(ctxt,fixed_counters);
-returnfixed_counters[counter&0xffff];
-}
-
-arch_cntr_pair=field_offset(ctxt,arch_counters);
-returnarch_cntr_pair[counter].counter;
-}
-
-unsignedlonglongxen_read_pmc(intcounter)
-{
-if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD)
-returnxen_amd_read_pmc(counter);
-else
-returnxen_intel_read_pmc(counter);
-}
-
-intpmu_apic_update(uint32_tval)
-{
-intret;
-structxen_pmu_data*xenpmu_data=get_xenpmu_data();
-
-if(!xenpmu_data){
-pr_warn_once("%s:pmudatanotinitialized\n",__func__);
-return-EINVAL;
-}
-
-xenpmu_data->pmu.l.lapic_lvtpc=val;
-
-if(get_xenpmu_flags()&XENPMU_IRQ_PROCESSING)
-return0;
-
-ret=HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set,NULL);
-
-returnret;
-}
-
/*perfcallbacks*/
staticintxen_is_in_guest(void)
{
@@-476,37+239,26@@staticvoidxen_convert_regs(conststruc
irqreturn_txen_pmu_irq_handler(intirq,void*dev_id)
{
-interr,ret=IRQ_NONE;
+intret=IRQ_NONE;
structpt_regsregs;
conststructxen_pmu_data*xenpmu_data=get_xenpmu_data();
-uint8_txenpmu_flags=get_xenpmu_flags();
if(!xenpmu_data){
pr_warn_once("%s:pmudatanotinitialized\n",__func__);
returnret;
}
-this_cpu_ptr(&xenpmu_shared)->flags=
-xenpmu_flags|XENPMU_IRQ_PROCESSING;
xen_convert_regs(&xenpmu_data->pmu.r.regs,&regs,
xenpmu_data->pmu.pmu_flags);
if(x86_pmu.handle_irq(&regs))
ret=IRQ_HANDLED;
-/*WriteoutcachedcontexttoHW*/
-err=HYPERVISOR_xenpmu_op(XENPMU_flush,NULL);
-this_cpu_ptr(&xenpmu_shared)->flags=xenpmu_flags;
-if(err){
-pr_warn_once("%s:failedhypercall,err:%d\n",__func__,err);
-returnIRQ_NONE;
-}
-
returnret;
}
boolis_xen_pmu(intcpu)
{
-return(get_xenpmu_data()!=NULL);
+return(per_cpu(xenpmu_shared,cpu)!=NULL);
}
voidxen_pmu_init(intcpu)
@@-536,8+288,7@@voidxen_pmu_init(intcpu)
if(err)
gotofail;
-per_cpu(xenpmu_shared,cpu).xenpmu_data=xenpmu_data;
-per_cpu(xenpmu_shared,cpu).flags=0;
+per_cpu(xenpmu_shared,cpu)=xenpmu_data;
if(cpu==0){
perf_register_guest_info_callbacks(&xen_guest_cbs);
@@-565,6+316,6@@voidxen_pmu_finish(intcpu)
(void)HYPERVISOR_xenpmu_op(XENPMU_finish,&xp);
-free_pages((unsignedlong)per_cpu(xenpmu_shared,cpu).xenpmu_data,0);
-per_cpu(xenpmu_shared,cpu).xenpmu_data=NULL;
+free_pages((unsignedlong)per_cpu(xenpmu_shared,cpu),0);
+per_cpu(xenpmu_shared,cpu)=NULL;
}
diff-uprNlinux-4.7.3-hardened/arch/x86/xen/pmu.hlinux-4.7.3-hardened.good/arch/x86/xen/pmu.h
---linux-4.7.3-hardened/arch/x86/xen/pmu.h2016-07-2419:23:50.000000000+0000
+++linux-4.7.3-hardened.good/arch/x86/xen/pmu.h2016-09-1020:05:21.453980342+0000
@@-7,9+7,5@@irqreturn_txen_pmu_irq_handler(intirq,
voidxen_pmu_init(intcpu);
voidxen_pmu_finish(intcpu);
boolis_xen_pmu(intcpu);
-boolpmu_msr_read(unsignedintmsr,uint64_t*val,int*err);
-boolpmu_msr_write(unsignedintmsr,uint32_tlow,uint32_thigh,int*err);
-intpmu_apic_update(uint32_treg);
-unsignedlonglongxen_read_pmc(intcounter);
#endif/*__XEN_PMU_H*/
打好补丁再编译内核,被智子锁定版本的内核果然升级成功了。
$ uname -r
4.7.3-hardened
更新:官方已在grsecurity-3.1-4.7.4-201609152234.patch中修复问题,不再需要此workaround。