untrusted comment: signature from openbsd 6.2 base secret key
RWRVWzAMgtyg7h6Z/ES+ftCrC3y4jz05b9Q4N4uIZDqQEzb7lw6vB6BGumpp3us1ydI/8HGsYSlzPUl7ai/pMISPf6LswZDJZAI=

OpenBSD 6.2 errata 017, June 24, 2018:

Intel CPUs speculatively access FPU registers even when the FPU is
disabled, so data (including AES keys) from previous contexts could be
discovered when using the lazy-save approach.  Switch to the eager-save
approach.

Apply by doing:
    signify -Vep /etc/signify/openbsd-62-base.pub -x 017_intelfpu.patch.sig \
        -m - | (cd /usr/src && patch -p0)

And then rebuild and install the kernel:
    KK=`sysctl -n kern.osversion | cut -d# -f1`
    cd /usr/src/sys/arch/`machine`/compile/$KK
    make obj
    make config
    make
    make install

Index: sys/arch/amd64/amd64/acpi_machdep.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/acpi_machdep.c,v retrieving revision 1.78 diff -u -p -r1.78 acpi_machdep.c --- sys/arch/amd64/amd64/acpi_machdep.c 27 Mar 2017 18:32:53 -0000 1.78 +++ sys/arch/amd64/amd64/acpi_machdep.c 21 Jun 2018 11:54:01 -0000 @@ -389,7 +389,7 @@ acpi_sleep_cpu(struct acpi_softc *sc, in */ if (acpi_savecpu()) { /* Suspend path */ - fpusave_cpu(curcpu(), 1); + KASSERT((curcpu()->ci_flags & CPUF_USERXSTATE) == 0); wbinvd(); #ifdef HIBERNATE @@ -416,6 +416,7 @@ acpi_sleep_cpu(struct acpi_softc *sc, in return (ECANCELED); } /* Resume path */ + fpureset(); /* Reset the vectors */ sc->sc_facs->wakeup_vector = 0; Index: sys/arch/amd64/amd64/acpi_wakecode.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/acpi_wakecode.S,v retrieving revision 1.41 diff -u -p -r1.41 acpi_wakecode.S --- sys/arch/amd64/amd64/acpi_wakecode.S 30 Aug 2017 23:40:22 -0000 1.41 +++ sys/arch/amd64/amd64/acpi_wakecode.S 21 Jun 2018 11:54:01 -0000 @@ -217,7 +217,7 @@ _C_LABEL(acpi_protected_mode_resume): /* Reenable paging by setting the appropriate bits in CR0 */ movl %cr0,%eax - orl $(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_MP|CR0_WP),%eax + orl $CR0_DEFAULT,%eax movl %eax,%cr0 /* Flush the prefetch queue again */ Index: sys/arch/amd64/amd64/aesni.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/aesni.c,v retrieving revision 1.42 diff -u -p -r1.42 aesni.c --- sys/arch/amd64/amd64/aesni.c 8 Sep 2017 05:36:51 -0000 1.42 +++ sys/arch/amd64/amd64/aesni.c 21 Jun 2018 11:54:01 -0000 @@ -256,7 +256,9 @@ aesni_newsession(u_int32_t *sidp, struct bzero(ses->ses_ghash->Z, GMAC_BLOCK_LEN); /* prepare a hash subkey */ + fpu_kernel_enter(); aesni_enc(ses, ses->ses_ghash->H, ses->ses_ghash->H); + fpu_kernel_exit(); break; case CRYPTO_MD5_HMAC: Index: sys/arch/amd64/amd64/autoconf.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/autoconf.c,v retrieving revision 1.49 diff -u -p -r1.49 autoconf.c --- sys/arch/amd64/amd64/autoconf.c 20 Jun 2017 21:05:46 -0000 1.49 +++ sys/arch/amd64/amd64/autoconf.c 21 Jun 2018 11:54:01 -0000 @@ -138,10 +138,6 @@ cpu_configure(void) unmap_startup(); -#ifdef MULTIPROCESSOR - cpu_init_idle_pcbs(); -#endif - lcr8(0); spl0(); cold = 0; Index: sys/arch/amd64/amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.107.2.1 diff -u -p -r1.107.2.1 cpu.c --- sys/arch/amd64/amd64/cpu.c 26 Feb 2018 12:29:48 -0000 1.107.2.1 +++ sys/arch/amd64/amd64/cpu.c 21 Jun 2018 11:54:01 -0000 @@ -70,6 +70,7 @@ #include "pvbus.h" #include +#include
#include #include #include @@ -77,6 +78,7 @@ #include #include #include +#include #include @@ -409,7 +411,6 @@ cpu_attach(struct device *parent, struct pcb->pcb_kstack = kstack + USPACE - 16; pcb->pcb_rbp = pcb->pcb_rsp = kstack + USPACE - 16; pcb->pcb_pmap = pmap_kernel(); - pcb->pcb_cr0 = rcr0(); pcb->pcb_cr3 = pcb->pcb_pmap->pm_pdirpa; #endif @@ -491,6 +492,28 @@ cpu_attach(struct device *parent, struct #endif /* NVMM > 0 */ } +static void +replacexsave(void) +{ + extern long _xrstor, _xsave, _xsaveopt; + u_int32_t eax, ebx, ecx, edx; + static int replacedone = 0; + int s; + + if (replacedone) + return; + replacedone = 1; + + /* find out whether xsaveopt is supported */ + CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx); + s = splhigh(); + codepatch_replace(CPTAG_XRSTOR, &_xrstor, 4); + codepatch_replace(CPTAG_XSAVE, + (eax & XSAVE_XSAVEOPT) ? &_xsaveopt : &_xsave, 4); + splx(s); +} + + /* * Initialize the processor appropriately. */ @@ -498,6 +521,7 @@ cpu_attach(struct device *parent, struct void cpu_init(struct cpu_info *ci) { + struct savefpu *sfp; u_int cr4; /* configure the CPU if needed */ @@ -509,7 +533,6 @@ cpu_init(struct cpu_info *ci) */ patinit(ci); - lcr0(rcr0() | CR0_WP); cr4 = rcr4() | CR4_DEFAULT; if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP) cr4 |= CR4_SMEP; @@ -519,7 +542,7 @@ cpu_init(struct cpu_info *ci) cr4 |= CR4_FSGSBASE; if (ci->ci_feature_sefflags_ecx & SEFF0ECX_UMIP) cr4 |= CR4_UMIP; - if (cpu_ecxfeature & CPUIDECX_XSAVE) + if ((cpu_ecxfeature & CPUIDECX_XSAVE) && cpuid_level >= 0xd) cr4 |= CR4_OSXSAVE; lcr4(cr4); @@ -532,9 +555,25 @@ cpu_init(struct cpu_info *ci) xsave_mask |= XCR0_AVX; xsetbv(0, xsave_mask); CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx); - fpu_save_len = ebx; + if (CPU_IS_PRIMARY(ci)) { + fpu_save_len = ebx; + KASSERT(fpu_save_len <= sizeof(struct savefpu)); + } else { + KASSERT(ebx == fpu_save_len); + } + + replacexsave(); } + /* Give proc0 a clean FPU save area */ + sfp = &proc0.p_addr->u_pcb.pcb_savefpu; + memset(sfp, 0, fpu_save_len); + if (xsave_mask) { + /* must not use xsaveopt here */ + xsave(sfp, xsave_mask); + } else + fxsave(sfp); + #if NVMM > 0 /* Re-enable VMM if needed */ if (ci->ci_flags & CPUF_VMM) @@ -602,24 +641,6 @@ cpu_boot_secondary_processors(void) } void -cpu_init_idle_pcbs(void) -{ - struct cpu_info *ci; - u_long i; - - for (i=0; i < MAXCPUS; i++) { - ci = cpu_info[i]; - if (ci == NULL) - continue; - if (ci->ci_idle_pcb == NULL) - continue; - if ((ci->ci_flags & CPUF_PRESENT) == 0) - continue; - x86_64_init_pcb_tss_ldt(ci); - } -} - -void cpu_start_secondary(struct cpu_info *ci) { int i; @@ -738,7 +759,6 @@ cpu_hatch(void *v) panic("%s: already running!?", ci->ci_dev->dv_xname); #endif - lcr0(ci->ci_idle_pcb->pcb_cr0); cpu_init_idt(); lapic_set_lvt(); gdt_init_cpu(ci); @@ -780,15 +800,14 @@ cpu_debug_dump(void) struct cpu_info *ci; CPU_INFO_ITERATOR cii; - db_printf("addr dev id flags ipis curproc fpcurproc\n"); + db_printf("addr dev id flags ipis curproc\n"); CPU_INFO_FOREACH(cii, ci) { - db_printf("%p %s %u %x %x %10p %10p\n", + db_printf("%p %s %u %x %x %10p\n", ci, ci->ci_dev == NULL ? 
"BOOT" : ci->ci_dev->dv_xname, ci->ci_cpuid, ci->ci_flags, ci->ci_ipis, - ci->ci_curproc, - ci->ci_fpcurproc); + ci->ci_curproc); } } #endif Index: sys/arch/amd64/amd64/db_interface.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/db_interface.c,v retrieving revision 1.29 diff -u -p -r1.29 db_interface.c --- sys/arch/amd64/amd64/db_interface.c 19 Jul 2017 14:34:10 -0000 1.29 +++ sys/arch/amd64/amd64/db_interface.c 21 Jun 2018 11:54:01 -0000 @@ -66,8 +66,8 @@ #endif extern label_t *db_recover; -extern char *trap_type[]; -extern int trap_types; +extern const char * const trap_type[]; +extern const int trap_types; #ifdef MULTIPROCESSOR struct mutex ddb_mp_mutex = Index: sys/arch/amd64/amd64/fpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/fpu.c,v retrieving revision 1.37 diff -u -p -r1.37 fpu.c --- sys/arch/amd64/amd64/fpu.c 4 Oct 2017 02:10:33 -0000 1.37 +++ sys/arch/amd64/amd64/fpu.c 21 Jun 2018 11:54:01 -0000 @@ -53,35 +53,13 @@ #include #include -#include - -int xrstor_user(struct savefpu *_addr, uint64_t _mask); void trap(struct trapframe *); /* - * We do lazy initialization and switching using the TS bit in cr0 and the - * MDP_USEDFPU bit in mdproc. - * - * DNA exceptions are handled like this: - * - * 1) If there is no FPU, return and go to the emulator. - * 2) If someone else has used the FPU, save its state into that process' PCB. - * 3a) If MDP_USEDFPU is not set, set it and initialize the FPU. - * 3b) Otherwise, reload the process' previous FPU state. - * - * When a process is created or exec()s, its saved cr0 image has the TS bit - * set and the MDP_USEDFPU bit clear. The MDP_USEDFPU bit is set when the - * process first gets a DNA and the FPU is initialized. The TS bit is turned - * off when the FPU is used, and turned on again later when the process' FPU - * state is saved. - */ - -/* * The mask of enabled XSAVE features. */ uint64_t xsave_mask; -void fpudna(struct cpu_info *, struct trapframe *); static int x86fpflags_to_siginfo(u_int32_t); /* @@ -101,7 +79,6 @@ uint32_t fpu_mxcsr_mask; void fpuinit(struct cpu_info *ci) { - lcr0(rcr0() & ~(CR0_EM|CR0_TS)); fninit(); if (fpu_mxcsr_mask == 0) { struct fxsave64 fx __attribute__((aligned(16))); @@ -113,7 +90,6 @@ fpuinit(struct cpu_info *ci) else fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__; } - lcr0(rcr0() | (CR0_TS)); } /* @@ -126,23 +102,18 @@ fpuinit(struct cpu_info *ci) void fputrap(struct trapframe *frame) { - struct proc *p = curcpu()->ci_fpcurproc; + struct cpu_info *ci = curcpu(); + struct proc *p = curproc; struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu; u_int32_t mxcsr, statbits; u_int16_t cw; int code; union sigval sv; -#ifdef DIAGNOSTIC - /* - * At this point, fpcurproc should be curproc. If it wasn't, - * the TS bit should be set, and we should have gotten a DNA exception. - */ - if (p != curproc) - panic("fputrap: wrong proc"); -#endif + KASSERT(ci->ci_flags & CPUF_USERXSTATE); + ci->ci_flags &= ~CPUF_USERXSTATE; + fpusavereset(sfp); - fxsave(sfp); if (frame->tf_trapno == T_XMM) { mxcsr = sfp->fp_fxsave.fx_mxcsr; statbits = mxcsr; @@ -187,211 +158,21 @@ x86fpflags_to_siginfo(u_int32_t flags) return (FPE_FLTINV); } -/* - * Implement device not available (DNA) exception - * - * If we were the last process to use the FPU, we can simply return. - * Otherwise, we save the previous state, if necessary, and restore our last - * saved state. 
- */ -void -fpudna(struct cpu_info *ci, struct trapframe *frame) -{ - struct savefpu *sfp; - struct proc *p; - int s; - - if (ci->ci_fpsaving) { - printf("recursive fpu trap; cr0=%x\n", rcr0()); - return; - } - - s = splipi(); - -#ifdef MULTIPROCESSOR - p = ci->ci_curproc; -#else - p = curproc; -#endif - - /* - * Initialize the FPU state to clear any exceptions. If someone else - * was using the FPU, save their state. - */ - if (ci->ci_fpcurproc != NULL && ci->ci_fpcurproc != p) { - fpusave_cpu(ci, ci->ci_fpcurproc != &proc0); - uvmexp.fpswtch++; - } - splx(s); - - if (p == NULL) { - clts(); - return; - } - - KDASSERT(ci->ci_fpcurproc == NULL); -#ifndef MULTIPROCESSOR - KDASSERT(p->p_addr->u_pcb.pcb_fpcpu == NULL); -#else - if (p->p_addr->u_pcb.pcb_fpcpu != NULL) - fpusave_proc(p, 1); -#endif - - p->p_addr->u_pcb.pcb_cr0 &= ~CR0_TS; - clts(); - - s = splipi(); - ci->ci_fpcurproc = p; - p->p_addr->u_pcb.pcb_fpcpu = ci; - splx(s); - - sfp = &p->p_addr->u_pcb.pcb_savefpu; - - if ((p->p_md.md_flags & MDP_USEDFPU) == 0) { - fninit(); - bzero(&sfp->fp_fxsave, sizeof(sfp->fp_fxsave)); - sfp->fp_fxsave.fx_fcw = __INITIAL_NPXCW__; - sfp->fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__; - fxrstor(&sfp->fp_fxsave); - p->p_md.md_flags |= MDP_USEDFPU; - } else { - if (xsave_mask) { - if (xrstor_user(sfp, xsave_mask)) { - fpusave_proc(p, 0); /* faulted */ - frame->tf_trapno = T_PROTFLT; - trap(frame); - return; - } - } else { - static double zero = 0.0; - - /* - * amd fpu does not restore fip, fdp, fop on fxrstor - * thus leaking other process's execution history. - */ - fnclex(); - __asm volatile("ffree %%st(7)\n\tfldl %0" : : "m" (zero)); - fxrstor(sfp); - } - } -} - - -void -fpusave_cpu(struct cpu_info *ci, int save) -{ - struct proc *p; - int s; - - KDASSERT(ci == curcpu()); - - p = ci->ci_fpcurproc; - if (p == NULL) - return; - - if (save) { -#ifdef DIAGNOSTIC - if (ci->ci_fpsaving != 0) - panic("fpusave_cpu: recursive save!"); -#endif - /* - * Set ci->ci_fpsaving, so that any pending exception will be - * thrown away. (It will be caught again if/when the FPU - * state is restored.) - */ - clts(); - ci->ci_fpsaving = 1; - if (xsave_mask) - xsave(&p->p_addr->u_pcb.pcb_savefpu, xsave_mask); - else - fxsave(&p->p_addr->u_pcb.pcb_savefpu); - ci->ci_fpsaving = 0; - } - - stts(); - p->p_addr->u_pcb.pcb_cr0 |= CR0_TS; - - s = splipi(); - p->p_addr->u_pcb.pcb_fpcpu = NULL; - ci->ci_fpcurproc = NULL; - splx(s); -} - -/* - * Save p's FPU state, which may be on this processor or another processor. - */ -void -fpusave_proc(struct proc *p, int save) -{ - struct cpu_info *ci = curcpu(); - struct cpu_info *oci; - - KDASSERT(p->p_addr != NULL); - - oci = p->p_addr->u_pcb.pcb_fpcpu; - if (oci == NULL) - return; - -#if defined(MULTIPROCESSOR) - if (oci == ci) { - int s = splipi(); - fpusave_cpu(ci, save); - splx(s); - } else { - oci->ci_fpsaveproc = p; - x86_send_ipi(oci, - save ? X86_IPI_SYNCH_FPU : X86_IPI_FLUSH_FPU); - while (p->p_addr->u_pcb.pcb_fpcpu != NULL) - CPU_BUSY_CYCLE(); - } -#else - KASSERT(ci->ci_fpcurproc == p); - fpusave_cpu(ci, save); -#endif -} - void fpu_kernel_enter(void) { - struct cpu_info *ci = curcpu(); - uint32_t cw; - int s; - - /* - * Fast path. If the kernel was using the FPU before, there - * is no work to do besides clearing TS. 
- */ - if (ci->ci_fpcurproc == &proc0) { - clts(); - return; - } - - s = splipi(); + struct cpu_info *ci = curcpu(); - if (ci->ci_fpcurproc != NULL) { - fpusave_cpu(ci, 1); - uvmexp.fpswtch++; + /* save curproc's FPU state if we haven't already */ + if (ci->ci_flags & CPUF_USERXSTATE) { + ci->ci_flags &= ~CPUF_USERXSTATE; + fpusavereset(&curproc->p_addr->u_pcb.pcb_savefpu); } - - /* Claim the FPU */ - ci->ci_fpcurproc = &proc0; - - splx(s); - - /* Disable DNA exceptions */ - clts(); - - /* Initialize the FPU */ - fninit(); - cw = __INITIAL_NPXCW__; - fldcw(&cw); - cw = __INITIAL_MXCSR__; - ldmxcsr(&cw); } void fpu_kernel_exit(void) { - /* Enable DNA exceptions */ - stts(); + /* make sure we don't leave anything in the registers */ + fpureset(); } Index: sys/arch/amd64/amd64/genassym.cf =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/genassym.cf,v retrieving revision 1.31.8.1 diff -u -p -r1.31.8.1 genassym.cf --- sys/arch/amd64/amd64/genassym.cf 26 Feb 2018 12:29:48 -0000 1.31.8.1 +++ sys/arch/amd64/amd64/genassym.cf 21 Jun 2018 11:54:01 -0000 @@ -94,9 +94,8 @@ member pcb_rbp member pcb_kstack member pcb_fsbase member pcb_onfault -member pcb_fpcpu member pcb_pmap -member pcb_cr0 +member pcb_savefpu struct pmap member pm_cpus @@ -131,7 +130,8 @@ member CPU_INFO_USER_CR3 ci_user_cr3 member CPU_INFO_KERN_RSP ci_kern_rsp member CPU_INFO_INTR_RSP ci_intr_rsp -export CPUF_USERSEGS_BIT +export CPUF_USERSEGS +export CPUF_USERXSTATE struct intrsource member is_recurse Index: sys/arch/amd64/amd64/identcpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.87.2.1 diff -u -p -r1.87.2.1 identcpu.c --- sys/arch/amd64/amd64/identcpu.c 26 Feb 2018 12:29:48 -0000 1.87.2.1 +++ sys/arch/amd64/amd64/identcpu.c 21 Jun 2018 11:54:01 -0000 @@ -217,6 +217,11 @@ const struct { { CPUIDEDX_ITSC, "ITSC" }, }, cpu_amdspec_ebxfeatures[] = { { CPUIDEBX_IBPB, "IBPB" }, +}, cpu_xsave_extfeatures[] = { + { XSAVE_XSAVEOPT, "XSAVEOPT" }, + { XSAVE_XSAVEC, "XSAVEC" }, + { XSAVE_XGETBV1, "XGETBV1" }, + { XSAVE_XSAVES, "XSAVES" }, }; int @@ -651,6 +656,14 @@ identifycpu(struct cpu_info *ci) printf(",%s", cpu_amdspec_ebxfeatures[i].str); } + } + + /* xsave subfeatures */ + if (cpuid_level >= 0xd) { + CPUID_LEAF(0xd, 1, val, dummy, dummy, dummy); + for (i = 0; i < nitems(cpu_xsave_extfeatures); i++) + if (val & cpu_xsave_extfeatures[i].bit) + printf(",%s", cpu_xsave_extfeatures[i].str); } if (cpu_meltdown) Index: sys/arch/amd64/amd64/ipifuncs.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipifuncs.c,v retrieving revision 1.28 diff -u -p -r1.28 ipifuncs.c --- sys/arch/amd64/amd64/ipifuncs.c 23 Nov 2015 22:57:12 -0000 1.28 +++ sys/arch/amd64/amd64/ipifuncs.c 21 Jun 2018 11:54:01 -0000 @@ -62,9 +62,6 @@ void x86_64_ipi_nop(struct cpu_info *); void x86_64_ipi_halt(struct cpu_info *); -void x86_64_ipi_synch_fpu(struct cpu_info *); -void x86_64_ipi_flush_fpu(struct cpu_info *); - #if NVMM > 0 void x86_64_ipi_start_vmm(struct cpu_info *); void x86_64_ipi_stop_vmm(struct cpu_info *); @@ -85,8 +82,8 @@ void (*ipifunc[X86_NIPI])(struct cpu_inf { x86_64_ipi_halt, x86_64_ipi_nop, - x86_64_ipi_flush_fpu, - x86_64_ipi_synch_fpu, + NULL, + NULL, NULL, x86_64_ipi_reload_mtrr, x86_setperf_ipi, @@ -115,7 +112,6 @@ x86_64_ipi_halt(struct cpu_info *ci) SCHED_ASSERT_UNLOCKED(); KASSERT(!__mp_lock_held(&kernel_lock)); - fpusave_cpu(ci, 1); 
disable_intr(); lapic_disable(); wbinvd(); @@ -125,20 +121,6 @@ x86_64_ipi_halt(struct cpu_info *ci) for(;;) { __asm volatile("hlt"); } -} - -void -x86_64_ipi_flush_fpu(struct cpu_info *ci) -{ - if (ci->ci_fpsaveproc == ci->ci_fpcurproc) - fpusave_cpu(ci, 0); -} - -void -x86_64_ipi_synch_fpu(struct cpu_info *ci) -{ - if (ci->ci_fpsaveproc == ci->ci_fpcurproc) - fpusave_cpu(ci, 1); } #ifdef MTRR Index: sys/arch/amd64/amd64/locore.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v retrieving revision 1.89.2.1 diff -u -p -r1.89.2.1 locore.S --- sys/arch/amd64/amd64/locore.S 26 Feb 2018 12:29:48 -0000 1.89.2.1 +++ sys/arch/amd64/amd64/locore.S 21 Jun 2018 11:54:01 -0000 @@ -113,10 +113,11 @@ #include #include +#include #include #include #include -#include +#include /* T_PROTFLT */ #include #if NLAPIC > 0 @@ -345,7 +346,12 @@ ENTRY(cpu_switchto) movb $SONPROC,P_STAT(%r12) # p->p_stat = SONPROC SET_CURPROC(%r12,%rcx) - movl CPUVAR(CPUID),%edi + movl CPUVAR(CPUID),%r9d + + /* for the FPU/"extended CPU state" handling below */ + movq xsave_mask(%rip),%rdx + movl %edx,%eax + shrq $32,%rdx /* If old proc exited, don't bother. */ testq %r13,%r13 @@ -358,7 +364,7 @@ ENTRY(cpu_switchto) * %rax, %rcx - scratch * %r13 - old proc, then old pcb * %r12 - new proc - * %edi - cpuid + * %r9d - cpuid */ movq P_ADDR(%r13),%r13 @@ -366,16 +372,46 @@ ENTRY(cpu_switchto) /* clear the old pmap's bit for the cpu */ movq PCB_PMAP(%r13),%rcx lock - btrq %rdi,PM_CPUS(%rcx) + btrq %r9,PM_CPUS(%rcx) /* Save stack pointers. */ movq %rsp,PCB_RSP(%r13) movq %rbp,PCB_RBP(%r13) + /* + * If the old proc ran in userspace then save the + * floating-point/"extended state" registers + */ + testl $CPUF_USERXSTATE,CPUVAR(FLAGS) + jz .Lxstate_reset + + movq %r13, %rdi +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + CODEPATCH_START + .byte 0x48; fxsave (%rdi) /* really fxsave64 */ + CODEPATCH_END(CPTAG_XSAVE) + switch_exited: - /* did old proc run in userspace? then reset the segment regs */ - btrl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) - jnc restore_saved + /* now clear the xstate */ + movq proc0paddr(%rip),%rdi +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + CODEPATCH_START + .byte 0x48; fxrstor (%rdi) /* really fxrstor64 */ + CODEPATCH_END(CPTAG_XRSTOR) + andl $~CPUF_USERXSTATE,CPUVAR(FLAGS) + +.Lxstate_reset: + /* + * If the segment registers haven't been reset since the old proc + * ran in userspace then reset them now + */ + testl $CPUF_USERSEGS,CPUVAR(FLAGS) + jz restore_saved + andl $~CPUF_USERSEGS,CPUVAR(FLAGS) /* set %ds, %es, %fs, and %gs to expected value to prevent info leak */ movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax @@ -432,32 +468,17 @@ restore_saved: 0: /* set the new pmap's bit for the cpu */ - movl CPUVAR(CPUID),%edi lock - btsq %rdi,PM_CPUS(%rcx) + btsq %r9,PM_CPUS(%rcx) #ifdef DIAGNOSTIC jc _C_LABEL(switch_pmcpu_set) #endif switch_restored: - /* Restore cr0 (including FPU state). */ - movl PCB_CR0(%r13),%ecx -#ifdef MULTIPROCESSOR - movq PCB_FPCPU(%r13),%r8 - cmpq CPUVAR(SELF),%r8 - jz 1f - orl $CR0_TS,%ecx -1: -#endif - movq %rcx,%cr0 - SET_CURPCB(%r13) /* Interrupts are okay again. */ sti - -switch_return: - popq %r15 popq %r14 popq %r13 @@ -497,7 +518,7 @@ ENTRY(cpu_idle_leave) #ifdef DIAGNOSTIC NENTRY(switch_pmcpu_set) - movabsq $switch_active,%rdi + leaq switch_active(%rip),%rdi call _C_LABEL(panic) /* NOTREACHED */ @@ -529,7 +550,7 @@ IDTVEC(syscall) * %rip and the original rflags has been copied to %r11. 
%cs and * %ss have been updated to the kernel segments, but %rsp is still * the user-space value. - * First order of business is to swap to the kernel gs.base so that + * First order of business is to swap to the kernel GS.base so that * we can access our struct cpu_info and use the scratch space there * to switch to the kernel page tables (thank you, Intel), then * switch to our kernel stack. Once that's in place we can @@ -563,7 +584,7 @@ NENTRY(Xsyscall_untramp) movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */ movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp) movq %rcx,TF_RIP(%rsp) - movq $2,TF_ERR(%rsp) /* ignored */ + movq %rax,TF_ERR(%rsp) /* stash syscall # for SPL check */ movq CPUVAR(CURPROC),%r14 movq %rsp,P_MD_REGS(%r14) # save pointer to frame @@ -590,8 +611,17 @@ NENTRY(Xsyscall_untramp) /* Could registers have been changed that require an iretq? */ testl $MDP_IRET, P_MD_FLAGS(%r14) - jne intr_fast_exit + jne intr_user_exit_post_ast + + /* Restore FPU/"extended CPU state" if it's not already in the CPU */ + testl $CPUF_USERXSTATE,CPUVAR(FLAGS) + jz .Lsyscall_restore_xstate + + /* Restore FS.base if it's not already in the CPU */ + testl $CPUF_USERSEGS,CPUVAR(FLAGS) + jz .Lsyscall_restore_fsbase +.Lsyscall_restore_registers: movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_R8(%rsp),%r8 @@ -604,17 +634,6 @@ NENTRY(Xsyscall_untramp) movq TF_RBP(%rsp),%rbp movq TF_RBX(%rsp),%rbx - /* Restore FS.base if it's not already in the CPU */ - btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) - jc 99f - movq CPUVAR(CURPCB),%rdx - movq PCB_FSBASE(%rdx),%rax - movq %rax,%rdx - shrq $32,%rdx - movl $MSR_FSBASE,%ecx - wrmsr -99: - /* * We need to finish reading from the trapframe, then switch * to the user page tables, swapgs, and return. We need @@ -642,11 +661,42 @@ KUENTRY(syscall_trampback) sysretq .text + .align 16,0xcc + /* in this case, need FS.base but not xstate, rarely happens */ +.Lsyscall_restore_fsbase: /* CPU doesn't have curproc's FS.base */ + orl $CPUF_USERSEGS,CPUVAR(FLAGS) + movq CPUVAR(CURPCB),%rdi + jmp .Lsyscall_restore_fsbase_real + + .align 16,0xcc +.Lsyscall_restore_xstate: /* CPU doesn't have curproc's xstate */ + orl $(CPUF_USERXSTATE|CPUF_USERSEGS),CPUVAR(FLAGS) + movq CPUVAR(CURPCB),%rdi + movq xsave_mask(%rip),%rdx + movl %edx,%eax + shrq $32,%rdx +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + /* untouched state so can't fault */ + CODEPATCH_START + .byte 0x48; fxrstor (%rdi) /* really fxrstor64 */ + CODEPATCH_END(CPTAG_XRSTOR) +#if PCB_SAVEFPU != 0 + subq $PCB_SAVEFPU,%rdi +#endif +.Lsyscall_restore_fsbase_real: + movq PCB_FSBASE(%rdi),%rdx + movl %edx,%eax + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr + jmp .Lsyscall_restore_registers #ifdef DIAGNOSTIC .Lsyscall_spl_not_lowered: - movabsq $spl_lowered, %rdi - movl TF_RAX(%rsp),%esi + leaq spl_lowered(%rip), %rdi + movl TF_ERR(%rsp),%esi /* syscall # stashed above */ movl TF_RDI(%rsp),%edx movl %ebx,%ecx movl CPUVAR(ILEVEL),%r8d @@ -676,15 +726,54 @@ NENTRY(proc_trampoline) /* - * Return via iretq, for real interrupts and signal returns + * Returning to userspace via iretq. 
We do things in this order: + * - check for ASTs + * - restore FPU/"extended CPU state" if it's not already in the CPU + * - DIAGNOSTIC: no more C calls after this, so check the SPL + * - restore FS.base if it's not already in the CPU + * - restore most registers + * - update the iret frame from the trapframe + * - finish reading from the trapframe + * - switch to the trampoline stack \ + * - jump to the .kutext segment |-- Meltdown workaround + * - switch to the user page tables / + * - swapgs + * - iretq */ -NENTRY(intr_fast_exit) +NENTRY(intr_user_exit) #ifdef DIAGNOSTIC pushfq popq %rdx testq $PSL_I,%rdx - jnz .Lintr_exit_not_blocked + jnz .Lintr_user_exit_not_blocked +#endif /* DIAGNOSTIC */ + + /* Check for ASTs */ + CHECK_ASTPENDING(%r11) + je intr_user_exit_post_ast + CLEAR_ASTPENDING(%r11) + sti + movq %rsp,%rdi + call _C_LABEL(ast) + cli + jmp intr_user_exit + +intr_user_exit_post_ast: + /* Restore FPU/"extended CPU state" if it's not already in the CPU */ + testl $CPUF_USERXSTATE,CPUVAR(FLAGS) + jz .Lintr_restore_xstate + +#ifdef DIAGNOSTIC + /* no more C calls after this, so check the SPL */ + cmpl $0,CPUVAR(ILEVEL) + jne .Luser_spl_not_lowered #endif /* DIAGNOSTIC */ + + /* Restore FS.base if it's not already in the CPU */ + testl $CPUF_USERSEGS,CPUVAR(FLAGS) + jz .Lintr_restore_fsbase + +.Lintr_restore_registers: movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_R8(%rsp),%r8 @@ -697,30 +786,7 @@ NENTRY(intr_fast_exit) movq TF_RBP(%rsp),%rbp movq TF_RBX(%rsp),%rbx - testq $SEL_RPL,TF_CS(%rsp) - je intr_exit_recurse /* returning back to kernel? */ - - /* returning to userspace. XXX fix up iret frame here */ - - /* restore FS.base if it's not already in the CPU */ - btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) - jc 99f - movq CPUVAR(CURPCB),%rdx /* for below */ - movq PCB_FSBASE(%rdx),%rax - movq %rax,%rdx - shrq $32,%rdx - movl $MSR_FSBASE,%ecx - wrmsr -99: /* - * Returning to userspace. We need to go things in this order: - * - update the iret frame from the trapframe - * - finish reading from the trapframe - * - switch to the trampoline stack - * - jump to the .kutext segment - * - switch to the user page tables - * - swapgs - * - iretq * To get the final value for the register that was used * for the mov to %cr3, we need access to somewhere accessible * on the user page tables, so we save it in CPUVAR(SCRATCH) @@ -758,7 +824,101 @@ KUENTRY(iretq_tramp) _C_LABEL(doreti_iret): iretq -NENTRY(intr_exit_recurse) + .text + .align 16,0xcc +.Lintr_restore_xstate: /* CPU doesn't have curproc's xstate */ + orl $CPUF_USERXSTATE,CPUVAR(FLAGS) + movq CPUVAR(CURPCB),%rdi +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + movq xsave_mask(%rip),%rsi + call xrstor_user + testl %eax,%eax + jnz .Lintr_xrstor_faulted +.Lintr_restore_fsbase: /* CPU doesn't have curproc's FS.base */ + orl $CPUF_USERSEGS,CPUVAR(FLAGS) + movq CPUVAR(CURPCB),%rdx + movq PCB_FSBASE(%rdx),%rdx + movl %edx,%eax + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr + jmp .Lintr_restore_registers + +.Lintr_xrstor_faulted: + /* + * xrstor faulted; we need to reset the FPU state and call trap() + * to post a signal, which requires interrupts be enabled. 
+ */ + sti + movq proc0paddr(%rip),%rdi +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + CODEPATCH_START + .byte 0x48; fxrstor (%rdi) /* really fxrstor64 */ + CODEPATCH_END(CPTAG_XRSTOR) + movq $T_PROTFLT,TF_TRAPNO(%rsp) + jmp recall_trap + +#ifdef DIAGNOSTIC +.Lintr_user_exit_not_blocked: + movl warn_once(%rip),%edi + testl %edi,%edi + jnz 1f + incl %edi + movl %edi,warn_once(%rip) + leaq .Lnot_blocked(%rip),%rdi + call _C_LABEL(printf) +#ifdef DDB + int $3 +#endif /* DDB */ +1: cli + jmp intr_user_exit + +.Luser_spl_not_lowered: + sti + leaq intr_spl_lowered(%rip),%rdi + movl CPUVAR(ILEVEL),%esi + xorl %edx,%edx /* always SPL zero for userspace */ + xorl %eax,%eax + call _C_LABEL(printf) +#ifdef DDB + int $3 +#endif /* DDB */ + movl $0,CPUVAR(ILEVEL) + cli + jmp intr_user_exit + + .section .rodata +intr_spl_lowered: + .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n" + .text +#endif /* DIAGNOSTIC */ + + +/* + * Return to supervisor mode from trap or interrupt + */ +NENTRY(intr_fast_exit) +#ifdef DIAGNOSTIC + pushfq + popq %rdx + testq $PSL_I,%rdx + jnz .Lintr_exit_not_blocked +#endif /* DIAGNOSTIC */ + movq TF_RDI(%rsp),%rdi + movq TF_RSI(%rsp),%rsi + movq TF_R8(%rsp),%r8 + movq TF_R9(%rsp),%r9 + movq TF_R10(%rsp),%r10 + movq TF_R12(%rsp),%r12 + movq TF_R13(%rsp),%r13 + movq TF_R14(%rsp),%r14 + movq TF_R15(%rsp),%r15 + movq TF_RBP(%rsp),%rbp + movq TF_RBX(%rsp),%rbx movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx movq TF_R11(%rsp),%r11 @@ -813,7 +973,6 @@ NENTRY(intr_exit_recurse) #ifdef DIAGNOSTIC .Lintr_exit_not_blocked: - xchgw %bx, %bx movl warn_once(%rip),%edi testl %edi,%edi jnz 1f @@ -837,18 +996,71 @@ warn_once: .text #endif +/* + * FPU/"extended CPU state" handling + * int xrstor_user(sfp, mask) + * load given state, returns 0/1 if okay/it trapped + * void fpusave(sfp) + * save current state, but retain it in the FPU + * void fpusavereset(sfp) + * save current state and reset FPU to initial/kernel state + */ + ENTRY(xrstor_user) movq %rsi, %rdx movl %esi, %eax shrq $32, %rdx .globl xrstor_fault xrstor_fault: - xrstor (%rdi) + CODEPATCH_START + .byte 0x48; fxrstor (%rdi) /* really fxrstor64 */ + CODEPATCH_END(CPTAG_XRSTOR) xorl %eax, %eax ret -ENTRY(xrstor_resume) +NENTRY(xrstor_resume) movl $1, %eax ret +END(xrstor_user) + +ENTRY(fpusave) + movq xsave_mask(%rip),%rdx + movl %edx,%eax + shrq $32,%rdx + CODEPATCH_START + .byte 0x48; fxsave (%rdi) /* really fxsave64 */ + CODEPATCH_END(CPTAG_XSAVE) + ret +END(fpusave) + +ENTRY(fpusavereset) + movq xsave_mask(%rip),%rdx + movl %edx,%eax + shrq $32,%rdx + CODEPATCH_START + .byte 0x48; fxsave (%rdi) /* really fxsave64 */ + CODEPATCH_END(CPTAG_XSAVE) + movq proc0paddr(%rip),%rdi +#if PCB_SAVEFPU != 0 + addq $PCB_SAVEFPU,%rdi +#endif + CODEPATCH_START + .byte 0x48; fxrstor (%rdi) /* really fxrstor64 */ + CODEPATCH_END(CPTAG_XRSTOR) + ret +END(fpusavereset) + + .section .rodata + .globl _C_LABEL(_xrstor) +_C_LABEL(_xrstor): + .byte 0x48; xrstor (%rdi) /* really xrstor64 */ + + .globl _C_LABEL(_xsave) +_C_LABEL(_xsave): + .byte 0x48; xsave (%rdi) /* really xsave64 */ + + .globl _C_LABEL(_xsaveopt) +_C_LABEL(_xsaveopt): + .byte 0x48; xsaveopt (%rdi) /* really xsaveopt64 */ ENTRY(pagezero) movq $-PAGE_SIZE,%rdx Index: sys/arch/amd64/amd64/locore0.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/locore0.S,v retrieving revision 1.2.2.1 diff -u -p -r1.2.2.1 locore0.S --- sys/arch/amd64/amd64/locore0.S 26 Feb 2018 12:29:48 -0000 1.2.2.1 +++ sys/arch/amd64/amd64/locore0.S 21 
Jun 2018 11:54:01 -0000 @@ -601,7 +601,7 @@ write_efer: * 4. Enable paging and the rest of it. */ movl %cr0,%eax - orl $(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_MP|CR0_WP),%eax + orl $CR0_DEFAULT,%eax movl %eax,%cr0 jmp compat compat: Index: sys/arch/amd64/amd64/machdep.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v retrieving revision 1.231.2.1 diff -u -p -r1.231.2.1 machdep.c --- sys/arch/amd64/amd64/machdep.c 26 Feb 2018 12:29:48 -0000 1.231.2.1 +++ sys/arch/amd64/amd64/machdep.c 21 Jun 2018 11:54:01 -0000 @@ -395,7 +395,6 @@ x86_64_proc0_tss_ldt_init(void) struct pcb *pcb; cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb; - pcb->pcb_cr0 = rcr0(); pcb->pcb_fsbase = 0; pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16; proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1; @@ -404,20 +403,6 @@ x86_64_proc0_tss_ldt_init(void) lldt(0); } -/* - * Set up TSS for a new PCB. - */ - -#ifdef MULTIPROCESSOR -void -x86_64_init_pcb_tss_ldt(struct cpu_info *ci) -{ - struct pcb *pcb = ci->ci_idle_pcb; - - pcb->pcb_cr0 = rcr0(); -} -#endif /* MULTIPROCESSOR */ - bios_diskinfo_t * bios_getdiskinfo(dev_t dev) { @@ -579,6 +564,7 @@ sendsig(sig_t catcher, int sig, int mask struct trapframe *tf = p->p_md.md_regs; struct sigacts *psp = p->p_p->ps_sigacts; struct sigcontext ksc; + struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu; siginfo_t ksi; register_t sp, scp, sip; u_long sss; @@ -597,17 +583,19 @@ sendsig(sig_t catcher, int sig, int mask sp &= ~15ULL; /* just in case */ sss = (sizeof(ksc) + 15) & ~15; - if (p->p_md.md_flags & MDP_USEDFPU) { - fpusave_proc(p, 1); - sp -= fpu_save_len; - ksc.sc_fpstate = (struct fxsave64 *)sp; - if (copyout(&p->p_addr->u_pcb.pcb_savefpu.fp_fxsave, - (void *)sp, fpu_save_len)) - sigexit(p, SIGILL); + /* Save FPU state to PCB if necessary, then copy it out */ + if (curcpu()->ci_flags & CPUF_USERXSTATE) { + curcpu()->ci_flags &= ~CPUF_USERXSTATE; + fpusavereset(&p->p_addr->u_pcb.pcb_savefpu); + } + sp -= fpu_save_len; + ksc.sc_fpstate = (struct fxsave64 *)sp; + if (copyout(sfp, (void *)sp, fpu_save_len)) + sigexit(p, SIGILL); - /* Signal handlers get a completely clean FP state */ - p->p_md.md_flags &= ~MDP_USEDFPU; - } + /* Now reset the FPU state in PCB */ + memcpy(&p->p_addr->u_pcb.pcb_savefpu, + &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len); sip = 0; if (psp->ps_siginfo & sigmask(sig)) { @@ -637,6 +625,9 @@ sendsig(sig_t catcher, int sig, int mask tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC); tf->tf_rsp = scp; tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); + + /* The reset state _is_ the userspace state for this thread now */ + curcpu()->ci_flags |= CPUF_USERXSTATE; } /* @@ -681,16 +672,23 @@ sys_sigreturn(struct proc *p, void *v, r !USERMODE(ksc.sc_cs, ksc.sc_eflags)) return (EINVAL); - if (p->p_md.md_flags & MDP_USEDFPU) - fpusave_proc(p, 0); + /* Current state is obsolete; toss it and force a reload */ + if (curcpu()->ci_flags & CPUF_USERXSTATE) { + curcpu()->ci_flags &= ~CPUF_USERXSTATE; + fpureset(); + } - if (ksc.sc_fpstate) { + /* Copy in the FPU state to restore */ + if (__predict_true(ksc.sc_fpstate != NULL)) { struct fxsave64 *fx = &p->p_addr->u_pcb.pcb_savefpu.fp_fxsave; if ((error = copyin(ksc.sc_fpstate, fx, fpu_save_len))) return (error); fx->fx_mxcsr &= fpu_mxcsr_mask; - p->p_md.md_flags |= MDP_USEDFPU; + } else { + /* shouldn't happen, but handle it */ + memcpy(&p->p_addr->u_pcb.pcb_savefpu, + &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len); } ksc.sc_trapno = 
tf->tf_trapno; @@ -707,6 +705,7 @@ sys_sigreturn(struct proc *p, void *v, r * when a signal was being delivered, the process will be * completely restored, including the userland %rcx and %r11 * registers which the 'sysretq' instruction cannot restore. + * Also need to make sure we can handle faulting on xrstor. */ p->p_md.md_flags |= MDP_IRET; @@ -1092,10 +1091,19 @@ setregs(struct proc *p, struct exec_pack { struct trapframe *tf; - /* If we were using the FPU, forget about it. */ - if (p->p_addr->u_pcb.pcb_fpcpu != NULL) - fpusave_proc(p, 0); - p->p_md.md_flags &= ~MDP_USEDFPU; + /* Reset FPU state in PCB */ + memcpy(&p->p_addr->u_pcb.pcb_savefpu, + &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len); + + if (curcpu()->ci_flags & CPUF_USERXSTATE) { + /* state in CPU is obsolete; reset it */ + fpureset(); + } else { + /* the reset state _is_ the userspace state now */ + curcpu()->ci_flags |= CPUF_USERXSTATE; + } + + /* To reset all registers we have to return via iretq */ p->p_md.md_flags |= MDP_IRET; reset_segs(); Index: sys/arch/amd64/amd64/mptramp.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/mptramp.S,v retrieving revision 1.15 diff -u -p -r1.15 mptramp.S --- sys/arch/amd64/amd64/mptramp.S 29 Jun 2017 08:14:36 -0000 1.15 +++ sys/arch/amd64/amd64/mptramp.S 21 Jun 2018 11:54:01 -0000 @@ -120,7 +120,7 @@ _C_LABEL(cpu_spinup_trampoline): movw %ax, %ss addr32 lgdtl (.Lmptramp_gdt32_desc) # load flat descriptor table movl %cr0, %eax # get cr0 - orl $0x1, %eax # enable protected mode + orl $CR0_PE, %eax # enable protected mode movl %eax, %cr0 # doit ljmpl $0x8, $.Lmp_startup @@ -179,7 +179,7 @@ _TRMP_LABEL(.Lmp_startup) movl $.Lmptramp_jmp64,%eax movl %cr0,%ecx # get control word - orl $(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_MP|CR0_WP),%ecx + orl $CR0_DEFAULT,%ecx movl %ecx, %cr0 ljmp *(%eax) @@ -230,7 +230,7 @@ _C_LABEL(cpu_spinup_trampoline_end): #en /* Switch address space. */ movq PCB_CR3(%rsi),%rax movq %rax,%cr3 - movl PCB_CR0(%rsi),%eax + movl $CR0_DEFAULT,%eax movq %rax,%cr0 call _C_LABEL(cpu_hatch) /* NOTREACHED */ Index: sys/arch/amd64/amd64/process_machdep.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/process_machdep.c,v retrieving revision 1.14 diff -u -p -r1.14 process_machdep.c --- sys/arch/amd64/amd64/process_machdep.c 28 Jun 2015 18:54:54 -0000 1.14 +++ sys/arch/amd64/amd64/process_machdep.c 21 Jun 2018 11:54:01 -0000 @@ -127,19 +127,6 @@ process_read_fpregs(struct proc *p, stru { struct fxsave64 *frame = process_fpframe(p); - if (p->p_md.md_flags & MDP_USEDFPU) { - fpusave_proc(p, 1); - } else { - /* Fake a FNINIT. 
*/ - memset(frame, 0, sizeof(*regs)); - frame->fx_fcw = __INITIAL_NPXCW__; - frame->fx_fsw = 0x0000; - frame->fx_ftw = 0x00; - frame->fx_mxcsr = __INITIAL_MXCSR__; - frame->fx_mxcsr_mask = fpu_mxcsr_mask; - p->p_md.md_flags |= MDP_USEDFPU; - } - memcpy(®s->fxstate, frame, sizeof(*regs)); return (0); } @@ -189,14 +176,11 @@ process_write_fpregs(struct proc *p, str { struct fxsave64 *frame = process_fpframe(p); - if (p->p_md.md_flags & MDP_USEDFPU) { - fpusave_proc(p, 0); - } else { - p->p_md.md_flags |= MDP_USEDFPU; - } - memcpy(frame, ®s->fxstate, sizeof(*regs)); frame->fx_mxcsr &= fpu_mxcsr_mask; + + /* force target to return via iretq so bogus xstate can be handled */ + p->p_md.md_flags |= MDP_IRET; return (0); } Index: sys/arch/amd64/amd64/spl.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/spl.S,v retrieving revision 1.11.4.1 diff -u -p -r1.11.4.1 spl.S --- sys/arch/amd64/amd64/spl.S 26 Feb 2018 12:29:48 -0000 1.11.4.1 +++ sys/arch/amd64/amd64/spl.S 21 Jun 2018 11:54:01 -0000 @@ -158,18 +158,6 @@ KIDTVEC(doreti) jmp *IS_RESUME(%rax) 2: /* Check for ASTs on exit to user mode. */ movl %ebx,CPUVAR(ILEVEL) -5: CHECK_ASTPENDING(%r11) - je 3f - testb $SEL_RPL,TF_CS(%rsp) - jz 3f -4: CLEAR_ASTPENDING(%r11) - sti - movq %rsp, %rdi - call _C_LABEL(ast) - cli - jmp 5b -3: -#ifdef DIAGNOSTIC - movl $254,%esi -#endif /* DIAGNOSTIC */ + testb $SEL_RPL,TF_CS(%rsp) + jnz intr_user_exit INTRFASTEXIT Index: sys/arch/amd64/amd64/trap.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/trap.c,v retrieving revision 1.61.2.1 diff -u -p -r1.61.2.1 trap.c --- sys/arch/amd64/amd64/trap.c 26 Feb 2018 12:29:48 -0000 1.61.2.1 +++ sys/arch/amd64/amd64/trap.c 21 Jun 2018 11:54:01 -0000 @@ -97,7 +97,7 @@ void trap(struct trapframe *); void ast(struct trapframe *); void syscall(struct trapframe *); -const char *trap_type[] = { +const char * const trap_type[] = { "privileged instruction fault", /* 0 T_PRIVINFLT */ "breakpoint trap", /* 1 T_BPTFLT */ "arithmetic trap", /* 2 T_ARITHTRAP */ @@ -119,17 +119,18 @@ const char *trap_type[] = { "machine check", /* 18 T_MCA */ "SSE FP exception", /* 19 T_XMM */ }; -int trap_types = nitems(trap_type); +const int trap_types = nitems(trap_type); #ifdef DEBUG int trapdebug = 0; #endif -#define IDTVEC(name) __CONCAT(X, name) +static inline void frame_dump(struct trapframe *_tf, struct proc *_p, + const char *_sig, uint64_t _cr2); +static inline void verify_smap(const char *_func); +static inline void debug_trap(struct trapframe *_frame, struct proc *_p, + long _type); -#ifdef TRAP_SIGDEBUG -static void frame_dump(struct trapframe *); -#endif /* * trap(frame): @@ -144,38 +145,17 @@ trap(struct trapframe *frame) struct proc *p = curproc; int type = (int)frame->tf_trapno; struct pcb *pcb; - extern char doreti_iret[], resume_iret[]; - extern char xrstor_fault[], xrstor_resume[]; caddr_t onfault; int error; uint64_t cr2; union sigval sv; + verify_smap(__func__); uvmexp.traps++; + debug_trap(frame, p, type); pcb = (p != NULL && p->p_addr != NULL) ? 
&p->p_addr->u_pcb : NULL; -#ifdef DEBUG - if (trapdebug) { - printf("trap %d code %llx rip %llx cs %llx rflags %llx " - "cr2 %llx cpl %x\n", - type, frame->tf_err, frame->tf_rip, frame->tf_cs, - frame->tf_rflags, rcr2(), curcpu()->ci_ilevel); - printf("curproc %p\n", (void *)p); - if (p != NULL) - printf("pid %d\n", p->p_p->ps_pid); - } -#endif -#ifdef DIAGNOSTIC - if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) { - u_long rf = read_rflags(); - if (rf & PSL_AC) { - write_rflags(rf & ~PSL_AC); - panic("%s: AC set on entry", "trap"); - } - } -#endif - if (!KERNELMODE(frame->tf_cs, frame->tf_rflags)) { type |= T_USER; p->p_md.md_regs = frame; @@ -205,27 +185,6 @@ trap(struct trapframe *frame) /*NOTREACHED*/ case T_PROTFLT: - /* - * Check for xrstor faulting because of invalid xstate - * We do this by looking at the address of the - * instruction that faulted. - */ - if (frame->tf_rip == (u_int64_t)xrstor_fault && p != NULL) { - frame->tf_rip = (u_int64_t)xrstor_resume; - return; - } - - /* - * Check for failure during return to user mode. - * We do this by looking at the address of the - * instruction that faulted. - */ - if (frame->tf_rip == (u_int64_t)doreti_iret) { - frame->tf_rip = (u_int64_t)resume_iret; - return; - } - /* FALLTHROUGH */ - case T_SEGNPFLT: case T_ALIGNFLT: case T_TSSFLT: @@ -243,12 +202,7 @@ copyfault: case T_TSSFLT|T_USER: case T_SEGNPFLT|T_USER: case T_STKFLT|T_USER: -#ifdef TRAP_SIGDEBUG - printf("pid %d (%s): %s at rip %llx addr %llx\n", - p->p_p->ps_pid, p->p_p->ps_comm, "BUS", - frame->tf_rip, rcr2()); - frame_dump(frame); -#endif + frame_dump(frame, p, "BUS", 0); sv.sival_ptr = (void *)frame->tf_rip; KERNEL_LOCK(); trapsignal(p, SIGBUS, type & ~T_USER, BUS_OBJERR, sv); @@ -267,30 +221,11 @@ copyfault: trapsignal(p, SIGILL, type & ~T_USER, ILL_PRVOPC, sv); KERNEL_UNLOCK(); goto out; - case T_FPOPFLT|T_USER: /* coprocessor operand fault */ -#ifdef TRAP_SIGDEBUG - printf("pid %d (%s): %s at rip %llx addr %llx\n", - p->p_p->ps_pid, p->p_p->ps_comm, "ILL", - frame->tf_rip, rcr2()); - frame_dump(frame); -#endif - sv.sival_ptr = (void *)frame->tf_rip; - KERNEL_LOCK(); - trapsignal(p, SIGILL, type & ~T_USER, ILL_COPROC, sv); - KERNEL_UNLOCK(); - goto out; + case T_FPOPFLT|T_USER: /* impossible without 32bit compat */ case T_BOUND|T_USER: - sv.sival_ptr = (void *)frame->tf_rip; - KERNEL_LOCK(); - trapsignal(p, SIGFPE, type &~ T_USER, FPE_FLTSUB, sv); - KERNEL_UNLOCK(); - goto out; case T_OFLOW|T_USER: - sv.sival_ptr = (void *)frame->tf_rip; - KERNEL_LOCK(); - trapsignal(p, SIGFPE, type &~ T_USER, FPE_INTOVF, sv); - KERNEL_UNLOCK(); - goto out; + case T_DNA|T_USER: + panic("impossible trap"); case T_DIVIDE|T_USER: sv.sival_ptr = (void *)frame->tf_rip; KERNEL_LOCK(); @@ -401,18 +336,13 @@ faultcommon: p->p_ucred ? 
(int)p->p_ucred->cr_uid : -1); signal = SIGKILL; } else { -#ifdef TRAP_SIGDEBUG - printf("pid %d (%s): %s at rip %llx addr %llx\n", - p->p_p->ps_pid, p->p_p->ps_comm, "SEGV", - frame->tf_rip, rcr2()); - frame_dump(frame); -#endif - } - if (error == EACCES) - sicode = SEGV_ACCERR; - if (error == EIO) { - signal = SIGBUS; - sicode = BUS_OBJERR; + frame_dump(frame, p, "SEGV", cr2); + if (error == EACCES) + sicode = SEGV_ACCERR; + else if (error == EIO) { + signal = SIGBUS; + sicode = BUS_OBJERR; + } } sv.sival_ptr = (void *)fa; trapsignal(p, signal, T_PAGEFLT, sicode, sv); @@ -455,10 +385,12 @@ out: userret(p); } -#ifdef TRAP_SIGDEBUG -static void -frame_dump(struct trapframe *tf) +static inline void +frame_dump(struct trapframe *tf, struct proc *p, const char *sig, uint64_t cr2) { +#ifdef TRAP_SIGDEBUG + printf("pid %d (%s): %s at rip %llx addr %llx\n", + p->p_p->ps_pid, p->p_p->ps_comm, sig, tf->tf_rip, cr2); printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n", (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff, (void *)tf->tf_rflags, @@ -475,8 +407,38 @@ frame_dump(struct trapframe *tf) (void *)tf->tf_r13, (void *)tf->tf_r14, (void *)tf->tf_r15); printf("rbp %p rbx %p rax %p\n", (void *)tf->tf_rbp, (void *)tf->tf_rbx, (void *)tf->tf_rax); +#endif } + +static inline void +verify_smap(const char *func) +{ +#ifdef DIAGNOSTIC + if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) { + u_long rf = read_rflags(); + if (rf & PSL_AC) { + write_rflags(rf & ~PSL_AC); + panic("%s: AC set on entry", func); + } + } #endif +} + +static inline void +debug_trap(struct trapframe *frame, struct proc *p, long type) +{ +#ifdef DEBUG + if (trapdebug) { + printf("trap %ld code %llx rip %llx cs %llx rflags %llx " + "cr2 %llx cpl %x\n", + type, frame->tf_err, frame->tf_rip, frame->tf_cs, + frame->tf_rflags, rcr2(), curcpu()->ci_ilevel); + printf("curproc %p\n", (void *)p); + if (p != NULL) + printf("pid %d\n", p->p_p->ps_pid); + } +#endif +} /* @@ -514,16 +476,7 @@ syscall(struct trapframe *frame) size_t argsize, argoff; register_t code, args[9], rval[2], *argp; -#ifdef DIAGNOSTIC - if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) { - u_long rf = read_rflags(); - if (rf & PSL_AC) { - write_rflags(rf & ~PSL_AC); - panic("%s: AC set on entry", "syscall"); - } - } -#endif - + verify_smap(__func__); uvmexp.syscalls++; p = curproc; Index: sys/arch/amd64/amd64/vector.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v retrieving revision 1.51.2.2 diff -u -p -r1.51.2.2 vector.S --- sys/arch/amd64/amd64/vector.S 28 Feb 2018 17:01:34 -0000 1.51.2.2 +++ sys/arch/amd64/amd64/vector.S 21 Jun 2018 11:54:01 -0000 @@ -179,17 +179,7 @@ IDTVEC(trap05) IDTVEC(trap06) ZTRAP(T_PRIVINFLT) IDTVEC(trap07) - pushq $0 # dummy error code - pushq $T_DNA - INTRENTRY(trap07) - sti - cld - SMAP_CLAC - movq CPUVAR(SELF),%rdi - movq %rsp, %rsi - call _C_LABEL(fpudna) - cli - INTRFASTEXIT + ZTRAP(T_DNA) # impossible: we don't do lazy FPU IDTVEC(trap08) pushq $T_DOUBLEFLT jmp calltrap_specstk @@ -202,59 +192,47 @@ IDTVEC(trap0b) IDTVEC(trap0c) TRAP(T_STKFLT) - /* - * If iretq faults, we'll get a trap at doreti_iret with CPL==0 but - * the user's GS.base, which INTRENTRY wouldn't handle correctly - * (it would skip the swapgs), so locally expand both it and - * INTR_SAVE_GPRS, but add an extra test comparing %rip to doreti_iret - * so that we can do the necessary swapgs in that case. 
- */ +/* + * The #GP (general protection fault) handler has a couple weird cases + * to handle: + * - trapping in iretq to userspace and + * - trapping in xrstor in the kernel. + * We detect both of these by examining the %rip in the iretq_frame. + * Handling them is done by updating %rip in the iretq_frame to point + * to a stub handler of some sort and then iretq'ing to it. For the + * iretq fault we resume in a stub which acts like we got a fresh #GP. + * For the xrstor fault we resume to a stub which returns an error to + * the routine that requested the xrstor. + */ IDTVEC(trap0d) + pushq %rdx pushq %rcx - leaq _C_LABEL(doreti_iret)(%rip),%rcx - cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */ + movq 24(%rsp),%rdx /* over %r[cd]x and err to %rip */ + leaq doreti_iret(%rip),%rcx + cmpq %rcx,%rdx + je .Lhandle_doreti + leaq xrstor_fault(%rip),%rcx + cmpq %rcx,%rdx + je .Lhandle_xrstor popq %rcx - je 1f - testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */ - je INTRENTRY_LABEL(trap0d) -1: swapgs - movq %rax,CPUVAR(SCRATCH) - movq CPUVAR(KERN_CR3),%rax - testq %rax,%rax - jz 98f - movq %rax,%cr3 - jmp 98f - .text - .globl INTRENTRY_LABEL(trap0d) -INTRENTRY_LABEL(trap0d): /* from kernel */ - pushq $T_PROTFLT - subq $152,%rsp - movq %rcx,TF_RCX(%rsp) - jmp 99f -98: /* from userspace */ - movq CPUVAR(KERN_RSP),%rax - xchgq %rax,%rsp - movq %rcx,TF_RCX(%rsp) - /* set trapno in the trap frame */ - movq $T_PROTFLT,TF_TRAPNO(%rsp) - /* copy err and iretq frame to the trap frame */ - movq 0(%rax),%rcx - movq %rcx,TF_ERR(%rsp) - add $8,%rax - movq IRETQ_RIP(%rax),%rcx - movq %rcx,TF_RIP(%rsp) - movq IRETQ_CS(%rax),%rcx - movq %rcx,TF_CS(%rsp) - movq IRETQ_RFLAGS(%rax),%rcx - movq %rcx,TF_RFLAGS(%rsp) - movq IRETQ_RSP(%rax),%rcx - movq %rcx,TF_RSP(%rsp) - movq IRETQ_SS(%rax),%rcx - movq %rcx,TF_SS(%rsp) - movq CPUVAR(SCRATCH),%rax -99: INTR_SAVE_MOST_GPRS_NO_ADJ - sti - jmp calltrap + popq %rdx + TRAP(T_PROTFLT) + +.Lhandle_xrstor: + /* xrstor faulted; just resume in xrstor_resume */ + leaq xrstor_resume(%rip),%rcx + jmp 1f + +.Lhandle_doreti: + /* iretq faulted; resume in a stub that acts like we got a #GP */ + leaq .Lhandle_doreti_resume(%rip),%rcx +1: movq %rcx,24(%rsp) /* over %r[cd]x and err to %rip */ + popq %rcx + popq %rdx + addq $8,%rsp /* pop the err code */ + jmp doreti_iret +.Lhandle_doreti_resume: + ZTRAP(T_PROTFLT) IDTVEC(trap0e) TRAP(T_PAGEFLT) @@ -305,55 +283,12 @@ Xexceptions: .quad _C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f) /* - * If an error is detected during trap, syscall, or interrupt exit, trap() will - * change %rip to point to this label. At that point, we'll be running with - * the kernel GS.base, but the trap frame will be from CPL==3, so we can't - * go through INTRENTRY as it would do the swapgs that we don't want/need. - * So, locally expand INTRENTRY but without the swapgs: manually - * clean up the stack and resume as if we were handling a general - * protection fault. This will cause the process to get a SIGBUS. 
- */ -NENTRY(resume_iret) - movq %rax,CPUVAR(SCRATCH) - movq CPUVAR(KERN_CR3),%rax - testq %rax,%rax - jz INTRENTRY_LABEL(iret) - movq %rax,%cr3 - jmp INTRENTRY_LABEL(iret) - .text - .globl INTRENTRY_LABEL(iret) -INTRENTRY_LABEL(iret): /* from kernel */ - movq CPUVAR(KERN_RSP),%rax - xchgq %rax,%rsp - movq %rcx,TF_RCX(%rsp) - /* set trapno+err in the trap frame */ - movq $T_PROTFLT,TF_TRAPNO(%rsp) - movq $0,TF_ERR(%rsp) - /* copy iretq frame to the trap frame */ - movq IRETQ_RIP(%rax),%rcx - movq %rcx,TF_RIP(%rsp) - movq IRETQ_CS(%rax),%rcx - movq %rcx,TF_CS(%rsp) - movq IRETQ_RFLAGS(%rax),%rcx - movq %rcx,TF_RFLAGS(%rsp) - movq IRETQ_RSP(%rax),%rcx - movq %rcx,TF_RSP(%rsp) - movq IRETQ_SS(%rax),%rcx - movq %rcx,TF_SS(%rsp) - movq CPUVAR(SCRATCH),%rax - INTR_SAVE_MOST_GPRS_NO_ADJ - sti - jmp calltrap - - -/* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ KUENTRY(alltraps) INTRENTRY(alltraps) sti -calltrap: cld SMAP_CLAC #ifdef DIAGNOSTIC @@ -376,19 +311,14 @@ calltrap: jz 2f .Lreal_trap: #endif /* !defined(GPROF) && defined(DDBPROF) */ + .globl recall_trap +recall_trap: movq %rsp, %rdi call _C_LABEL(trap) 2: /* Check for ASTs on exit to user mode. */ cli - CHECK_ASTPENDING(%r11) - je 1f testb $SEL_RPL,TF_CS(%rsp) - jz 1f -5: CLEAR_ASTPENDING(%r11) - sti - movq %rsp, %rdi - call _C_LABEL(ast) - jmp 2b + jnz intr_user_exit #ifndef DIAGNOSTIC 1: INTRFASTEXIT #else /* DIAGNOSTIC */ @@ -396,7 +326,7 @@ calltrap: jne 3f INTRFASTEXIT 3: sti - movabsq $spl_lowered,%rdi + leaq spl_lowered(%rip),%rdi movl CPUVAR(ILEVEL),%esi movl %ebx,%edx xorq %rax,%rax @@ -601,7 +531,6 @@ KIDTVEC(resume_xen_upcall) 2: movq $(1 << LIR_XEN),%rax orq %rax,CPUVAR(IPENDING) -3: INTRFASTEXIT #endif /* NXEN > 0 */ @@ -636,7 +565,6 @@ KIDTVEC(resume_hyperv_upcall) 2: movq $(1 << LIR_HYPERV),%rax orq %rax,CPUVAR(IPENDING) -3: INTRFASTEXIT #endif /* NHYPERV > 0 */ #endif /* NLAPIC > 0 */ @@ -682,7 +610,7 @@ IDTVEC(intr_##name##num) ;\ SMAP_CLAC ;\ incl CPUVAR(IDEPTH) ;\ movq IS_HANDLERS(%r14),%rbx ;\ -6: \ +6: /* loop, walking chain of handlers */ \ movl IH_LEVEL(%rbx),%r12d ;\ cmpl %r13d,%r12d ;\ jle 7f ;\ @@ -693,6 +621,8 @@ IDTVEC(intr_##name##num) ;\ orl %eax,%eax /* should it be counted? */ ;\ jz 4f /* no, skip it */ ;\ incq IH_COUNT(%rbx) /* count the intrs */ ;\ + cmpl $2,%eax /* can't know if it was ours */ ;\ + je 4f /* keep trying */ ;\ cmpl $0,_C_LABEL(intr_shared_edge) ;\ jne 4f /* if no shared edges ... 
*/ ;\ orl %eax,%eax /* 1 means stop trying */ ;\ @@ -700,13 +630,13 @@ IDTVEC(intr_##name##num) ;\ 4: movq IH_NEXT(%rbx),%rbx /* next handler in chain */ ;\ testq %rbx,%rbx ;\ jnz 6b ;\ -5: \ +5: /* successfully handled */ \ cli ;\ unmask(num) /* unmask it in hardware */ ;\ late_ack(num) ;\ sti ;\ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\ -7: \ +7: /* current IPL > handler's ih_level */ \ cli ;\ movq $(1 << num),%rax ;\ orq %rax,CPUVAR(IPENDING) ;\ @@ -714,16 +644,18 @@ IDTVEC(intr_##name##num) ;\ late_ack(num) ;\ sti ;\ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\ -10: \ +10: /* currently masked */ \ cli ;\ movq $(1 << num),%rax ;\ orq %rax,CPUVAR(IPENDING) ;\ level_mask(num) ;\ late_ack(num) ;\ INTRFASTEXIT ;\ -9: \ +9: /* spurious interrupt */ \ unmask(num) ;\ late_ack(num) ;\ + testb $SEL_RPL,TF_CS(%rsp) ;\ + jnz intr_user_exit ;\ INTRFASTEXIT #define ICUADDR IO_ICU1 Index: sys/arch/amd64/amd64/via.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/via.c,v retrieving revision 1.23 diff -u -p -r1.23 via.c --- sys/arch/amd64/amd64/via.c 2 May 2017 11:47:49 -0000 1.23 +++ sys/arch/amd64/amd64/via.c 21 Jun 2018 11:54:01 -0000 @@ -317,18 +317,11 @@ static __inline void viac3_cbc(void *cw, void *src, void *dst, void *key, int rep, void *iv) { - unsigned int creg0; - - creg0 = rcr0(); /* Permit access to SIMD/FPU path */ - lcr0(creg0 & ~(CR0_EM|CR0_TS)); - /* Do the deed */ __asm volatile("pushfq; popfq"); __asm volatile("rep xcryptcbc" : : "b" (key), "a" (iv), "c" (rep), "d" (cw), "S" (src), "D" (dst) : "memory", "cc"); - - lcr0(creg0); } int @@ -521,14 +514,8 @@ void viac3_rnd(void *v) { struct timeout *tmo = v; - unsigned int *p, i, rv, creg0, len = VIAC3_RNG_BUFSIZ; + unsigned int *p, i, rv, len = VIAC3_RNG_BUFSIZ; static int buffer[VIAC3_RNG_BUFSIZ + 2]; /* XXX why + 2? */ -#ifdef MULTIPROCESSOR - int s = splipi(); -#endif - - creg0 = rcr0(); /* Permit access to SIMD/FPU path */ - lcr0(creg0 & ~(CR0_EM|CR0_TS)); /* * Here we collect the random data from the VIA C3 RNG. We make @@ -538,12 +525,6 @@ viac3_rnd(void *v) __asm volatile("rep xstorerng" : "=a" (rv) : "d" (3), "D" (buffer), "c" (len*sizeof(int)) : "memory", "cc"); - - lcr0(creg0); - -#ifdef MULTIPROCESSOR - splx(s); -#endif for (i = 0, p = buffer; i < VIAC3_RNG_BUFSIZ; i++, p++) add_true_randomness(*p); Index: sys/arch/amd64/amd64/vm_machdep.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vm_machdep.c,v retrieving revision 1.40 diff -u -p -r1.40 vm_machdep.c --- sys/arch/amd64/amd64/vm_machdep.c 12 Sep 2017 02:58:08 -0000 1.40 +++ sys/arch/amd64/amd64/vm_machdep.c 21 Jun 2018 11:54:01 -0000 @@ -73,19 +73,12 @@ cpu_fork(struct proc *p1, struct proc *p void (*func)(void *), void *arg) { struct pcb *pcb = &p2->p_addr->u_pcb; + struct pcb *pcb1 = &p1->p_addr->u_pcb; struct trapframe *tf; struct switchframe *sf; - /* - * If fpuproc != p1, then the fpu h/w state is irrelevant and the - * state had better already be in the pcb. This is true for forks - * but not for dumps. - * - * If fpuproc == p1, then we have to save the fpu h/w state to - * p1's pcb so that we can copy it. - */ - if (p1->p_addr->u_pcb.pcb_fpcpu != NULL) - fpusave_proc(p1, 1); + /* Save the fpu h/w state to p1's pcb so that we can copy it. 
*/ + fpusave(&pcb1->pcb_savefpu); p2->p_md.md_flags = p1->p_md.md_flags; @@ -93,7 +86,7 @@ cpu_fork(struct proc *p1, struct proc *p if (p1 != curproc && p1 != &proc0) panic("cpu_fork: curproc"); #endif - *pcb = p1->p_addr->u_pcb; + *pcb = *pcb1; /* * Activate the address space. @@ -137,11 +130,6 @@ cpu_fork(struct proc *p1, struct proc *p void cpu_exit(struct proc *p) { - - /* If we were using the FPU, forget about it. */ - if (p->p_addr->u_pcb.pcb_fpcpu != NULL) - fpusave_proc(p, 0); - pmap_deactivate(p); sched_exit(p); } Index: sys/arch/amd64/amd64/vmm.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v retrieving revision 1.170 diff -u -p -r1.170 vmm.c --- sys/arch/amd64/amd64/vmm.c 8 Sep 2017 05:36:51 -0000 1.170 +++ sys/arch/amd64/amd64/vmm.c 21 Jun 2018 11:54:01 -0000 @@ -3584,39 +3584,67 @@ vcpu_must_stop(struct vcpu *vcpu) } /* - * vmm_fpusave + * vmm_fpurestore * - * Modified version of fpusave_cpu from fpu.c that only saves the FPU context - * and does not call splipi/splx. Must be called with interrupts disabled. + * Restore the guest's FPU state, saving the existing userland thread's + * FPU context if necessary. Must be called with interrupts disabled. */ -void -vmm_fpusave(void) +int +vmm_fpurestore(struct vcpu *vcpu) { - struct proc *p; struct cpu_info *ci = curcpu(); - p = ci->ci_fpcurproc; - if (p == NULL) - return; + /* save vmmd's FPU state if we haven't already */ + if (ci->ci_flags & CPUF_USERXSTATE) { + ci->ci_flags &= ~CPUF_USERXSTATE; + fpusavereset(&curproc->p_addr->u_pcb.pcb_savefpu); + } - if (ci->ci_fpsaving != 0) - panic("%s: recursive save!", __func__); - /* - * Set ci->ci_fpsaving, so that any pending exception will be - * thrown away. (It will be caught again if/when the FPU - * state is restored.) - */ - ci->ci_fpsaving = 1; - if (xsave_mask) - xsave(&p->p_addr->u_pcb.pcb_savefpu, xsave_mask); - else - fxsave(&p->p_addr->u_pcb.pcb_savefpu); - ci->ci_fpsaving = 0; + if (vcpu->vc_fpuinited) { + /* Restore guest XCR0 and FPU context */ + if (vcpu->vc_gueststate.vg_xcr0 & ~xsave_mask) { + DPRINTF("%s: guest attempted to set invalid %s\n", + __func__, "bits in xcr0"); + return EINVAL; + } - p->p_addr->u_pcb.pcb_cr0 |= CR0_TS; + if (xrstor_user(&vcpu->vc_g_fpu, xsave_mask)) { + DPRINTF("%s: guest attempted to set invalid %s\n", + __func__, "xsave/xrstor state"); + return EINVAL; + } + } + + if (xsave_mask) { + /* Restore guest %xcr0 */ + xsetbv(0, vcpu->vc_gueststate.vg_xcr0); + } - p->p_addr->u_pcb.pcb_fpcpu = NULL; - ci->ci_fpcurproc = NULL; + return 0; +} + +/* + * vmm_fpusave + * + * Save the guest's FPU state. Must be called with interrupts disabled. + */ +void +vmm_fpusave(struct vcpu *vcpu) +{ + if (xsave_mask) { + /* Save guest %xcr0 */ + vcpu->vc_gueststate.vg_xcr0 = xgetbv(0); + + /* Restore host %xcr0 */ + xsetbv(0, xsave_mask); + } + + /* + * Save full copy of FPU state - guest content is always + * a subset of host's save area (see xsetbv exit handler) + */ + fpusavereset(&vcpu->vc_g_fpu); + vcpu->vc_fpuinited = 1; } /* @@ -3839,39 +3867,10 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v /* Disable interrupts and save the current FPU state. 
 		disable_intr();
-		clts();
-		vmm_fpusave();
-
-		/* Initialize the guest FPU if not inited already */
-		if (!vcpu->vc_fpuinited) {
-			fninit();
-			bzero(&vcpu->vc_g_fpu.fp_fxsave,
-			    sizeof(vcpu->vc_g_fpu.fp_fxsave));
-			vcpu->vc_g_fpu.fp_fxsave.fx_fcw =
-			    __INITIAL_NPXCW__;
-			vcpu->vc_g_fpu.fp_fxsave.fx_mxcsr =
-			    __INITIAL_MXCSR__;
-			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
-
-			vcpu->vc_fpuinited = 1;
-		}
-
-		if (xsave_mask) {
-			/* Restore guest XCR0 and FPU context */
-			if (vcpu->vc_gueststate.vg_xcr0 & ~xsave_mask) {
-				DPRINTF("%s: guest attempted to set invalid "
-				    "bits in xcr0\n", __func__);
-				ret = EINVAL;
-				stts();
-				enable_intr();
-				break;
-			}
-
-			/* Restore guest %xcr0 */
-			xrstor(&vcpu->vc_g_fpu, xsave_mask);
-			xsetbv(0, vcpu->vc_gueststate.vg_xcr0);
-		} else
-			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
+		if ((ret = vmm_fpurestore(vcpu))) {
+			enable_intr();
+			break;
+		}
 
 		KERNEL_UNLOCK();
 		ret = vmx_enter_guest(&vcpu->vc_control_pa,
@@ -3882,27 +3881,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
 		 * the guest FPU state still possibly on the CPU. Save the FPU
 		 * state before re-enabling interrupts.
 		 */
-		if (xsave_mask) {
-			/* Save guest %xcr0 */
-			vcpu->vc_gueststate.vg_xcr0 = xgetbv(0);
-
-			/* Restore host %xcr0 */
-			xsetbv(0, xsave_mask);
-
-			/*
-			 * Save full copy of FPU state - guest content is
-			 * always a subset of host's save area (see xsetbv
-			 * exit handler)
-			 */
-			xsave(&vcpu->vc_g_fpu, xsave_mask);
-		} else
-			fxsave(&vcpu->vc_g_fpu);
-
-		/*
-		 * FPU state is invalid, set CR0_TS to force DNA trap on next
-		 * access.
-		 */
-		stts();
+		vmm_fpusave(vcpu);
 
 		enable_intr();
 
@@ -5715,39 +5694,10 @@ vcpu_run_svm(struct vcpu *vcpu, struct v
 
 		/* Disable interrupts and save the current FPU state. */
 		disable_intr();
-		clts();
-		vmm_fpusave();
-
-		/* Initialize the guest FPU if not inited already */
-		if (!vcpu->vc_fpuinited) {
-			fninit();
-			bzero(&vcpu->vc_g_fpu.fp_fxsave,
-			    sizeof(vcpu->vc_g_fpu.fp_fxsave));
-			vcpu->vc_g_fpu.fp_fxsave.fx_fcw =
-			    __INITIAL_NPXCW__;
-			vcpu->vc_g_fpu.fp_fxsave.fx_mxcsr =
-			    __INITIAL_MXCSR__;
-			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
-
-			vcpu->vc_fpuinited = 1;
-		}
-
-		if (xsave_mask) {
-			/* Restore guest XCR0 and FPU context */
-			if (vcpu->vc_gueststate.vg_xcr0 & ~xsave_mask) {
-				DPRINTF("%s: guest attempted to set invalid "
-				    "bits in xcr0\n", __func__);
-				ret = EINVAL;
-				stts();
-				enable_intr();
-				break;
-			}
-
-			/* Restore guest %xcr0 */
-			xrstor(&vcpu->vc_g_fpu, xsave_mask);
-			xsetbv(0, vcpu->vc_gueststate.vg_xcr0);
-		} else
-			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
+		if ((ret = vmm_fpurestore(vcpu))) {
+			enable_intr();
+			break;
+		}
 
 		KERNEL_UNLOCK();
 
@@ -5761,27 +5711,7 @@ vcpu_run_svm(struct vcpu *vcpu, struct v
 		 * the guest FPU state still possibly on the CPU. Save the FPU
 		 * state before re-enabling interrupts.
 		 */
-		if (xsave_mask) {
-			/* Save guest %xcr0 */
-			vcpu->vc_gueststate.vg_xcr0 = xgetbv(0);
-
-			/* Restore host %xcr0 */
-			xsetbv(0, xsave_mask);
-
-			/*
-			 * Save full copy of FPU state - guest content is
-			 * always a subset of host's save area (see xsetbv
-			 * exit handler)
-			 */
-			xsave(&vcpu->vc_g_fpu, xsave_mask);
-		} else
-			fxsave(&vcpu->vc_g_fpu);
-
-		/*
-		 * FPU state is invalid, set CR0_TS to force DNA trap on next
-		 * access.
-		 */
-		stts();
+		vmm_fpusave(vcpu);
 
 		enable_intr();
 
Index: sys/arch/amd64/include/codepatch.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/codepatch.h,v
retrieving revision 1.4
diff -u -p -r1.4 codepatch.h
--- sys/arch/amd64/include/codepatch.h	25 Aug 2017 19:28:48 -0000	1.4
+++ sys/arch/amd64/include/codepatch.h	21 Jun 2018 11:54:01 -0000
@@ -50,6 +50,8 @@ void codepatch_call(uint16_t tag, void *
 #define CPTAG_STAC		1
 #define CPTAG_CLAC		2
 #define CPTAG_EOI		3
+#define CPTAG_XRSTOR		4
+#define CPTAG_XSAVE		5
 
 /*
  * As stac/clac SMAP instructions are 3 bytes, we want the fastest
Index: sys/arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.114.4.1
diff -u -p -r1.114.4.1 cpu.h
--- sys/arch/amd64/include/cpu.h	26 Feb 2018 12:29:48 -0000	1.114.4.1
+++ sys/arch/amd64/include/cpu.h	21 Jun 2018 11:54:01 -0000
@@ -115,10 +115,6 @@ struct cpu_info {
 	u_int64_t ci_intr_rsp;	/* U<-->K trampoline stack */
 	u_int64_t ci_user_cr3;	/* U-K page table */
 
-	struct proc *ci_fpcurproc;
-	struct proc *ci_fpsaveproc;
-	int ci_fpsaving;
-
 	struct pcb *ci_curpcb;
 	struct pcb *ci_idle_pcb;
 
@@ -216,9 +212,9 @@ struct cpu_info {
 #define CPUF_IDENTIFIED	0x0020		/* CPU has been identified */
 #define CPUF_CONST_TSC	0x0040		/* CPU has constant TSC */
 
-#define CPUF_USERSEGS_BIT	7	/* CPU has curproc's segments */
-#define CPUF_USERSEGS	(1<<CPUF_USERSEGS_BIT)
+#define CPUF_USERSEGS	0x0080		/* CPU has curproc's segments */
+#define CPUF_USERXSTATE	0x0100		/* CPU has curproc's xsave state */
Index: sys/arch/amd64/include/fpu.h
===================================================================
 /*
- * amd64 only uses the extended save/restore format used
- * by fxsave/fsrestore, to always deal with the SSE registers,
- * which are part of the ABI to pass floating point values.
- * Must be stored in memory on a 16-byte boundary.
+ * If the CPU supports xsave/xrstor then we use them so that we can provide
+ * AVX support. Otherwise we require fxsave/fxrstor, as the SSE registers
+ * are part of the ABI for passing floating point values.
+ * While fxsave/fxrstor only required 16-byte alignment for the save area,
+ * xsave/xrstor requires the save area to have 64-byte alignment.
 */
 
 struct fxsave64 {
@@ -63,23 +64,22 @@ extern uint32_t fpu_mxcsr_mask;
 extern uint64_t	xsave_mask;
 
 void fpuinit(struct cpu_info *);
-void fpudrop(void);
-void fpudiscard(struct proc *);
 void fputrap(struct trapframe *);
-void fpusave_proc(struct proc *, int);
-void fpusave_cpu(struct cpu_info *, int);
+void fpusave(struct savefpu *);
+void fpusavereset(struct savefpu *);
 void fpu_kernel_enter(void);
 void fpu_kernel_exit(void);
 
+int	xrstor_user(struct savefpu *_addr, uint64_t _mask);
+#define	fpureset() \
+	xrstor_user(&proc0.p_addr->u_pcb.pcb_savefpu, xsave_mask)
+
 #define fninit()		__asm("fninit")
 #define fwait()			__asm("fwait")
-#define fnclex()		__asm("fnclex")
+/* should be fxsave64, but where we use this it doesn't matter */
 #define fxsave(addr)		__asm("fxsave %0" : "=m" (*addr))
-#define fxrstor(addr)		__asm("fxrstor %0" : : "m" (*addr))
 #define ldmxcsr(addr)		__asm("ldmxcsr %0" : : "m" (*addr))
 #define fldcw(addr)		__asm("fldcw %0" : : "m" (*addr))
-#define clts()			__asm("clts")
-#define stts()			lcr0(rcr0() | CR0_TS)
 
 static inline void
 xsave(struct savefpu *addr, uint64_t mask)
@@ -88,18 +88,9 @@ xsave(struct savefpu *addr, uint64_t mas
 	lo = mask;
 	hi = mask >> 32;
+	/* should be xsave64, but where we use this it doesn't matter */
 	__asm volatile("xsave %0" : "=m" (*addr) : "a" (lo), "d" (hi) :
 	    "memory");
-}
-
-static inline void
-xrstor(struct savefpu *addr, uint64_t mask)
-{
-	uint32_t lo, hi;
-
-	lo = mask;
-	hi = mask >> 32;
-	__asm volatile("xrstor %0" : : "m" (*addr), "a" (lo), "d" (hi));
 }
 
 #endif
Index: sys/arch/amd64/include/intrdefs.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/intrdefs.h,v
retrieving revision 1.16
diff -u -p -r1.16 intrdefs.h
--- sys/arch/amd64/include/intrdefs.h	22 Jun 2016 01:12:38 -0000	1.16
+++ sys/arch/amd64/include/intrdefs.h	21 Jun 2018 11:54:01 -0000
@@ -75,8 +75,6 @@
 #define X86_IPI_HALT			0x00000001
 #define X86_IPI_NOP			0x00000002
-#define X86_IPI_FLUSH_FPU		0x00000004
-#define X86_IPI_SYNCH_FPU		0x00000008
 #define X86_IPI_TLB			0x00000010
 #define X86_IPI_MTRR			0x00000020
 #define X86_IPI_SETPERF			0x00000040
 
@@ -84,10 +82,10 @@
 #define X86_IPI_START_VMM		0x00000100
 #define X86_IPI_STOP_VMM		0x00000200
 
-#define X86_NIPI			10
+#define X86_NIPI			11
 
-#define X86_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \
-			"FPU synch IPI", "TLB shootdown IPI", \
+#define X86_IPI_NAMES { "halt IPI", "nop IPI", NULL, \
+			NULL, "TLB shootdown IPI", \
 			"MTRR update IPI", "setperf IPI", "ddb IPI", \
 			"VMM start IPI", "VMM stop IPI" }
Index: sys/arch/amd64/include/pcb.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pcb.h,v
retrieving revision 1.16
diff -u -p -r1.16 pcb.h
--- sys/arch/amd64/include/pcb.h	26 Apr 2017 07:05:24 -0000	1.16
+++ sys/arch/amd64/include/pcb.h	21 Jun 2018 11:54:01 -0000
@@ -69,7 +69,6 @@
 
 #include 
-#include 
 #include 
 
 /*
@@ -84,9 +83,7 @@ struct pcb {
 	u_int64_t pcb_kstack;	/* kernel stack address */
 	u_int64_t pcb_fsbase;	/* per-thread offset: %fs */
 	caddr_t pcb_onfault;	/* copyin/out fault recovery */
-	struct cpu_info *pcb_fpcpu;	/* cpu holding our fp state. */
 	struct pmap *pcb_pmap;		/* back pointer to our pmap */
-	int pcb_cr0;			/* saved image of CR0 */
 };
 
 #ifdef _KERNEL
Index: sys/arch/amd64/include/proc.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/proc.h,v
retrieving revision 1.9
diff -u -p -r1.9 proc.h
--- sys/arch/amd64/include/proc.h	13 Apr 2017 03:52:25 -0000	1.9
+++ sys/arch/amd64/include/proc.h	21 Jun 2018 11:54:01 -0000
@@ -46,7 +46,6 @@ struct mdproc {
 };
 
 /* md_flags */
-#define MDP_USEDFPU	0x0001	/* has used the FPU */
 #define MDP_IRET	0x0002	/* return via iret, not sysret */
 				/* (iret can restore r11 and rcx) */
 
Index: sys/arch/amd64/include/specialreg.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
retrieving revision 1.61.4.1
diff -u -p -r1.61.4.1 specialreg.h
--- sys/arch/amd64/include/specialreg.h	26 Feb 2018 12:29:48 -0000	1.61.4.1
+++ sys/arch/amd64/include/specialreg.h	21 Jun 2018 11:54:01 -0000
@@ -1386,3 +1386,15 @@
 #define PAT_WB		0x6UL
 #define PAT_UCMINUS	0x7UL
 
+/*
+ * XSAVE subfeatures (cpuid 0xd, leaf 1)
+ */
+#define XSAVE_XSAVEOPT	0x1UL
+#define XSAVE_XSAVEC	0x2UL
+#define XSAVE_XGETBV1	0x4UL
+#define XSAVE_XSAVES	0x8UL
+
+/*
+ * Default cr0 flags.
+ */
+#define CR0_DEFAULT	(CR0_PE|CR0_PG|CR0_NE|CR0_WP)
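
The fpu.h hunk above keeps fpu_kernel_enter()/fpu_kernel_exit() as the interface
for kernel code that needs the FPU: with CR0_TS absent from CR0_DEFAULT there is
no device-not-available trap left to save state lazily, so in-kernel SIMD use
must be explicitly bracketed. A minimal sketch of that pattern follows; it is
illustrative only and not part of the signed patch. fpu_kernel_enter(),
fpu_kernel_exit(), CPUF_USERXSTATE and pcb_savefpu are real names from the
diffs above; example_crypt() and kernel_simd_op() are hypothetical.

/*
 * Illustrative sketch only -- not part of the patch.  Shows the
 * bracketing contract for in-kernel SIMD/FPU use after this errata.
 * kernel_simd_op() stands in for any SSE/AVX-using routine.
 */
#include <sys/param.h>
#include <machine/fpu.h>

void	kernel_simd_op(u_char *, size_t);	/* hypothetical SSE/AVX routine */

void
example_crypt(u_char *buf, size_t len)
{
	/*
	 * Claim the FPU for the kernel.  With CR0_TS gone there is no
	 * trap-based lazy save; entering saves the user thread's extended
	 * state if it is still live on this CPU (tracked via flags such
	 * as CPUF_USERXSTATE) and provides clean registers.
	 */
	fpu_kernel_enter();
	kernel_simd_op(buf, len);	/* may execute xmm/ymm instructions */
	fpu_kernel_exit();
	/*
	 * On the way back to userland the thread's own state is reloaded
	 * from its PCB save area (pcb_savefpu) with xrstor/fxrstor.
	 */
}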