diff --git a/s2/cpuid_amd64.go b/s2/cpuid_amd64.go
deleted file mode 100644
index 114ccd2b95..0000000000
--- a/s2/cpuid_amd64.go
+++ /dev/null
@@ -1,1194 +0,0 @@
-// Generated, DO NOT EDIT,
-// but copy it to your own project and rename the package.
-// See more at http://github.com/klauspost/cpuid
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-package s2
-
-import "strings"
-
-func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
-func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-func asmXgetbv(index uint32) (eax, edx uint32)
-func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
-
-func initCPU() {
-	cpuid = asmCpuid
-	cpuidex = asmCpuidex
-	xgetbv = asmXgetbv
-	rdtscpAsm = asmRdtscpAsm
-}
-
-// Vendor is a representation of a CPU vendor.
-type vendor int
-
-const (
-	other vendor = iota
-	intel
-	amd
-	via
-	transmeta
-	nsc
-	kvm  // Kernel-based Virtual Machine
-	msvm // Microsoft Hyper-V or Windows Virtual PC
-	vmware
-	xenhvm
-	bhyve
-	hygon
-)
-
-const (
-	cmov               = 1 << iota // i686 CMOV
-	nx                             // NX (No-Execute) bit
-	amd3dnow                       // AMD 3DNOW
-	amd3dnowext                    // AMD 3DNowExt
-	mmx                            // standard MMX
-	mmxext                         // SSE integer functions or AMD MMX ext
-	sse                            // SSE functions
-	sse2                           // P4 SSE functions
-	sse3                           // Prescott SSE3 functions
-	ssse3                          // Conroe SSSE3 functions
-	sse4                           // Penryn SSE4.1 functions
-	sse4a                          // AMD Barcelona microarchitecture SSE4a instructions
-	sse42                          // Nehalem SSE4.2 functions
-	avx                            // AVX functions
-	avx2                           // AVX2 functions
-	fma3                           // Intel FMA 3
-	fma4                           // Bulldozer FMA4 functions
-	xop                            // Bulldozer XOP functions
-	f16c                           // Half-precision floating-point conversion
-	bmi1                           // Bit Manipulation Instruction Set 1
-	bmi2                           // Bit Manipulation Instruction Set 2
-	tbm                            // AMD Trailing Bit Manipulation
-	lzcnt                          // LZCNT instruction
-	popcnt                         // POPCNT instruction
-	aesni                          // Advanced Encryption Standard New Instructions
-	clmul                          // Carry-less Multiplication
-	htt                            // Hyperthreading (enabled)
-	hle                            // Hardware Lock Elision
-	rtm                            // Restricted Transactional Memory
-	rdrand                         // RDRAND instruction is available
-	rdseed                         // RDSEED instruction is available
-	adx                            // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-	sha                            // Intel SHA Extensions
-	avx512f                        // AVX-512 Foundation
-	avx512dq                       // AVX-512 Doubleword and Quadword Instructions
-	avx512ifma                     // AVX-512 Integer Fused Multiply-Add Instructions
-	avx512pf                       // AVX-512 Prefetch Instructions
-	avx512er                       // AVX-512 Exponential and Reciprocal Instructions
-	avx512cd                       // AVX-512 Conflict Detection Instructions
-	avx512bw                       // AVX-512 Byte and Word Instructions
-	avx512vl                       // AVX-512 Vector Length Extensions
-	avx512vbmi                     // AVX-512 Vector Bit Manipulation Instructions
-	avx512vbmi2                    // AVX-512 Vector Bit Manipulation Instructions, Version 2
-	avx512vnni                     // AVX-512 Vector Neural Network Instructions
-	avx512vpopcntdq                // AVX-512 Vector Population Count Doubleword and Quadword
-	gfni                           // Galois Field New Instructions
-	vaes                           // Vector AES
-	avx512bitalg                   // AVX-512 Bit Algorithms
-	vpclmulqdq                     // Carry-Less Multiplication Quadword
-	avx512bf16                     // AVX-512 BFLOAT16 Instructions
-	avx512vp2intersect             // AVX-512 Intersect for D/Q
-	mpx                            // Intel MPX (Memory Protection Extensions)
-	erms                           // Enhanced REP MOVSB/STOSB
-	rdtscp                         // RDTSCP Instruction
-	cx16                           // CMPXCHG16B Instruction
-	sgx                            // Software Guard Extensions
-	sgxlc                          // Software Guard Extensions Launch Control
-	ibpb                           // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
-	stibp                          // Single Thread Indirect Branch Predictors
-	vmx                            // Virtual Machine Extensions
-
-	// Performance indicators
-	sse2slow // SSE2 is supported, but usually not faster
-	sse3slow // SSE3 is supported, but usually not faster
-	atom     // Atom processor, some SSSE3 instructions are slower
-)
-
-var flagNames = map[flags]string{
-	cmov:               "CMOV",               // i686 CMOV
-	nx:                 "NX",                 // NX (No-Execute) bit
-	amd3dnow:           "AMD3DNOW",           // AMD 3DNOW
-	amd3dnowext:        "AMD3DNOWEXT",        // AMD 3DNowExt
-	mmx:                "MMX",                // Standard MMX
-	mmxext:             "MMXEXT",             // SSE integer functions or AMD MMX ext
-	sse:                "SSE",                // SSE functions
-	sse2:               "SSE2",               // P4 SSE2 functions
-	sse3:               "SSE3",               // Prescott SSE3 functions
-	ssse3:              "SSSE3",              // Conroe SSSE3 functions
-	sse4:               "SSE4.1",             // Penryn SSE4.1 functions
-	sse4a:              "SSE4A",              // AMD Barcelona microarchitecture SSE4a instructions
-	sse42:              "SSE4.2",             // Nehalem SSE4.2 functions
-	avx:                "AVX",                // AVX functions
-	avx2:               "AVX2",               // AVX functions
-	fma3:               "FMA3",               // Intel FMA 3
-	fma4:               "FMA4",               // Bulldozer FMA4 functions
-	xop:                "XOP",                // Bulldozer XOP functions
-	f16c:               "F16C",               // Half-precision floating-point conversion
-	bmi1:               "BMI1",               // Bit Manipulation Instruction Set 1
-	bmi2:               "BMI2",               // Bit Manipulation Instruction Set 2
-	tbm:                "TBM",                // AMD Trailing Bit Manipulation
-	lzcnt:              "LZCNT",              // LZCNT instruction
-	popcnt:             "POPCNT",             // POPCNT instruction
-	aesni:              "AESNI",              // Advanced Encryption Standard New Instructions
-	clmul:              "CLMUL",              // Carry-less Multiplication
-	htt:                "HTT",                // Hyperthreading (enabled)
-	hle:                "HLE",                // Hardware Lock Elision
-	rtm:                "RTM",                // Restricted Transactional Memory
-	rdrand:             "RDRAND",             // RDRAND instruction is available
-	rdseed:             "RDSEED",             // RDSEED instruction is available
-	adx:                "ADX",                // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-	sha:                "SHA",                // Intel SHA Extensions
-	avx512f:            "AVX512F",            // AVX-512 Foundation
-	avx512dq:           "AVX512DQ",           // AVX-512 Doubleword and Quadword Instructions
-	avx512ifma:         "AVX512IFMA",         // AVX-512 Integer Fused Multiply-Add Instructions
-	avx512pf:           "AVX512PF",           // AVX-512 Prefetch Instructions
-	avx512er:           "AVX512ER",           // AVX-512 Exponential and Reciprocal Instructions
-	avx512cd:           "AVX512CD",           // AVX-512 Conflict Detection Instructions
-	avx512bw:           "AVX512BW",           // AVX-512 Byte and Word Instructions
-	avx512vl:           "AVX512VL",           // AVX-512 Vector Length Extensions
-	avx512vbmi:         "AVX512VBMI",         // AVX-512 Vector Bit Manipulation Instructions
-	avx512vbmi2:        "AVX512VBMI2",        // AVX-512 Vector Bit Manipulation Instructions, Version 2
-	avx512vnni:         "AVX512VNNI",         // AVX-512 Vector Neural Network Instructions
-	avx512vpopcntdq:    "AVX512VPOPCNTDQ",    // AVX-512 Vector Population Count Doubleword and Quadword
-	gfni:               "GFNI",               // Galois Field New Instructions
-	vaes:               "VAES",               // Vector AES
-	avx512bitalg:       "AVX512BITALG",       // AVX-512 Bit Algorithms
-	vpclmulqdq:         "VPCLMULQDQ",         // Carry-Less Multiplication Quadword
-	avx512bf16:         "AVX512BF16",         // AVX-512 BFLOAT16 Instruction
-	avx512vp2intersect: "AVX512VP2INTERSECT", // AVX-512 Intersect for D/Q
-	mpx:                "MPX",                // Intel MPX (Memory Protection Extensions)
-	erms:               "ERMS",               // Enhanced REP MOVSB/STOSB
-	rdtscp:             "RDTSCP",             // RDTSCP Instruction
-	cx16:               "CX16",               // CMPXCHG16B Instruction
-	sgx:                "SGX",                // Software Guard Extensions
-	sgxlc:              "SGXLC",              // Software Guard Extensions Launch Control
-	ibpb:               "IBPB",               // Indirect Branch Restricted Speculation and Indirect Branch Predictor Barrier
-	stibp:              "STIBP",              // Single Thread Indirect Branch Predictors
-	vmx:                "VMX",                // Virtual Machine Extensions
-
-	// Performance indicators
-	sse2slow: "SSE2SLOW", // SSE2 supported, but usually not faster
-	sse3slow: "SSE3SLOW", // SSE3 supported, but usually not faster
-	atom:     "ATOM",     // Atom processor, some SSSE3 instructions are slower
-
-}
-
-// CPUInfo contains information about the detected system CPU.
-type cpuInfo struct {
-	brandname      string // Brand name reported by the CPU
-	vendorid       vendor // Comparable CPU vendor ID
-	features       flags  // Features of the CPU
-	physicalcores  int    // Number of physical processor cores in your CPU. Will be 0 if undetectable.
-	threadspercore int    // Number of threads per physical core. Will be 1 if undetectable.
-	logicalcores   int    // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
-	family         int    // CPU family number
-	model          int    // CPU model number
-	cacheline      int    // Cache line size in bytes. Will be 0 if undetectable.
-	cache          struct {
-		l1i int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
-		l1d int // L1 Data Cache (per core or shared). Will be -1 if undetected
-		l2  int // L2 Cache (per core or shared). Will be -1 if undetected
-		l3  int // L3 Instruction Cache (per core or shared). Will be -1 if undetected
-	}
-	sgx       sgxsupport
-	maxFunc   uint32
-	maxExFunc uint32
-}
-
-var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
-var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-var xgetbv func(index uint32) (eax, edx uint32)
-var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
-
-// CPU contains information about the CPU as detected on startup,
-// or when Detect last was called.
-//
-// Use this as the primary entry point to you data,
-// this way queries are
-var cpu cpuInfo
-
-func init() {
-	initCPU()
-	detect()
-}
-
-// Detect will re-detect current CPU info.
-// This will replace the content of the exported CPU variable.
-//
-// Unless you expect the CPU to change while you are running your program
-// you should not need to call this function.
-// If you call this, you must ensure that no other goroutine is accessing the
-// exported CPU variable.
-func detect() {
-	cpu.maxFunc = maxFunctionID()
-	cpu.maxExFunc = maxExtendedFunction()
-	cpu.brandname = brandName()
-	cpu.cacheline = cacheLine()
-	cpu.family, cpu.model = familyModel()
-	cpu.features = support()
-	cpu.sgx = hasSGX(cpu.features&sgx != 0, cpu.features&sgxlc != 0)
-	cpu.threadspercore = threadsPerCore()
-	cpu.logicalcores = logicalCores()
-	cpu.physicalcores = physicalCores()
-	cpu.vendorid = vendorID()
-	cpu.cacheSize()
-}
-
-// Generated here: http://play.golang.org/p/BxFH2Gdc0G
-
-// Cmov indicates support of CMOV instructions
-func (c cpuInfo) cmov() bool {
-	return c.features&cmov != 0
-}
-
-// Amd3dnow indicates support of AMD 3DNOW! instructions
-func (c cpuInfo) amd3dnow() bool {
-	return c.features&amd3dnow != 0
-}
-
-// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions
-func (c cpuInfo) amd3dnowext() bool {
-	return c.features&amd3dnowext != 0
-}
-
-// VMX indicates support of VMX
-func (c cpuInfo) vmx() bool {
-	return c.features&vmx != 0
-}
-
-// MMX indicates support of MMX instructions
-func (c cpuInfo) mmx() bool {
-	return c.features&mmx != 0
-}
-
-// MMXExt indicates support of MMXEXT instructions
-// (SSE integer functions or AMD MMX ext)
-func (c cpuInfo) mmxext() bool {
-	return c.features&mmxext != 0
-}
-
-// SSE indicates support of SSE instructions
-func (c cpuInfo) sse() bool {
-	return c.features&sse != 0
-}
-
-// SSE2 indicates support of SSE 2 instructions
-func (c cpuInfo) sse2() bool {
-	return c.features&sse2 != 0
-}
-
-// SSE3 indicates support of SSE 3 instructions
-func (c cpuInfo) sse3() bool {
-	return c.features&sse3 != 0
-}
-
-// SSSE3 indicates support of SSSE 3 instructions
-func (c cpuInfo) ssse3() bool {
-	return c.features&ssse3 != 0
-}
-
-// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions
-func (c cpuInfo) sse4() bool {
-	return c.features&sse4 != 0
-}
-
-// SSE42 indicates support of SSE4.2 instructions
-func (c cpuInfo) sse42() bool {
-	return c.features&sse42 != 0
-}
-
-// AVX indicates support of AVX instructions
-// and operating system support of AVX instructions
-func (c cpuInfo) avx() bool {
-	return c.features&avx != 0
-}
-
-// AVX2 indicates support of AVX2 instructions
-func (c cpuInfo) avx2() bool {
-	return c.features&avx2 != 0
-}
-
-// FMA3 indicates support of FMA3 instructions
-func (c cpuInfo) fma3() bool {
-	return c.features&fma3 != 0
-}
-
-// FMA4 indicates support of FMA4 instructions
-func (c cpuInfo) fma4() bool {
-	return c.features&fma4 != 0
-}
-
-// XOP indicates support of XOP instructions
-func (c cpuInfo) xop() bool {
-	return c.features&xop != 0
-}
-
-// F16C indicates support of F16C instructions
-func (c cpuInfo) f16c() bool {
-	return c.features&f16c != 0
-}
-
-// BMI1 indicates support of BMI1 instructions
-func (c cpuInfo) bmi1() bool {
-	return c.features&bmi1 != 0
-}
-
-// BMI2 indicates support of BMI2 instructions
-func (c cpuInfo) bmi2() bool {
-	return c.features&bmi2 != 0
-}
-
-// TBM indicates support of TBM instructions
-// (AMD Trailing Bit Manipulation)
-func (c cpuInfo) tbm() bool {
-	return c.features&tbm != 0
-}
-
-// Lzcnt indicates support of LZCNT instruction
-func (c cpuInfo) lzcnt() bool {
-	return c.features&lzcnt != 0
-}
-
-// Popcnt indicates support of POPCNT instruction
-func (c cpuInfo) popcnt() bool {
-	return c.features&popcnt != 0
-}
-
-// HTT indicates the processor has Hyperthreading enabled
-func (c cpuInfo) htt() bool {
-	return c.features&htt != 0
-}
-
-// SSE2Slow indicates that SSE2 may be slow on this processor
-func (c cpuInfo) sse2slow() bool {
-	return c.features&sse2slow != 0
-}
-
-// SSE3Slow indicates that SSE3 may be slow on this processor
-func (c cpuInfo) sse3slow() bool {
-	return c.features&sse3slow != 0
-}
-
-// AesNi indicates support of AES-NI instructions
-// (Advanced Encryption Standard New Instructions)
-func (c cpuInfo) aesni() bool {
-	return c.features&aesni != 0
-}
-
-// Clmul indicates support of CLMUL instructions
-// (Carry-less Multiplication)
-func (c cpuInfo) clmul() bool {
-	return c.features&clmul != 0
-}
-
-// NX indicates support of NX (No-Execute) bit
-func (c cpuInfo) nx() bool {
-	return c.features&nx != 0
-}
-
-// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions
-func (c cpuInfo) sse4a() bool {
-	return c.features&sse4a != 0
-}
-
-// HLE indicates support of Hardware Lock Elision
-func (c cpuInfo) hle() bool {
-	return c.features&hle != 0
-}
-
-// RTM indicates support of Restricted Transactional Memory
-func (c cpuInfo) rtm() bool {
-	return c.features&rtm != 0
-}
-
-// Rdrand indicates support of RDRAND instruction is available
-func (c cpuInfo) rdrand() bool {
-	return c.features&rdrand != 0
-}
-
-// Rdseed indicates support of RDSEED instruction is available
-func (c cpuInfo) rdseed() bool {
-	return c.features&rdseed != 0
-}
-
-// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-func (c cpuInfo) adx() bool {
-	return c.features&adx != 0
-}
-
-// SHA indicates support of Intel SHA Extensions
-func (c cpuInfo) sha() bool {
-	return c.features&sha != 0
-}
-
-// AVX512F indicates support of AVX-512 Foundation
-func (c cpuInfo) avx512f() bool {
-	return c.features&avx512f != 0
-}
-
-// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions
-func (c cpuInfo) avx512dq() bool {
-	return c.features&avx512dq != 0
-}
-
-// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions
-func (c cpuInfo) avx512ifma() bool {
-	return c.features&avx512ifma != 0
-}
-
-// AVX512PF indicates support of AVX-512 Prefetch Instructions
-func (c cpuInfo) avx512pf() bool {
-	return c.features&avx512pf != 0
-}
-
-// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions
-func (c cpuInfo) avx512er() bool {
-	return c.features&avx512er != 0
-}
-
-// AVX512CD indicates support of AVX-512 Conflict Detection Instructions
-func (c cpuInfo) avx512cd() bool {
-	return c.features&avx512cd != 0
-}
-
-// AVX512BW indicates support of AVX-512 Byte and Word Instructions
-func (c cpuInfo) avx512bw() bool {
-	return c.features&avx512bw != 0
-}
-
-// AVX512VL indicates support of AVX-512 Vector Length Extensions
-func (c cpuInfo) avx512vl() bool {
-	return c.features&avx512vl != 0
-}
-
-// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions
-func (c cpuInfo) avx512vbmi() bool {
-	return c.features&avx512vbmi != 0
-}
-
-// AVX512VBMI2 indicates support of AVX-512 Vector Bit Manipulation Instructions, Version 2
-func (c cpuInfo) avx512vbmi2() bool {
-	return c.features&avx512vbmi2 != 0
-}
-
-// AVX512VNNI indicates support of AVX-512 Vector Neural Network Instructions
-func (c cpuInfo) avx512vnni() bool {
-	return c.features&avx512vnni != 0
-}
-
-// AVX512VPOPCNTDQ indicates support of AVX-512 Vector Population Count Doubleword and Quadword
-func (c cpuInfo) avx512vpopcntdq() bool {
-	return c.features&avx512vpopcntdq != 0
-}
-
-// GFNI indicates support of Galois Field New Instructions
-func (c cpuInfo) gfni() bool {
-	return c.features&gfni != 0
-}
-
-// VAES indicates support of Vector AES
-func (c cpuInfo) vaes() bool {
-	return c.features&vaes != 0
-}
-
-// AVX512BITALG indicates support of AVX-512 Bit Algorithms
-func (c cpuInfo) avx512bitalg() bool {
-	return c.features&avx512bitalg != 0
-}
-
-// VPCLMULQDQ indicates support of Carry-Less Multiplication Quadword
-func (c cpuInfo) vpclmulqdq() bool {
-	return c.features&vpclmulqdq != 0
-}
-
-// AVX512BF16 indicates support of
-func (c cpuInfo) avx512bf16() bool {
-	return c.features&avx512bf16 != 0
-}
-
-// AVX512VP2INTERSECT indicates support of
-func (c cpuInfo) avx512vp2intersect() bool {
-	return c.features&avx512vp2intersect != 0
-}
-
-// MPX indicates support of Intel MPX (Memory Protection Extensions)
-func (c cpuInfo) mpx() bool {
-	return c.features&mpx != 0
-}
-
-// ERMS indicates support of Enhanced REP MOVSB/STOSB
-func (c cpuInfo) erms() bool {
-	return c.features&erms != 0
-}
-
-// RDTSCP Instruction is available.
-func (c cpuInfo) rdtscp() bool {
-	return c.features&rdtscp != 0
-}
-
-// CX16 indicates if CMPXCHG16B instruction is available.
-func (c cpuInfo) cx16() bool {
-	return c.features&cx16 != 0
-}
-
-// TSX is split into HLE (Hardware Lock Elision) and RTM (Restricted Transactional Memory) detection.
-// So TSX simply checks that.
-func (c cpuInfo) tsx() bool {
-	return c.features&(hle|rtm) == hle|rtm
-}
-
-// Atom indicates an Atom processor
-func (c cpuInfo) atom() bool {
-	return c.features&atom != 0
-}
-
-// Intel returns true if vendor is recognized as Intel
-func (c cpuInfo) intel() bool {
-	return c.vendorid == intel
-}
-
-// AMD returns true if vendor is recognized as AMD
-func (c cpuInfo) amd() bool {
-	return c.vendorid == amd
-}
-
-// Hygon returns true if vendor is recognized as Hygon
-func (c cpuInfo) hygon() bool {
-	return c.vendorid == hygon
-}
-
-// Transmeta returns true if vendor is recognized as Transmeta
-func (c cpuInfo) transmeta() bool {
-	return c.vendorid == transmeta
-}
-
-// NSC returns true if vendor is recognized as National Semiconductor
-func (c cpuInfo) nsc() bool {
-	return c.vendorid == nsc
-}
-
-// VIA returns true if vendor is recognized as VIA
-func (c cpuInfo) via() bool {
-	return c.vendorid == via
-}
-
-// RTCounter returns the 64-bit time-stamp counter
-// Uses the RDTSCP instruction. The value 0 is returned
-// if the CPU does not support the instruction.
-func (c cpuInfo) rtcounter() uint64 {
-	if !c.rdtscp() {
-		return 0
-	}
-	a, _, _, d := rdtscpAsm()
-	return uint64(a) | (uint64(d) << 32)
-}
-
-// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP.
-// This variable is OS dependent, but on Linux contains information
-// about the current cpu/core the code is running on.
-// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned.
-func (c cpuInfo) ia32tscaux() uint32 {
-	if !c.rdtscp() {
-		return 0
-	}
-	_, _, ecx, _ := rdtscpAsm()
-	return ecx
-}
-
-// LogicalCPU will return the Logical CPU the code is currently executing on.
-// This is likely to change when the OS re-schedules the running thread
-// to another CPU.
-// If the current core cannot be detected, -1 will be returned.
-func (c cpuInfo) logicalcpu() int {
-	if c.maxFunc < 1 {
-		return -1
-	}
-	_, ebx, _, _ := cpuid(1)
-	return int(ebx >> 24)
-}
-
-// VM Will return true if the cpu id indicates we are in
-// a virtual machine. This is only a hint, and will very likely
-// have many false negatives.
-func (c cpuInfo) vm() bool {
-	switch c.vendorid {
-	case msvm, kvm, vmware, xenhvm, bhyve:
-		return true
-	}
-	return false
-}
-
-// Flags contains detected cpu features and caracteristics
-type flags uint64
-
-// String returns a string representation of the detected
-// CPU features.
-func (f flags) String() string {
-	return strings.Join(f.strings(), ",")
-}
-
-// Strings returns and array of the detected features.
-func (f flags) strings() []string {
-	s := support()
-	r := make([]string, 0, 20)
-	for i := uint(0); i < 64; i++ {
-		key := flags(1 << i)
-		val := flagNames[key]
-		if s&key != 0 {
-			r = append(r, val)
-		}
-	}
-	return r
-}
-
-func maxExtendedFunction() uint32 {
-	eax, _, _, _ := cpuid(0x80000000)
-	return eax
-}
-
-func maxFunctionID() uint32 {
-	a, _, _, _ := cpuid(0)
-	return a
-}
-
-func brandName() string {
-	if maxExtendedFunction() >= 0x80000004 {
-		v := make([]uint32, 0, 48)
-		for i := uint32(0); i < 3; i++ {
-			a, b, c, d := cpuid(0x80000002 + i)
-			v = append(v, a, b, c, d)
-		}
-		return strings.Trim(string(valAsString(v...)), " ")
-	}
-	return "unknown"
-}
-
-func threadsPerCore() int {
-	mfi := maxFunctionID()
-	if mfi < 0x4 || vendorID() != intel {
-		return 1
-	}
-
-	if mfi < 0xb {
-		_, b, _, d := cpuid(1)
-		if (d & (1 << 28)) != 0 {
-			// v will contain logical core count
-			v := (b >> 16) & 255
-			if v > 1 {
-				a4, _, _, _ := cpuid(4)
-				// physical cores
-				v2 := (a4 >> 26) + 1
-				if v2 > 0 {
-					return int(v) / int(v2)
-				}
-			}
-		}
-		return 1
-	}
-	_, b, _, _ := cpuidex(0xb, 0)
-	if b&0xffff == 0 {
-		return 1
-	}
-	return int(b & 0xffff)
-}
-
-func logicalCores() int {
-	mfi := maxFunctionID()
-	switch vendorID() {
-	case intel:
-		// Use this on old Intel processors
-		if mfi < 0xb {
-			if mfi < 1 {
-				return 0
-			}
-			// CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
-			// that can be assigned to logical processors in a physical package.
-			// The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
-			_, ebx, _, _ := cpuid(1)
-			logical := (ebx >> 16) & 0xff
-			return int(logical)
-		}
-		_, b, _, _ := cpuidex(0xb, 1)
-		return int(b & 0xffff)
-	case amd, hygon:
-		_, b, _, _ := cpuid(1)
-		return int((b >> 16) & 0xff)
-	default:
-		return 0
-	}
-}
-
-func familyModel() (int, int) {
-	if maxFunctionID() < 0x1 {
-		return 0, 0
-	}
-	eax, _, _, _ := cpuid(1)
-	family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff)
-	model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)
-	return int(family), int(model)
-}
-
-func physicalCores() int {
-	switch vendorID() {
-	case intel:
-		return logicalCores() / threadsPerCore()
-	case amd, hygon:
-		if maxExtendedFunction() >= 0x80000008 {
-			_, _, c, _ := cpuid(0x80000008)
-			return int(c&0xff) + 1
-		}
-	}
-	return 0
-}
-
-// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
-var vendorMapping = map[string]vendor{
-	"AMDisbetter!": amd,
-	"AuthenticAMD": amd,
-	"CentaurHauls": via,
-	"GenuineIntel": intel,
-	"TransmetaCPU": transmeta,
-	"GenuineTMx86": transmeta,
-	"Geode by NSC": nsc,
-	"VIA VIA VIA ": via,
-	"KVMKVMKVMKVM": kvm,
-	"Microsoft Hv": msvm,
-	"VMwareVMware": vmware,
-	"XenVMMXenVMM": xenhvm,
-	"bhyve bhyve ": bhyve,
-	"HygonGenuine": hygon,
-}
-
-func vendorID() vendor {
-	_, b, c, d := cpuid(0)
-	v := valAsString(b, d, c)
-	vend, ok := vendorMapping[string(v)]
-	if !ok {
-		return other
-	}
-	return vend
-}
-
-func cacheLine() int {
-	if maxFunctionID() < 0x1 {
-		return 0
-	}
-
-	_, ebx, _, _ := cpuid(1)
-	cache := (ebx & 0xff00) >> 5 // cflush size
-	if cache == 0 && maxExtendedFunction() >= 0x80000006 {
-		_, _, ecx, _ := cpuid(0x80000006)
-		cache = ecx & 0xff // cacheline size
-	}
-	// TODO: Read from Cache and TLB Information
-	return int(cache)
-}
-
-func (c *cpuInfo) cacheSize() {
-	c.cache.l1d = -1
-	c.cache.l1i = -1
-	c.cache.l2 = -1
-	c.cache.l3 = -1
-	vendor := vendorID()
-	switch vendor {
-	case intel:
-		if maxFunctionID() < 4 {
-			return
-		}
-		for i := uint32(0); ; i++ {
-			eax, ebx, ecx, _ := cpuidex(4, i)
-			cacheType := eax & 15
-			if cacheType == 0 {
-				break
-			}
-			cacheLevel := (eax >> 5) & 7
-			coherency := int(ebx&0xfff) + 1
-			partitions := int((ebx>>12)&0x3ff) + 1
-			associativity := int((ebx>>22)&0x3ff) + 1
-			sets := int(ecx) + 1
-			size := associativity * partitions * coherency * sets
-			switch cacheLevel {
-			case 1:
-				if cacheType == 1 {
-					// 1 = Data Cache
-					c.cache.l1d = size
-				} else if cacheType == 2 {
-					// 2 = Instruction Cache
-					c.cache.l1i = size
-				} else {
-					if c.cache.l1d < 0 {
-						c.cache.l1i = size
-					}
-					if c.cache.l1i < 0 {
-						c.cache.l1i = size
-					}
-				}
-			case 2:
-				c.cache.l2 = size
-			case 3:
-				c.cache.l3 = size
-			}
-		}
-	case amd, hygon:
-		// Untested.
-		if maxExtendedFunction() < 0x80000005 {
-			return
-		}
-		_, _, ecx, edx := cpuid(0x80000005)
-		c.cache.l1d = int(((ecx >> 24) & 0xFF) * 1024)
-		c.cache.l1i = int(((edx >> 24) & 0xFF) * 1024)
-
-		if maxExtendedFunction() < 0x80000006 {
-			return
-		}
-		_, _, ecx, _ = cpuid(0x80000006)
-		c.cache.l2 = int(((ecx >> 16) & 0xFFFF) * 1024)
-	}
-
-	return
-}
-
-type sgxepcsection struct {
-	baseaddress uint64
-	epcsize     uint64
-}
-
-type sgxsupport struct {
-	available           bool
-	launchcontrol       bool
-	sgx1supported       bool
-	sgx2supported       bool
-	maxenclavesizenot64 int64
-	maxenclavesize64    int64
-	epcsections         []sgxepcsection
-}
-
-func hasSGX(available, lc bool) (rval sgxsupport) {
-	rval.available = available
-
-	if !available {
-		return
-	}
-
-	rval.launchcontrol = lc
-
-	a, _, _, d := cpuidex(0x12, 0)
-	rval.sgx1supported = a&0x01 != 0
-	rval.sgx2supported = a&0x02 != 0
-	rval.maxenclavesizenot64 = 1 << (d & 0xFF)     // pow 2
-	rval.maxenclavesize64 = 1 << ((d >> 8) & 0xFF) // pow 2
-	rval.epcsections = make([]sgxepcsection, 0)
-
-	for subleaf := uint32(2); subleaf < 2+8; subleaf++ {
-		eax, ebx, ecx, edx := cpuidex(0x12, subleaf)
-		leafType := eax & 0xf
-
-		if leafType == 0 {
-			// Invalid subleaf, stop iterating
-			break
-		} else if leafType == 1 {
-			// EPC Section subleaf
-			baseAddress := uint64(eax&0xfffff000) + (uint64(ebx&0x000fffff) << 32)
-			size := uint64(ecx&0xfffff000) + (uint64(edx&0x000fffff) << 32)
-
-			section := sgxepcsection{baseaddress: baseAddress, epcsize: size}
-			rval.epcsections = append(rval.epcsections, section)
-		}
-	}
-
-	return
-}
-
-func support() flags {
-	mfi := maxFunctionID()
-	vend := vendorID()
-	if mfi < 0x1 {
-		return 0
-	}
-	rval := uint64(0)
-	_, _, c, d := cpuid(1)
-	if (d & (1 << 15)) != 0 {
-		rval |= cmov
-	}
-	if (d & (1 << 23)) != 0 {
-		rval |= mmx
-	}
-	if (d & (1 << 25)) != 0 {
-		rval |= mmxext
-	}
-	if (d & (1 << 25)) != 0 {
-		rval |= sse
-	}
-	if (d & (1 << 26)) != 0 {
-		rval |= sse2
-	}
-	if (c & 1) != 0 {
-		rval |= sse3
-	}
-	if (c & (1 << 5)) != 0 {
-		rval |= vmx
-	}
-	if (c & 0x00000200) != 0 {
-		rval |= ssse3
-	}
-	if (c & 0x00080000) != 0 {
-		rval |= sse4
-	}
-	if (c & 0x00100000) != 0 {
-		rval |= sse42
-	}
-	if (c & (1 << 25)) != 0 {
-		rval |= aesni
-	}
-	if (c & (1 << 1)) != 0 {
-		rval |= clmul
-	}
-	if c&(1<<23) != 0 {
-		rval |= popcnt
-	}
-	if c&(1<<30) != 0 {
-		rval |= rdrand
-	}
-	if c&(1<<29) != 0 {
-		rval |= f16c
-	}
-	if c&(1<<13) != 0 {
-		rval |= cx16
-	}
-	if vend == intel && (d&(1<<28)) != 0 && mfi >= 4 {
-		if threadsPerCore() > 1 {
-			rval |= htt
-		}
-	}
-
-	// Check XGETBV, OXSAVE and AVX bits
-	if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
-		// Check for OS support
-		eax, _ := xgetbv(0)
-		if (eax & 0x6) == 0x6 {
-			rval |= avx
-			if (c & 0x00001000) != 0 {
-				rval |= fma3
-			}
-		}
-	}
-
-	// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
-	if mfi >= 7 {
-		_, ebx, ecx, edx := cpuidex(7, 0)
-		eax1, _, _, _ := cpuidex(7, 1)
-		if (rval&avx) != 0 && (ebx&0x00000020) != 0 {
-			rval |= avx2
-		}
-		if (ebx & 0x00000008) != 0 {
-			rval |= bmi1
-			if (ebx & 0x00000100) != 0 {
-				rval |= bmi2
-			}
-		}
-		if ebx&(1<<2) != 0 {
-			rval |= sgx
-		}
-		if ebx&(1<<4) != 0 {
-			rval |= hle
-		}
-		if ebx&(1<<9) != 0 {
-			rval |= erms
-		}
-		if ebx&(1<<11) != 0 {
-			rval |= rtm
-		}
-		if ebx&(1<<14) != 0 {
-			rval |= mpx
-		}
-		if ebx&(1<<18) != 0 {
-			rval |= rdseed
-		}
-		if ebx&(1<<19) != 0 {
-			rval |= adx
-		}
-		if ebx&(1<<29) != 0 {
-			rval |= sha
-		}
-		if edx&(1<<26) != 0 {
-			rval |= ibpb
-		}
-		if ecx&(1<<30) != 0 {
-			rval |= sgxlc
-		}
-		if edx&(1<<27) != 0 {
-			rval |= stibp
-		}
-
-		// Only detect AVX-512 features if XGETBV is supported
-		if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
-			// Check for OS support
-			eax, _ := xgetbv(0)
-
-			// Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-			// ZMM16-ZMM31 state are enabled by OS)
-			/// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
-			if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
-				if ebx&(1<<16) != 0 {
-					rval |= avx512f
-				}
-				if ebx&(1<<17) != 0 {
-					rval |= avx512dq
-				}
-				if ebx&(1<<21) != 0 {
-					rval |= avx512ifma
-				}
-				if ebx&(1<<26) != 0 {
-					rval |= avx512pf
-				}
-				if ebx&(1<<27) != 0 {
-					rval |= avx512er
-				}
-				if ebx&(1<<28) != 0 {
-					rval |= avx512cd
-				}
-				if ebx&(1<<30) != 0 {
-					rval |= avx512bw
-				}
-				if ebx&(1<<31) != 0 {
-					rval |= avx512vl
-				}
-				// ecx
-				if ecx&(1<<1) != 0 {
-					rval |= avx512vbmi
-				}
-				if ecx&(1<<6) != 0 {
-					rval |= avx512vbmi2
-				}
-				if ecx&(1<<8) != 0 {
-					rval |= gfni
-				}
-				if ecx&(1<<9) != 0 {
-					rval |= vaes
-				}
-				if ecx&(1<<10) != 0 {
-					rval |= vpclmulqdq
-				}
-				if ecx&(1<<11) != 0 {
-					rval |= avx512vnni
-				}
-				if ecx&(1<<12) != 0 {
-					rval |= avx512bitalg
-				}
-				if ecx&(1<<14) != 0 {
-					rval |= avx512vpopcntdq
-				}
-				// edx
-				if edx&(1<<8) != 0 {
-					rval |= avx512vp2intersect
-				}
-				// cpuid eax 07h,ecx=1
-				if eax1&(1<<5) != 0 {
-					rval |= avx512bf16
-				}
-			}
-		}
-	}
-
-	if maxExtendedFunction() >= 0x80000001 {
-		_, _, c, d := cpuid(0x80000001)
-		if (c & (1 << 5)) != 0 {
-			rval |= lzcnt
-			rval |= popcnt
-		}
-		if (d & (1 << 31)) != 0 {
-			rval |= amd3dnow
-		}
-		if (d & (1 << 30)) != 0 {
-			rval |= amd3dnowext
-		}
-		if (d & (1 << 23)) != 0 {
-			rval |= mmx
-		}
-		if (d & (1 << 22)) != 0 {
-			rval |= mmxext
-		}
-		if (c & (1 << 6)) != 0 {
-			rval |= sse4a
-		}
-		if d&(1<<20) != 0 {
-			rval |= nx
-		}
-		if d&(1<<27) != 0 {
-			rval |= rdtscp
-		}
-
-		/* Allow for selectively disabling SSE2 functions on AMD processors
-		   with SSE2 support but not SSE4a. This includes Athlon64, some
-		   Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
-		   than SSE2 often enough to utilize this special-case flag.
-		   AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
-		   so that SSE2 is used unless explicitly disabled by checking
-		   AV_CPU_FLAG_SSE2SLOW. */
-		if vendorID() != intel &&
-			rval&sse2 != 0 && (c&0x00000040) == 0 {
-			rval |= sse2slow
-		}
-
-		/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-		 * used unless the OS has AVX support. */
-		if (rval & avx) != 0 {
-			if (c & 0x00000800) != 0 {
-				rval |= xop
-			}
-			if (c & 0x00010000) != 0 {
-				rval |= fma4
-			}
-		}
-
-		if vendorID() == intel {
-			family, model := familyModel()
-			if family == 6 && (model == 9 || model == 13 || model == 14) {
-				/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
-				 * 6/14 (core1 "yonah") theoretically support sse2, but it's
-				 * usually slower than mmx. */
-				if (rval & sse2) != 0 {
-					rval |= sse2slow
-				}
-				if (rval & sse3) != 0 {
-					rval |= sse3slow
-				}
-			}
-			/* The Atom processor has SSSE3 support, which is useful in many cases,
-			 * but sometimes the SSSE3 version is slower than the SSE2 equivalent
-			 * on the Atom, but is generally faster on other processors supporting
-			 * SSSE3. This flag allows for selectively disabling certain SSSE3
-			 * functions on the Atom. */
-			if family == 6 && model == 28 {
-				rval |= atom
-			}
-		}
-	}
-	return flags(rval)
-}
-
-func valAsString(values ...uint32) []byte {
-	r := make([]byte, 4*len(values))
-	for i, v := range values {
-		dst := r[i*4:]
-		dst[0] = byte(v & 0xff)
-		dst[1] = byte((v >> 8) & 0xff)
-		dst[2] = byte((v >> 16) & 0xff)
-		dst[3] = byte((v >> 24) & 0xff)
-		switch {
-		case dst[0] == 0:
-			return r[:i*4]
-		case dst[1] == 0:
-			return r[:i*4+1]
-		case dst[2] == 0:
-			return r[:i*4+2]
-		case dst[3] == 0:
-			return r[:i*4+3]
-		}
-	}
-	return r
-}
diff --git a/s2/cpuid_amd64.s b/s2/cpuid_amd64.s
deleted file mode 100644
index 4cc19a4d9f..0000000000
--- a/s2/cpuid_amd64.s
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuid(SB), 7, $0
-	XORQ CX, CX
-	MOVL op+0(FP), AX
-	CPUID
-	MOVL AX, eax+8(FP)
-	MOVL BX, ebx+12(FP)
-	MOVL CX, ecx+16(FP)
-	MOVL DX, edx+20(FP)
-	RET
-
-// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuidex(SB), 7, $0
-	MOVL op+0(FP), AX
-	MOVL op2+4(FP), CX
-	CPUID
-	MOVL AX, eax+8(FP)
-	MOVL BX, ebx+12(FP)
-	MOVL CX, ecx+16(FP)
-	MOVL DX, edx+20(FP)
-	RET
-
-// func asmXgetbv(index uint32) (eax, edx uint32)
-TEXT ·asmXgetbv(SB), 7, $0
-	MOVL index+0(FP), CX
-	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
-	MOVL AX, eax+8(FP)
-	MOVL DX, edx+12(FP)
-	RET
-
-// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
-TEXT ·asmRdtscpAsm(SB), 7, $0
-	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
-	MOVL AX, eax+0(FP)
-	MOVL BX, ebx+4(FP)
-	MOVL CX, ecx+8(FP)
-	MOVL DX, edx+12(FP)
-	RET