/** 
 Identify the characteristics of the host CPU, providing information
 about cache sizes and assembly optimisation hints.
 
 Some of this information was extremely difficult to track down. Some of the
 documents below were found only in cached versions stored by search engines!
  This code relies on information found in:
    
  - "Intel(R) 64 and IA-32 Architectures Software Developers Manual,
      Volume 2A: Instruction Set Reference, A-M" (2007).
  - "AMD CPUID Specification", Advanced Micro Devices, Rev 2.28 (2008).
  - "AMD Processor Recognition Application Note For Processors Prior to AMD Family 0Fh Processors", Advanced Micro Devices, Rev 3.13 (2005).
  - "AMD Geode(TM) GX Processors Data Book", AMD, Publication ID 31505E, (2005).
  - "AMD K6 Processor Code Optimisation", Advanced Micro Devices, Rev D (2000).
  - "Application note 106: Software Customization for the 6x86 Family", Cyrix Corporation, Rev 1.5 (1998)
  - http://ftp.intron.ac/pub/document/cpu/cpuid.htm
  - "Geode(TM) GX1 Processor Series Low Power Integrated X86 Solution", National Semiconductor, (2002)
  - "The VIA Isaiah Architecture", G. Glenn Henry, Centaur Technology, Inc (2008).
  - http://www.sandpile.org/ia32/cpuid.htm
  - http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html
  - "What every programmer should know about memory", Ulrich Drepper, Red Hat, Inc. 
     (2007). 
   
AUTHORS:  Don Clugston,
          Tomas Lindquist Olsen &lt;tomas@famolsen.dk&gt;
          Fawzi Mohamed
COPYRIGHT:  Public Domain

BUGS:   Currently only works on x86 CPUs.
        Many processors have bugs in their microcode for the CPUID instruction,
        so sometimes the cache information may be incorrect.
*/

module tango.core.Cpuid;

// Decide whether the inline-assembly CPUID code further down may be compiled.
// Really_D_InlineAsm_X86 is set only when the compiler defines
// D_InlineAsm_X86 and does not define GNU.
version(GNU){
    // GDC is a filthy liar. It can't actually do inline asm.
} else version(D_InlineAsm_X86) {
    version = Really_D_InlineAsm_X86;
}

// Collapse the compiler's architecture identifiers into one identifier per
// CPU family: X86 and X86_64 both set X86_CPU, PPC64 and PPC both set PPC_CPU.
// ARM and SPARC are accepted without defining an extra identifier; any other
// target fails compilation through the static assert.
version(X86) {
    version=X86_CPU;
} else version(X86_64) {
    version=X86_CPU;
} else version ( PPC64 )
{
    version=PPC_CPU;
} else version ( PPC ) {
    version=PPC_CPU;
} else version(ARM){
} else version(SPARC){
} else {
    static assert(0,"unknown cpu family");
}

/// Size and behaviour of a single level of data cache.
struct CacheInfo
{
    /// Cache size in kilobytes, per CPU.
    /// A unified (data + code) L1 cache is reported at half its physical
    /// size; larger caches are not halved, because in critical loops the
    /// data footprint normally dwarfs the code footprint.
    size_t size;

    /// Ways of associativity, for example:
    /// 1 = direct mapped
    /// 2 = 2-way set associative
    /// 3 = 3-way set associative
    /// ubyte.max = fully associative
    ubyte associativity;

    /// Bytes pulled into the cache on a cache miss.
    uint lineSize;

    /// Number of threads sharing this cache, 0 = unknown.
    uint nThreadSharing;

    /// True when the numbers above cannot really be trusted.
    bool wildGuess;

    /// Puts every field back to its "nothing known" value: zero size, line
    /// size and sharing count, direct mapped, flagged as a wild guess.
    void clear()
    {
        wildGuess = true;
        nThreadSharing = 0;
        lineSize = 0;
        associativity = 1;
        size = 0;
    }
}

/// the main type of cpu
///
/// Created once by the module constructor below; handed out by currentCpu().
static CpuInfo mainCpu;
// Module constructor: instantiates the CpuInfo subclass matching the target
// architecture. Only the x86 branch goes on to call getCpuData(); the other
// branches keep whatever state the subclass constructor leaves behind.
// NOTE(review): CpuInfoArm is not defined in the part of the file visible
// here -- confirm it exists, otherwise version(ARM) builds cannot compile.
static this(){
    version(X86_CPU){
        mainCpu=new CpuInfoX86();
        mainCpu.getCpuData();
    } else version(PPC_CPU){
        mainCpu=new CpuInfoPpc();
    } else version(ARM){
        mainCpu=new CpuInfoArm();
    } else version(SPARC){
        mainCpu=new CpuInfoSparc();
    }
}

/// Returns the CpuInfo object describing the CPU this program runs on
/// (the module-level mainCpu instance).
CpuInfo currentCpu()
{
    return mainCpu;
}
/// Tells whether the system has only one kind of cpu.
/// At the moment this is hardcoded to true.
bool uniqueCpuType()
{
    return true;
}

/// information on a cpu
///
/// Architecture-independent base class: it stores display names and the
/// data-cache hierarchy, and supplies single-core/single-thread defaults
/// that architecture-specific subclasses override.
///
/// if you think x86,sparc,arm,ppc,... should be always defined, but throw or return null
/// post a ticket explaining why
class CpuInfo{
protected:
    /// The data caches; datacache[0] is the first level. If there are fewer
    /// than 5 physical caches levels, the remaining levels are set to
    /// size_t.max/1024 (== entire memory space) by cacheFixup()
    /// make this a function?
    CacheInfo[5] datacache;
    /// cache levels (number of leading datacache entries backed by a real cache)
    uint numCacheLevels;
    /// vendor name (only for display purposes)
    char [] vendorName;
    /// name of the processor (only for display purposes)
    char [] processorName;
    /// tries to get valid data from the current cpu, return false if it fails
    ///
    /// The base implementation cannot probe anything: it resets the object,
    /// fills in guessed cache values and reports failure.
    bool getCpuData(){
        clear();
        cacheFixup();
        return false;
    }
public:
    /// Creates an object holding the cleared (unknown) state.
    /// clear() is virtual, so a subclass override runs here.
    this(){
        this.clear();
    }
    /// Returns vendor string, for display purposes only.
    /// Do NOT use this to determine features!
    /// Note that some CPUs have programmable vendorIDs.
    char[] vendor() { return vendorName; }
    /// Returns processor string, for display purposes only
    char[] processor() { return processorName; }
    /// Is hyperthreading supported?
    /// True when there are more threads than cores per CPU.
    bool hyperThreading() {
        return threadsPerCPU()>coresPerCPU();
    }
    /// Returns number of threads per CPU (1 unless overridden)
    uint threadsPerCPU(){
        return 1;
    }
    /// Returns number of cores in CPU (1 unless overridden)
    uint coresPerCPU(){
        return 1;
    }
    /// clears info stored in this object
    void clear(){
        foreach (ref el;datacache){
            el.clear();
        }
        numCacheLevels=0;
        vendorName="unkown";
        processorName="unkown";
    }
    /// duplicates this object
    ///
    /// A new instance of the same dynamic class is created through its
    /// classinfo and filled with opSliceAssign.
    CpuInfo dup(){
        CpuInfo newInfo=cast(CpuInfo)this.classinfo.create();
        newInfo[]=this;
        return newInfo;
    }
    /// copies data from one object to the other
    ///
    /// other must have exactly the same dynamic class as this object.
    /// Returns this.
    CpuInfo opSliceAssign(CpuInfo other){
        assert(other.classinfo is this.classinfo);
        //datacache.length=other.datacache.length;
        datacache[]=other.datacache;
        numCacheLevels=other.numCacheLevels;
        vendorName=other.vendorName;
        processorName=other.processorName;
        return this;
    }
    /// sets unset values in cache info
    ///
    /// Level 0 gets Pentium 1 values when nothing is known about it. Every
    /// further level without data is set to "the whole address space", and
    /// numCacheLevels is recomputed. Calling this more than once is safe:
    /// levels filled in by an earlier call are recognised and not counted
    /// as physical caches.
    protected void cacheFixup(){
        // Size written into levels that have no physical cache behind them.
        size_t fullAddressSpace = size_t.max/1024;
        if (datacache[0].size==0) {
            // Guess same as Pentium 1.
            datacache[0].size = 8;
            datacache[0].associativity = 2;
            datacache[0].lineSize = 32;
            datacache[0].wildGuess=true;
        }
        if (datacache[0].nThreadSharing==0){
            // Subclasses derive the core count from hardware probing; a
            // reported count of 0 must not turn into a division by zero.
            uint cores=coresPerCPU();
            if (cores!=0) {
                datacache[0].nThreadSharing=threadsPerCPU()/cores;
            }
        }
        numCacheLevels = 1;
        // And now fill up all the unused levels with full memory space.
        for (int i=1; i< datacache.length; ++i) {
            // A level is unused when it is still cleared, or when it only
            // holds the filler written by a previous cacheFixup() call.
            if (datacache[i].size==0 || datacache[i].size==fullAddressSpace) {
                // Set all remaining levels of cache equal to full address space.
                datacache[i].size = fullAddressSpace;
                datacache[i].associativity = 1;
                datacache[i].lineSize = datacache[i-1].lineSize;
                datacache[i].wildGuess=false;
            } else {
                numCacheLevels = i+1;
            }
            if (datacache[i].nThreadSharing==0){
                datacache[i].nThreadSharing=threadsPerCPU();
            }
        }
    }
    version(X86_CPU){
        /// utility method to get information about x86 processors
        /// Throws: Exception if this object is not a CpuInfoX86
        final CpuInfoX86 x86(){
            if (auto res=cast(CpuInfoX86)this)
                return res;
            throw new Exception("non x86 cpu",__FILE__,__LINE__);
        }
    }
    // PPC_CPU (set for both PPC and PPC64) is the identifier under which
    // CpuInfoPpc objects are created, so the accessor uses the same one.
    version(PPC_CPU){
        /// utility method to get information about PPC processors
        /// Throws: Exception if this object is not a CpuInfoPpc
        final CpuInfoPpc ppc(){
            if (auto res=cast(CpuInfoPpc)this)
                return res;
            throw new Exception("non ppc cpu",__FILE__,__LINE__);
        }
    }
    version(ARM){
        /// utility method to get information about arm processors
        /// Throws: Exception if this object is not a CpuInfoArm
        final CpuInfoArm arm(){
            if (auto res=cast(CpuInfoArm)this)
                return res;
            throw new Exception("non arm cpu",__FILE__,__LINE__);
        }
    }
    version(SPARC){
        /// utility method to get information about sparc processors
        /// Throws: Exception if this object is not a CpuInfoSparc
        final CpuInfoSparc sparc(){
            if (auto res=cast(CpuInfoSparc)this)
                return res;
            throw new Exception("non sparc cpu",__FILE__,__LINE__);
        }
    }
}

/// If optimizing for a particular processor, it is generally better
/// to identify based on features rather than model. NOTE: Normally
/// it's only worthwhile to optimise for the latest Intel and AMD CPU,
/// with a backup for other CPUs.
/// Pentium    -- preferPentium1()
/// PMMX       --   + mmx()
/// PPro       -- default
/// PII        --   + mmx()
/// PIII       --   + mmx() + sse()
/// PentiumM   --   + mmx() + sse() + sse2()
/// Pentium4   -- preferPentium4()
/// PentiumD   --   + isX86_64()
/// Core2      -- default + isX86_64()
/// AMD K5     -- preferPentium1()
/// AMD K6     --   + mmx()
/// AMD K6-II  --   + mmx() + 3dnow()
/// AMD K7     -- preferAthlon()
/// AMD K8     --   + sse2()
/// AMD K10    --   + isX86_64()
/// Cyrix 6x86 -- preferPentium1()
///    6x86MX  --   + mmx()
final class CpuInfoX86: CpuInfo {
private:
    // Vendor guesses derived from the CPUID vendor string in cpuidX86().
    bool probablyIntel; // true = _probably_ an Intel processor, might be faking
    bool probablyAMD; // true = _probably_ an AMD processor
    // Raw feature words as returned by CPUID; the *_BIT enums below index them.
    uint features;     // mmx, sse, sse2, hyperthreading, etc (CPUID 1, EDX)
    uint miscfeatures; // sse3, etc. (CPUID 1, ECX)
    uint amdfeatures;  // 3DNow!, mmxext, etc (CPUID 0x8000_0001, EDX)
    uint amdmiscfeatures; // sse4a, sse5, svm, etc (CPUID 0x8000_0001, ECX)
    // Values reported by coresPerCPU() and threadsPerCPU().
    uint maxCores;
    uint maxThreads;
    // EAX returned by CPUID 0 and by CPUID 0x8000_0000 respectively; compared
    // against leaf numbers before those leaves are queried.
    uint max_cpuid, max_extended_cpuid;
public:
    // Raw HTT flag. Note that this may indicate multi-core rather than
    // hyperthreading.
    bool hyperThreadingBit() { return (features & HTT_BIT) != 0; }

    /// Processor type (vendor-dependent).
    /// This should be visible ONLY for display purposes.
    uint stepping, model, family;

    /// Does it have an x87 FPU on-chip?
    bool x87onChip() { return (features & FPU_BIT) != 0; }
    /// Is MMX supported?
    bool mmx() { return (features & MMX_BIT) != 0; }
    /// Is SSE supported?
    bool sse() { return (features & SSE_BIT) != 0; }
    /// Is SSE2 supported?
    bool sse2() { return (features & SSE2_BIT) != 0; }
    /// Is SSE3 supported?
    bool sse3() { return (miscfeatures & SSE3_BIT) != 0; }
    /// Is SSSE3 supported?
    bool ssse3() { return (miscfeatures & SSSE3_BIT) != 0; }
    /// Is SSE4.1 supported?
    bool sse41() { return (miscfeatures & SSE41_BIT) != 0; }
    /// Is SSE4.2 supported?
    bool sse42() { return (miscfeatures & SSE42_BIT) != 0; }
    /// Is SSE4a supported?
    bool sse4a() { return (amdmiscfeatures & SSE4A_BIT) != 0; }
    /// Is SSE5 supported?
    bool sse5() { return (amdmiscfeatures & SSE5_BIT) != 0; }
    /// Is AMD 3DNOW supported?
    bool amd3dnow() { return (amdfeatures & AMD_3DNOW_BIT) != 0; }
    /// Is AMD 3DNOW Ext supported?
    bool amd3dnowExt() { return (amdfeatures & AMD_3DNOW_EXT_BIT) != 0; }
    /// Are AMD extensions to MMX supported?
    bool amdMmx() { return (amdfeatures & AMD_MMX_BIT) != 0; }
    /// Is fxsave/fxrstor supported?
    bool hasFxsr() { return (features & FXSR_BIT) != 0; }
    /// Is cmov supported?
    bool hasCmov() { return (features & CMOV_BIT) != 0; }
    /// Is rdtsc supported?
    bool hasRdtsc() { return (features & TIMESTAMP_BIT) != 0; }
    /// Is cmpxchg8b supported?
    bool hasCmpxchg8b() { return (features & CMPXCHG8B_BIT) != 0; }
    /// Is cmpxchg16b supported?
    bool hasCmpxchg16b() { return (miscfeatures & CMPXCHG16B_BIT) != 0; }
    /// Is 3DNow prefetch supported?
    bool has3dnowPrefetch() { return (amdmiscfeatures & AMD_3DNOW_PREFETCH_BIT) != 0; }
    /// Are LAHF and SAHF supported in 64-bit mode?
    bool hasLahfSahf() { return (amdmiscfeatures & LAHFSAHF_BIT) != 0; }
    /// Is POPCNT supported?
    bool hasPopcnt() { return (miscfeatures & POPCNT_BIT) != 0; }
    /// Is LZCNT supported?
    bool hasLzcnt() { return (amdmiscfeatures & LZCNT_BIT) != 0; }
    /// Is this an Intel64 or AMD 64?
    bool isX86_64() { return (amdfeatures & AMD64_BIT) != 0; }

    /// Is this an IA64 (Itanium) processor?
    bool isItanium() { return (features & IA64_BIT) != 0; }

    /// Is hyperthreading supported? (more threads than cores)
    bool hyperThreading() { return maxThreads > maxCores; }
    /// Returns number of threads per CPU
    uint threadsPerCPU() { return maxThreads; }
    /// Returns number of cores in CPU
    uint coresPerCPU() { return maxCores; }
    
    /// Optimisation hints for assembly code.
    /// For forward compatibility, the CPU is compared against different
    /// microarchitectures. For 32-bit X86, comparisons are made against
    /// the Intel PPro/PII/PIII/PM family.
    ///
    /// The major 32-bit x86 microarchitecture 'dynasties' have been:
    /// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2).
    /// (2) AMD Athlon (K7, K8, K10).
    /// (3) Intel NetBurst (Pentium 4, Pentium D).
    /// (4) In-order Pentium (Pentium1, PMMX)
    /// Other early CPUs (Nx586, AMD K5, K6, Centaur C3, Transmeta,
    ///   Cyrix, Rise) were mostly in-order.
    /// Some new processors do not fit into the existing categories:
    /// Intel Atom 230/330 (family 6, model 0x1C) is an in-order core.
    /// Centaur Isaiah = VIA Nano (family 6, model F) is an out-of-order core.
    ///
    /// Within each dynasty, the optimisation techniques are largely
    /// identical (eg, use instruction pairing for group 4). Major
    /// instruction set improvements occur within each group.
    
    /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code?
    /// True for a probable AMD part of family 6 or later.
    bool preferAthlon() {
        return family >= 6 && probablyAMD;
    }
    /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code?
    /// True for a probable Intel part of family 0xF.
    bool preferPentium4() {
        return family == 0xF && probablyIntel;
    }
    /// Does this CPU perform better on Pentium I code than Pentium Pro code?
    /// True for every family below 6, and for non-Intel family 6 parts
    /// whose model is below 0xF.
    bool preferPentium1() {
        if (family < 6) return true;
        return family == 6 && model < 0xF && !probablyIntel;
    }
    /// Creates the object in its cleared state; all work is done by the
    /// base class constructor.
    this()
    {
        super();
    }
    /// Resets every x86-specific value to "unknown": no features, one core,
    /// one thread, no CPUID leaves known, placeholder display names.
    override void clear(){
        super.clear();
        stepping=0;
        model=0;
        family=0;
        probablyIntel=false;
        probablyAMD=false;
        vendorName="UnknownX86";
        processorName="UnknownX86";
        features=0;
        miscfeatures=0;
        amdmiscfeatures=0;
        amdfeatures=0;
        maxCores=1;
        maxThreads=1;
        // The highest supported CPUID leaves belong to the probed state as
        // well; leaving them behind would make a cleared object look as if
        // CPUID data were still available.
        max_cpuid=0;
        max_extended_cpuid=0;
    }
    /// copies data from one object to the other
    ///
    /// o must be a CpuInfoX86. The base class copies the caches and the
    /// display names; this override adds every x86-specific field, including
    /// the highest supported CPUID leaves so that dup() yields a complete copy.
    /// Returns this.
    override CpuInfoX86 opSliceAssign(CpuInfo o){
        auto other=cast(CpuInfoX86)o;
        assert(other !is null);
        // copies datacache, numCacheLevels, vendorName and processorName
        super.opSliceAssign(o);
        stepping=other.stepping;
        model=other.model;
        family=other.family;
        probablyIntel=other.probablyIntel;
        probablyAMD=other.probablyAMD;
        features=other.features;
        miscfeatures=other.miscfeatures;
        amdmiscfeatures=other.amdmiscfeatures;
        amdfeatures=other.amdfeatures;
        maxCores=other.maxCores;
        maxThreads=other.maxThreads;
        max_cpuid=other.max_cpuid;
        max_extended_cpuid=other.max_extended_cpuid;
        return this;
    }
    
    /// auto config for current cpu
    ///
    /// Probes the processor through CPUID when the instruction is available;
    /// otherwise resets the object. Either way the cache table is completed
    /// with cacheFixup(). Returns true only when CPUID was used.
    protected override bool getCpuData(){
        bool probed = hasCPUID();
        if (probed) {
            cpuidX86();
        } else {
            // it's a 386 or 486, or a Cyrix 6x86.
            //Probably still has an external cache.
            clear();
        }
        cacheFixup();
        return probed;
    }
    // feature flags CPUID1_EDX
    // Bit masks for the `features` word (EDX of CPUID leaf 1).
    enum : uint
    {
        FPU_BIT = 1,          // tested by x87onChip()
        TIMESTAMP_BIT = 1<<4, // rdtsc
        MDSR_BIT = 1<<5,      // RDMSR/WRMSR
        CMPXCHG8B_BIT = 1<<8,
        CMOV_BIT = 1<<15,
        MMX_BIT = 1<<23,
        FXSR_BIT = 1<<24,     // fxsave/fxrstor
        SSE_BIT = 1<<25,
        SSE2_BIT = 1<<26,
        HTT_BIT = 1<<28,      // tested by hyperThreadingBit()
        IA64_BIT = 1<<30      // tested by isItanium()
    }
    // feature flags misc CPUID1_ECX
    // Bit masks for the `miscfeatures` word (ECX of CPUID leaf 1).
    enum : uint
    {
        SSE3_BIT = 1,
        PCLMULQDQ_BIT = 1<<1, // from AVX
        MWAIT_BIT = 1<<3,
        SSSE3_BIT = 1<<9,
        FMA_BIT = 1<<12,     // from AVX
        CMPXCHG16B_BIT = 1<<13,
        SSE41_BIT = 1<<19,
        SSE42_BIT = 1<<20,
        POPCNT_BIT = 1<<23,
        AES_BIT = 1<<25, // AES instructions from AVX
        OSXSAVE_BIT = 1<<27, // Used for AVX
        AVX_BIT = 1<<28
    }
/+    
version(X86_64) {    
    bool hasAVXinHardware() {
        // This only indicates hardware support, not OS support.
        return (miscfeatures&AVX_BIT) && (miscfeatures&OSXSAVE_BIT);
    }
    // Is AVX supported (in both hardware & OS)?
    bool Avx() {
        if (!hasAVXinHardware()) return false;
        // Check for OS support
        uint xfeatures;
        asm {mov ECX, 0; xgetbv; mov xfeatures, EAX; }
        return (xfeatures&0x6)==6;
    }
    bool hasAvxFma() {
        if (!AVX()) return false;
        return (features&FMA_BIT)!=0;        
    }
}
+/    
    // AMD feature flags CPUID80000001_EDX
    // Bit masks for the `amdfeatures` word (EDX of CPUID leaf 0x8000_0001).
    enum : uint
    {
        AMD_MMX_BIT = 1<<22, // AMD extensions to MMX, tested by amdMmx()
//      FXR_OR_CYRIXMMX_BIT = 1<<24, // Cyrix/NS: 6x86MMX instructions. 
        FFXSR_BIT = 1<<25,
        PAGE1GB_BIT = 1<<26, // support for 1GB pages
        RDTSCP_BIT = 1<<27,
        AMD64_BIT = 1<<29,   // tested by isX86_64()
        AMD_3DNOW_EXT_BIT = 1<<30,
        AMD_3DNOW_BIT = 1<<31
    }
    // AMD misc feature flags CPUID80000001_ECX
    // Bit masks for the `amdmiscfeatures` word (ECX of CPUID leaf 0x8000_0001).
    enum : uint
    {
        LAHFSAHF_BIT = 1,    // LAHF/SAHF in 64-bit mode
        LZCNT_BIT = 1<<5,
        SSE4A_BIT = 1<<6,       
        AMD_3DNOW_PREFETCH_BIT = 1<<8,
        SSE5_BIT = 1<<11
    }


    version(Really_D_InlineAsm_X86) {
        // Note that this code will also work for Itanium, after changing the
        // register names in the asm code.

        // CPUID2: "cache and tlb information"
        //
        // Reads the one-byte cache descriptors returned by CPUID leaf 2 and
        // stores size, associativity and line size of every recognised data
        // cache in datacache[0..2]. Relies on `family` and `model` having been
        // set already (one descriptor is model dependent).
        void getcacheinfoCPUID2()
        {
            // CPUID2 is a dog's breakfast. What was Intel thinking???
            // We are only interested in the data caches
            //
            // Looks up a single descriptor byte; unknown descriptors and the
            // null descriptor are ignored.
            void decipherCpuid2(ubyte x) {
                if (x==0) return;
                // Values from http://www.sandpile.org/ia32/cpuid.htm.
                // Includes Itanium and non-Intel CPUs.
                //
                // The three tables are parallel: ids[i] has sizes[i] kB and
                // ways[i] ways.
                ubyte [] ids = [
                    0x0A, 0x0C, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68,
                    // level 2 cache
                    0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
                    0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
                    0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81,
                    // level 3 cache
                    0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D
                ];
                uint [] sizes = [
                    8, 16, 32, 16, 24, 8, 16, 32,
                    128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
                    256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
                    128, 192, 128, 256, 384, 512, 3072, 512, 128,
                    512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024
                ];
                // CPUBUG: Pentium M reports 0x2C but tests show it is only 4-way associative
                ubyte [] ways = [
                    2, 4, 8, 8, 6, 4, 4, 4,
                    4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
                    8, 8, 8, 8, 4, 8, 16, 24,
                    4, 6, 2, 4, 6, 4, 12, 8, 8,
                    4, 8, 8, 8, 4, 8, 12, 16, 12, 16
                ];
                // Table index of the first level 2 and first level 3 entry.
                enum { FIRSTDATA2 = 8, FIRSTDATA3 = 28+9 }
                for (int i=0; i< ids.length; ++i) {
                    if (x!=ids[i]) continue;
                    // level is a zero-based index into datacache (0 = L1).
                    int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2;
                    // On family 0xF model 6, descriptor 0x49 is the L3 cache.
                    if (x==0x49 && family==0xF && model==0x6) level=2;
                    datacache[level].size=sizes[i];
                    datacache[level].associativity=ways[i];
                    // Every level 3 descriptor (index 2) and the listed
                    // L1/L2 descriptors use 64 byte lines; the rest use 32.
                    if (level == 2 || x==0x2C || (x>=0x48 && x<=0x80)
                        || x==0x86 || x==0x87
                        || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E) ){
                        datacache[level].lineSize = 64;
                    } else datacache[level].lineSize = 32;
                    // descriptors are unique, nothing more to find
                    break;
                }
            }

            uint[4] a;  
            bool firstTime = true;
            // On a multi-core system, this could theoretically fail, but it's only used
            // for old single-core CPUs.
            uint numinfos = 1;
            do {
                asm {
                    mov EAX, 2;
                    cpuid;
                    mov a, EAX;
                    mov a+4, EBX;
                    mov a+8, ECX;
                    mov a+12, EDX;
                }
                if (firstTime) {
                    if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) {
                // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080.
                // These are NOT standard Intel values
                // (TLB = 32 entry, 4 way associative, 4K pages)
                // (L1 cache = 16K, 4way, linesize16)
                        datacache[0].size=8;
                        datacache[0].associativity=4;
                        datacache[0].lineSize=16;
                        return;             
                    }
                    // lsb of a is how many times to loop.
                    numinfos = a[0] & 0xFF;
                    // A reported count of 0 would make the countdown below
                    // wrap around to uint.max iterations; treat it as 1.
                    if (numinfos == 0) numinfos = 1;
                    // and otherwise it should be ignored
                    a[0] &= 0xFFFF_FF00;
                    firstTime = false;
                }
                for (int c=0; c<4;++c) {
                    // high bit set == no info.
                    if (a[c] & 0x8000_0000) continue;
                    decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
                    decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
                    decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
                    decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
                }
            } while (--numinfos);
        }

        // CPUID4: "Deterministic cache parameters" leaf
        //
        // Enumerates the caches through CPUID leaf 4 (sub-leaf in ECX) until
        // the CPU reports cache type 0. Data and unified caches are stored
        // in datacache[level-1]; maxCores is raised to the largest core count
        // seen on any sub-leaf.
        void getcacheinfoCPUID4()
        {
            int cachenum = 0;
            for(;;) {
                uint a, b, number_of_sets;  
                asm {
                    mov EAX, 4;
                    mov ECX, cachenum;
                    cpuid;
                    mov a, EAX;
                    mov b, EBX;
                    mov number_of_sets, ECX;
                }
                ++cachenum;
                // bits 4..0 of EAX: cache type (0 = end of list)
                uint cachetype = a & 0x1F;
                if (cachetype==0) break; // no more caches
                uint numcores = ((a>>26) & 0x3F) + 1;
                if (numcores > maxCores) maxCores = numcores;
                if (cachetype!=1 && cachetype!=3) continue; // we only want data & unified caches

                ++number_of_sets; // ECX holds the number of sets minus one
                // bits 7..5 of EAX: 1-based level; a level field of 0 wraps
                // to 255 here and is rejected by the range check below.
                ubyte level = cast(ubyte)(((a>>5)&7)-1);
                if (level >= datacache.length) continue; // ignore deep caches
                datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1);
                datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size
                uint line_partitions = ((b >> 12)& 0x3FF) + 1;
                // Size = number of sets * associativity * cachelinesize * linepartitions
                // and must convert to Kb, also dividing by the number of cores.
                // Computed in 64 bits so that the product cannot overflow.
                ulong sz = number_of_sets;
                if (datacache[level].associativity < ubyte.max) {
                    sz *= datacache[level].associativity;
                }
                datacache[level].size = cast(uint)(
                        (sz * datacache[level].lineSize * line_partitions ) / (numcores *1024));
                if (level == 0 && cachetype==3) {
                    // Halve the size for unified L1 caches
                    datacache[level].size/=2;
                }
            }
        }

        // CPUID8000_0005 & 6
        //
        // Fills datacache[0] from ECX of extended leaf 0x8000_0005 and, when
        // leaf 0x8000_0006 is available, datacache[1] and datacache[2] from its
        // ECX and EDX. Raises maxCores when leaf 0x8000_0008 reports more cores.
        void getAMDcacheinfo()
        {
            uint c5, c6, d6;
            asm {
                mov EAX, 0x8000_0005; // L1 cache
                cpuid;
                // EAX has L1_TLB_4M.
                // EBX has L1_TLB_4K
                // EDX has L1 instruction cache
                mov c5, ECX;
            }

            // c5 layout as decoded here: bits 31..24 size,
            // bits 23..16 associativity, bits 7..0 line size.
            datacache[0].size = ( (c5>>24) & 0xFF);
            datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
            datacache[0].lineSize = c5 & 0xFF;

            if (max_extended_cpuid >= 0x8000_0006) {
                // AMD K6-III or K6-2+ or later.
                ubyte numcores = 1;
                if (max_extended_cpuid >=0x8000_0008) {
                    asm {
                        mov EAX, 0x8000_0008;
                        cpuid;
                        mov numcores, CL;
                    }
                    // The low byte of ECX is incremented to obtain the count.
                    // NOTE(review): numcores is a ubyte, so a reported value
                    // of 0xFF would wrap to 0 and the L3 division below would
                    // divide by zero -- confirm whether that can occur.
                    ++numcores;
                    if (numcores>maxCores) maxCores = numcores;
                }
                asm {
                    mov EAX, 0x8000_0006; // L2/L3 cache
                    cpuid;
                    mov c6, ECX; // L2 cache info
                    mov d6, EDX; // L3 cache info
                }
    
                // Translates the 4-bit associativity code (bits 15..12) into a
                // number of ways; 0xFF is the fully associative marker.
                ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
                datacache[1].size = (c6>>16) & 0xFFFF;
                datacache[1].associativity = assocmap[(c6>>12)&0xF];
                datacache[1].lineSize = c6 & 0xFF;
        
                // The L3 cache value is TOTAL, not per core.
                datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
                datacache[2].associativity = assocmap[(d6>>12)&0xF];
                datacache[2].lineSize = d6 & 0xFF;
            }
        }


        // Queries the CPUID instruction and fills in this object: vendor and
        // processor names, feature words, family/model/stepping, core and
        // thread counts, and the data caches (through the getcacheinfo*
        // helpers). Must only be called when hasCPUID() returned true.
        void cpuidX86()
        {
            char [12] vendorID;
            char [48] processorNameBuffer;
            uint m_1, m_2;
            char * venptr = vendorID.ptr;
            asm {
                mov EAX, 0;
                cpuid;
                mov m_1, EAX;
                mov EAX, venptr;
                mov [EAX], EBX;
                mov [EAX + 4], EDX;
                mov [EAX + 8], ECX;
                mov EAX, 0x8000_0000;
                cpuid;
                mov m_2, EAX;
            }
            // highest standard and highest extended leaf supported
            max_cpuid=m_1;
            max_extended_cpuid=m_2;
            
            probablyIntel = vendorID == "GenuineIntel";
            probablyAMD = vendorID == "AuthenticAMD";
            vendorName=vendorID.dup;
            uint a, c;
            uint apic = 0; // brand index, apic id
            asm {
                mov EAX, 1; // model, stepping
                cpuid;
                mov a, EAX;
                mov apic, EBX;
                mov m_1, ECX;
                mov m_2, EDX;
            }
            miscfeatures=m_1;
            features=m_2;
            amdfeatures = 0;
            amdmiscfeatures = 0;
            if (max_extended_cpuid >= 0x8000_0001) {
                asm {
                    mov EAX, 0x8000_0001;
                    cpuid;
                    mov m_1, ECX;
                    mov m_2, EDX;
                }
                amdmiscfeatures=m_1;
                amdfeatures=m_2;
            }
            // Try to detect fraudulent vendorIDs
            if (amd3dnow) probablyIntel = false;
    
            stepping = a & 0xF;
            uint fbase = (a >> 8) & 0xF;
            uint mbase = (a >> 4) & 0xF;
            // The extended family (bits 27..20) is added to the base family.
            // The mask has to be applied before the addition: '+' binds
            // tighter than '&'.
            family = ((fbase == 0xF) || (fbase == 0)) ? fbase + ((a >> 20) & 0xFF) : fbase;
            // The extended model (bits 19..16) forms the high nibble.
            model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ?
                 mbase + ((a >> 12) & 0xF0) : mbase;
         
            if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) {
                // determine max number of cores for AMD
                asm {
                    mov EAX, 0x8000_0008;
                    cpuid;
                    mov c, ECX;
                }
                uint apicsize = (c>>12) & 0xF;
                if (apicsize == 0) {
                    // use legacy method
                    // The low byte of ECX is the number of cores minus one
                    // (getAMDcacheinfo increments the same field).
                    if (hyperThreadingBit)  maxCores = (c & 0xFF) + 1;
                    else maxCores = 1;
                } else {
                    // maxcores = 2^ apicsize
                    maxCores = 1;
                    while (apicsize) { maxCores<<=1; --apicsize; }
                }
            }
    
            if (max_extended_cpuid >= 0x8000_0004) {
                char *procptr = processorNameBuffer.ptr;
                asm {
                    push ESI;
                    mov ESI, procptr;
                    mov EAX, 0x8000_0002;
                    cpuid;
                    mov [ESI], EAX;
                    mov [ESI+4], EBX;
                    mov [ESI+8], ECX;
                    mov [ESI+12], EDX;
                    mov EAX, 0x8000_0003;
                    cpuid;
                    mov [ESI+16], EAX;
                    mov [ESI+20], EBX;
                    mov [ESI+24], ECX;
                    mov [ESI+28], EDX;
                    mov EAX, 0x8000_0004;
                    cpuid;
                    mov [ESI+32], EAX;
                    mov [ESI+36], EBX;
                    mov [ESI+40], ECX;
                    mov [ESI+44], EDX;
                    pop ESI;            
                }
                // Intel P4 and PM pad at front with spaces.
                // Other CPUs pad at end with nulls.
                // Both scans are bounded so that a buffer consisting only of
                // padding yields an empty name instead of reading outside
                // the buffer.
                int start = 0, end = 0;
                while (start < processorNameBuffer.length
                       && processorNameBuffer[start] == ' ') { ++start; }
                while (end < processorNameBuffer.length - start
                       && processorNameBuffer[$-end-1] == 0) { ++end; }
                processorName = processorNameBuffer[start..$-end].dup;
            } else {
                processorName = "Unknown CPU";
            }
            // Determine cache sizes
    
            // Intel docs specify that they return 0 for 0x8000_0005.
            // AMD docs do not specify the behaviour for 0004 and 0002.
            // Centaur/VIA and most other manufacturers use the AMD method,
            // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2!
            // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour
            // for CPUID80000005. But Geode GX uses the AMD method
    
            // Deal with idiotic Geode GX1 - make it same as MediaGX MMX.
            if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) {      
                max_extended_cpuid = 0x8000_0004;
            }
            // Therefore, we try the AMD method unless it's an Intel chip.
            // If we still have no info, try the Intel methods.
            datacache[0].size = 0;
            if (max_cpuid<2 || !probablyIntel) {
                if (max_extended_cpuid >= 0x8000_0005) {
                    getAMDcacheinfo();
                } else if (probablyAMD) {       
                    // According to AMDProcRecognitionAppNote, this means CPU
                    // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4)
                    // Am5x86 has 16Kb 4-way unified data & code cache.
                    datacache[0].size = 8;
                    datacache[0].associativity = 4;
                    datacache[0].lineSize = 32;     
                } else {
                    // Some obscure CPU.
                    // Values for Cyrix 6x86MX (family 6, model 0)
                    datacache[0].size = 64;
                    datacache[0].associativity = 4;
                    datacache[0].lineSize = 32;     
                }
            }   
            if ((datacache[0].size == 0) && max_cpuid>=4) {
                getcacheinfoCPUID4();
            }
            if ((datacache[0].size == 0) && max_cpuid>=2) {     
                getcacheinfoCPUID2();
            }
            if (datacache[0].size == 0) {
                // Pentium, PMMX, late model 486, or an obscure CPU
                if (mmx) { // Pentium MMX. Also has 8kB code cache.
                    datacache[0].size = 16;
                    datacache[0].associativity = 4;
                    datacache[0].lineSize = 32;     
                } else { // Pentium 1 (which also has 8kB code cache)
                         // or 486.
                    // Cyrix 6x86: 16, 4way, 32 linesize
                    datacache[0].size = 8;
                    datacache[0].associativity = 2;
                    datacache[0].lineSize = 32;
                }       
            }
            // With the HTT flag set, bits 23..16 of EBX from leaf 1 give the
            // thread count; otherwise there is one thread per core.
            if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF;
            else maxThreads = maxCores;
        }

        // Return true if the cpuid instruction is supported.
        // BUG(WONTFIX): Doesn't work for Cyrix 6x86 and 6x86L.
        //
        // Method: read EFLAGS, flip bit 21 (0x0020_0000), write the value
        // back and read EFLAGS again. If the bit kept its new value, the
        // XOR of the old and new flags has bit 21 set.
        bool hasCPUID()
        {
            uint flags;
            asm {
                pushfd;
                pop EAX;
                mov flags, EAX;         // remember the original EFLAGS
                xor EAX, 0x0020_0000;   // toggle bit 21
                push EAX;
                popfd;                  // try to store the toggled value
                pushfd;
                pop EAX;                // read back what the CPU kept
                xor flags, EAX;         // flags now holds the bits that changed
            }
            return (flags & 0x0020_0000) !=0;
        }

    } else { // inline asm X86

        // Without usable inline assembly the CPUID instruction cannot be
        // issued, so it is always reported as unavailable.
        bool hasCPUID()
        {
            return false;
        }

        // Fallback used when CPUID cannot be queried: describes the first
        // level data cache as 8 kB, 2-way, with 32 byte lines.
        void cpuidX86()
        {
            datacache[0].lineSize = 32;
            datacache[0].associativity = 2;
            datacache[0].size = 8;
        }
    }
}

/// Information about a PowerPC processor.
///
/// Nothing is probed automatically: a freshly constructed object carries the
/// guessed cache values from cacheFixup() and all feature flags are false
/// until cpuidPPC() is called or the flags are set by other code.
final class CpuInfoPpc: CpuInfo{
    bool hasfloatingpoint; // Floating Point Instructions
    bool hasaltivec;       // AltiVec Instructions
    bool hasgraphicsops;   // Graphics Operations
    bool has64bitops;      // 64-bit Instructions
    bool hasfsqrt;         // HW Floating Point Square Root Instruction
    bool hasstfiwx;        // Store Floating Point as Integer Word Indexed Instructions
    bool hasdcba;          // Data Cache Block Allocate Instruction
    bool hasdatastreams;   // Data Streams Instructions
    bool hasdcbtstreams;   // Data Cache Block Touch Steams Instruction Form
    
    /// Creates the object in its cleared state (see clear()).
    this(){
        super();
    }
    /// Resets names and feature flags, then fills the cache table with
    /// guessed values through cacheFixup().
    override void clear(){
        super.clear();
        vendorName="Unknown";
        processorName="UnknownPPC";
        hasfloatingpoint= false;
        hasaltivec      = false;
        hasgraphicsops  = false;
        has64bitops  = false;
        hasfsqrt        = false;
        hasstfiwx       = false;
        hasdcba         = false;
        hasdatastreams  = false;
        hasdcbtstreams  = false;
        cacheFixup();
    }
    /// Copies all data from o, which must be a CpuInfoPpc. Returns this.
    override CpuInfoPpc opSliceAssign(CpuInfo o){
        auto other=cast(CpuInfoPpc)o;
        assert(other !is null);
        super.opSliceAssign(o);
        hasfloatingpoint= other.hasfloatingpoint;
        hasaltivec      = other.hasaltivec;
        hasgraphicsops  = other.hasgraphicsops;
        has64bitops     = other.has64bitops;
        hasfsqrt        = other.hasfsqrt;
        hasstfiwx       = other.hasstfiwx;
        hasdcba         = other.hasdcba;
        hasdatastreams  = other.hasdatastreams;
        hasdcbtstreams  = other.hasdcbtstreams;
        return this;
    }
    /// Processor models known to cpuidPPC(); the numeric value is the index
    /// into its lookup tables.
    enum PPC_Cputype:int  { PPC601, PPC603, PPC603E, PPC604,
                 PPC604E, PPC620, PPCG3, PPCG4, PPCG5 }
    // TODO: Implement this function with OS support
    /// Fills the cache table with the values recorded for the given model.
    /// The caller has to know the model; it is not detected here.
    void cpuidPPC(PPC_Cputype cputype)
    {
        // TODO:
        // asm { mfpvr; } returns the CPU version but unfortunately it can
        // only be used in kernel mode. So OS support is required.
    
        // Tables indexed by PPC_Cputype; sizes are in kB, 0 = no such cache.
        // 601 has a 8KB combined data & code L1 cache.
        uint[] sizes = [4, 8, 16, 16, 32, 32, 32, 32, 64];
        ubyte[] ways = [8, 2,  4,  4,  4,  8,  8,  8,  8];
        uint[] L2size= [0, 0,  0,  0,  0,  0,  0,  256,  512];
        uint[] L3size= [0, 0,  0,  0,  0,  0,  0,  2048,  0];
    
        // clear() has already run cacheFixup(), which marks the unused levels
        // as "whole address space". Start again from blank entries so that
        // the cacheFixup() call below does not count those filler levels as
        // physical caches.
        foreach (ref el; datacache) {
            el.clear();
        }

        datacache[0].size = sizes[cputype];
        datacache[0].associativity = ways[cputype]; 
        datacache[0].lineSize = (cputype==PPC_Cputype.PPCG5)? 128 : 
            (cputype == PPC_Cputype.PPC620 || cputype == PPC_Cputype.PPCG3)? 64 : 32;
        datacache[1].size = L2size[cputype];
        datacache[2].size = L3size[cputype];
        // the outer levels are given the L1 line size
        datacache[1].lineSize = datacache[0].lineSize;
        datacache[2].lineSize = datacache[0].lineSize;
        cacheFixup();
    }
    
}

/// Information about a SPARC processor.
/// this should be expanded by someone using sparc
final class CpuInfoSparc: CpuInfo{
    /// Creates the object in its cleared state.
    this(){ super(); }
    /// Resets the object and installs the SPARC placeholder names.
    override void clear(){
        super.clear();
        vendorName="unknown";
        processorName="unknownSparc";
    }
    /// Copies all data from o, which must be a CpuInfoSparc. Returns this.
    override CpuInfoSparc opSliceAssign(CpuInfo o){
        auto other=cast(CpuInfoSparc)o;
        assert(other !is null);
        super.opSliceAssign(o);
        return this;
    }
    /// Processor models known to cpuidSparc().
    enum Sparc_Cputype:int {
        UltraSparcIIi, UltraSparcIII, UltraSparcIIIi,UltraSparcIV,
        UltraSparcIVplus, Sparc64V
    }
    // TODO: Implement this function with OS support
    /// Fills the cache table with the sizes (in kB) and associativities
    /// recorded for the given model. Line sizes are not recorded here and
    /// stay at 0.
    /// Throws: Exception if cputype is not one of the known models; the
    ///         object is left untouched in that case.
    void cpuidSparc(Sparc_Cputype cputype)
    {
        size_t l1,l2;
        size_t l3=0; // 0 = no third level cache
        ubyte way1,way2;
        switch(cputype){
        case Sparc_Cputype.UltraSparcIIi:
            l1 = 16;  way1=2; l2 = 512; way2=4;
            break;
        case Sparc_Cputype.UltraSparcIII:
            l1 = 64;  way1=4; l2= 4096; way2=4; // or l2=8192;
            break;
        case Sparc_Cputype.UltraSparcIIIi:
            l1 = 64;  way1=4; l2= 1024; way2=4;
            break;
        case Sparc_Cputype.UltraSparcIV:
            l1 = 64;  way1=4; l2= 16*1024; way2=1;
            break;
        case Sparc_Cputype.UltraSparcIVplus:
            l1 = 64;  way1=4; l2 = 2048; way2=1;
            l3 = 32*1024;
            break;
        case Sparc_Cputype.Sparc64V:
            l1 = 128; way1=2; l2 = 4096; way2=4;
            break;
        default:
            throw new Exception("invalid cputype",__FILE__,__LINE__);
        }
        // Start from blank entries so that repeated calls do not see filler
        // values written by an earlier cacheFixup().
        foreach (ref el; datacache) {
            el.clear();
        }
        // datacache is indexed from 0: the first level cache lives in
        // datacache[0], as in the other CpuInfo subclasses.
        datacache[0].size=l1;
        datacache[0].associativity=way1;
        datacache[1].size=l2;
        datacache[1].associativity=way2;
        datacache[2].size=l3;
        cacheFixup();
    }
}

