/**
 ** Simple entropy harvester based upon the havege RNG
 **
 ** Copyright 2009 Gary Wuertz gary@issiweb.com
 **
 ** This program is free software: you can redistribute it and/or modify
 ** it under the terms of the GNU General Public License as published by
 ** the Free Software Foundation, either version 3 of the License, or
 ** (at your option) any later version.
 **
 ** This program is distributed in the hope that it will be useful,
 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ** GNU General Public License for more details.
 **
 ** You should have received a copy of the GNU General Public License
 ** along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **
 ** This source is an adaptation of work released as
 **
 ** Copyright (C) 2006 - André Seznec - Olivier Rochecouste
 **
 ** under version 2.1 of the GNU Lesser General Public License
 */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <sys/time.h>
#include "havege.h"
#include "havegedef.h"
/**
 * Local prototypes
 *
 * All of these are private to this file. configure_hw() has two alternative
 * definitions below, selected by whether CPUID is defined; configure_amd(),
 * configure_intel() and cpuid() are only defined when CPUID is defined.
 */
static void collect_ndrand(struct hperf * perf);            // run one pass of the collection loop
static int configure_ndrand(int icache,int dcache);         // set cache sizes, allocate walk table
static int configure_hw();                                  // determine missing L1 cache sizes
static int configure_amd();                                 // L1 sizes via extended leaf 0x80000005
static int configure_intel(unsigned int lsfn);              // L1 sizes via leaf 2, fallback leaf 4
static void cpuid(int fn, unsigned int *p, char * tag);     // CPUID wrapper with debug dump
/**
 * The original HAVEGE implementation relied heavily on compilation details
 * to tailor the mechanism to its environment. HAVEGE depends on hardware
 * details: the architecture (x86, sparc, ppc, ia64), cache sizes, and
 * the size of compiled code used in the collection sequence.
 *
 * The architectural dependence cannot be removed (since inline assembly is used),
 * but the other requirements can be moved from the compile time realm to
 * execution time to gain some additional flexibility.
 *
 * The definitions below hold the program state.
 */
// Configuration and monitor data; filled in by configure_ndrand() and the
// configure_* helpers, copied out to callers by ndinfo().
static struct  hinfo info;                               // configuration
// Output buffer handed out one int at a time by ndrand(), which only reads
// indices below NDSIZECOLLECT. NOTE(review): the purpose of the extra 16384
// entries is not visible in this file - presumably slack for loopbody.h; confirm.
static int     havege_bigarray [NDSIZECOLLECT + 16384];  // random data
// Index of the next unread entry of havege_bigarray; reset to 0 after each
// call to collect_ndrand().
static int     havege_ndpt     = 0;                      // data output offset
/**
 * collect_ndrand state
 */
// Number of ints in the walk table minus one, computed in configure_ndrand().
// Also serves as the "configured" flag: ndrand() exits while this is 0.
static int              andpt           = 0;             // offset mask
// Points into the block allocated by configure_ndrand(), at the first int that
// lies on a 4096 byte boundary.
static volatile int     *havege_pwalk   = 0;             // the walk table
// PT, PT2 and pt2 are not referenced by the code visible in this file;
// presumably they are used by the included loopbody.h - confirm.
static int              PT              = 0;             // pwalk offset block 1
static int              PT2             = 0;             // pwalk offset block 2
static int              pt2             = 0;             // see note 1 in the collect_ndrand comment
/**
 * Entropy gathering - haveged vs gcc round 3.
 *
 * Entropy is collected by executing multiple instances of the code generated by
 * "oneiteration.h". The generated code attempts to exercise the processor branch
 * predictors while permuting the walktable with readings from the processor
 * time stamp counter to generate the random data transferred to the output array.
 * Since the code sequence is completely deterministic, timing variations occur
 * only on the indirect effects of external events as represented by the readings
 * of the processor time stamp counter.
 *
 * In order for this mechanism to work as designed, "oneiteration.h" repetitions
 * should be in-lined to nearly fill the L1 instruction cache. This is approximated
 * in practice by using a computed goto to initiate the collection sequence ( a
 * relative of "Duff's device")
 *
 * This task is made more complicated by the ever more aggressive optimization
 * coming out of gcc. Optimization level is controlled by the build system so
 * all we can try to do is prevent the optimizer from playing too loose with
 * memory accesses (in >gcc4.4 a function attribute may save the day). Note this
 * game is counterintuitive - the volatile changes actually decrease the size
 * of the generated code - and making some of the other auto variables below volatile
 * will get you a segfault!
 *
 * Note 1: pt2 is an outlier - it is used w/o initialization in one instance but
 * seems mostly a temporary. It is probably a bug, but let sleeping....
 */
static void collect_ndrand(struct hperf *perf)
{
   // Run one pass of the collection sequence. When perf is not NULL the
   // elapsed wall clock time of the pass (in microseconds) is stored in
   // perf->etime and perf->fill is set to 1.
   //
   // jumps[] holds the distance in bytes from label loop40 to each of the
   // labels loop40..loop01 (GCC "labels as values" extension). The loopNN
   // labels are not defined in this file; presumably they come from
   // loopbody.h - confirm.
   static const int jumps[] = {
      &&loop40-&&loop40, &&loop39-&&loop40, &&loop38-&&loop40, &&loop37-&&loop40,
      &&loop36-&&loop40, &&loop35-&&loop40, &&loop34-&&loop40, &&loop33-&&loop40,
      &&loop32-&&loop40, &&loop31-&&loop40, &&loop30-&&loop40, &&loop29-&&loop40,
      &&loop28-&&loop40, &&loop27-&&loop40, &&loop26-&&loop40, &&loop25-&&loop40,
      &&loop24-&&loop40, &&loop23-&&loop40, &&loop22-&&loop40, &&loop21-&&loop40,
      &&loop20-&&loop40, &&loop19-&&loop40, &&loop18-&&loop40, &&loop17-&&loop40,
      &&loop16-&&loop40, &&loop15-&&loop40, &&loop14-&&loop40, &&loop13-&&loop40,
      &&loop12-&&loop40, &&loop11-&&loop40, &&loop10-&&loop40, &&loop09-&&loop40,
      &&loop08-&&loop40, &&loop07-&&loop40, &&loop06-&&loop40, &&loop05-&&loop40,
      &&loop04-&&loop40, &&loop03-&&loop40, &&loop02-&&loop40, &&loop01-&&loop40
      };
   // RESULT, Pt0..Pt3, inter, PTtest, pt and havege_hardtick are not used by
   // the statements visible here; presumably they are the working set of
   // loopbody.h - do not remove without checking that header.
   volatile int *const RESULT = havege_bigarray;
   volatile int *Pt0;
   volatile int *Pt1;
   volatile int *Pt2;
   volatile int *Pt3;
   struct timeval et0,et1;
   int i, inter, PTtest, pt, havege_hardtick;

   // info.loop_idx is set to -1 by configure_ndrand(), so this runs on the
   // first call after configuration only.
   if (info.loop_idx<0) {                    // first time initialization
      int ovhd = &&loop - &&loop00;          // loop overhead
      int sz = info.i_cache * 1024 - ovhd ;  // how much code need to fill cache
      int ct = sizeof(jumps)/sizeof(int);
      // Find the first entry point whose distance from loop40 reaches sz
      // (or run off the end of the table), then step back one entry.
      for(i=1;i<ct&&jumps[i]<sz;i++)
         ;
      info.loop_idx = --i;
      info.loop_sz  = jumps[i]+ovhd;         // collection loop size
      }
   i = 0;
   if (perf!=NULL)
      gettimeofday(&et0,NULL);
   // Skip over the included body; it is only entered through the computed
   // goto at loop00.
   goto loop00;
   #include "loopbody.h"
loop00:
   // Enter the included body at the label selected during first time
   // initialization. Presumably the body advances i and falls through to
   // loop00 again until NDSIZECOLLECT is reached - confirm in loopbody.h.
   if (i < NDSIZECOLLECT) goto *(&&loop40+jumps[info.loop_idx]);
loop:
   if (perf!=NULL) {
      gettimeofday(&et1,NULL);
      perf->fill  = 1;
      // elapsed time in microseconds
      perf->etime = (et1.tv_sec - et0.tv_sec)*1000000 + et1.tv_usec - et0.tv_usec;
      }
}
/**
 * Configure the collector
 *
 * Initialize the entropy collector. This consists of determining the L1 cache
 * sizes if not specified and allocating an intermediate walk table twice the
 * size of the L1 data cache. Randomness is generated in the walk table by
 * permuting processor time stamp readings. Because this area is twice the
 * size of the L1 data cache, this process should exercise processor TLBs.
 *
 * Initialization of the associated instruction cache mechanism takes place in
 * the first call to collect_ndrand()
 *
 * icache, dcache - L1 instruction/data cache sizes in KB; a value less than 1
 *                  means "not specified" and triggers configure_hw().
 *
 * Returns 1 on success. Returns 0 if configure_hw() fails or the walk table
 * cannot be allocated; in the latter case andpt is reset to 0 so that ndrand()
 * still sees the collector as unconfigured.
 */
static int configure_ndrand (int icache, int dcache)
{
   int offs, *p;

   info.arch      = ARCH;
   info.vendor    = "";
   info.i_cache   = icache;
   info.d_cache   = dcache;
   info.loop_idx  = -1;                      // forces setup in collect_ndrand()
   info.loop_sz   =
   info.etime     = 0;
   if ((icache<1||dcache<1) && configure_hw()==0)
      return 0;

   // Walk table holds 2*d_cache KB worth of ints; andpt is that count minus one.
   andpt          = ((2*info.d_cache*1024)/sizeof(int))-1;
   // Allocate 4096 spare ints so the table can start on a 4096 byte boundary.
   p              = malloc((andpt + 4097)*sizeof(int));
   if (p==NULL) {
      andpt = 0;
      return 0;
      }
   // offs = distance in ints of &p[4096] past the preceding 4096 byte boundary
   offs           = (int)((((size_t)&p[4096])&0xfff)/sizeof(int));
   havege_pwalk   = &p[4096-offs];
   return 1;
}
/**
 * Auto configuration using cpuid. Note that a command line override is
 * respected (this method is not called if both cache sizes are specified)
 */
#ifdef CPUID
/**
 * cpuid variant: read the vendor id from leaf 0, then try the AMD probe (for
 * "amd" only) and the leaf 2/leaf 4 probe. If the processor has no cpuid
 * instruction, or no probe yields both cache sizes, fall back to the
 * GENERIC_* defaults for whichever size is still unset and flag the
 * configuration as generic.
 *
 * Always returns 1.
 */
static int configure_hw()
{
   unsigned int regs[4] = {0,0,0,0};         // cpuid output, see cpuid()
   int f;

   info.generic = 0;
   HASCPUID(f);
   if (f) {
      cpuid(0,regs,"configure_hw");
      switch(regs[1]) {                      // first 4 characters of the vendor string
         case 0x68747541:  info.vendor = "amd";       break;
         case 0x69727943:  info.vendor = "cyrix";     break;
         case 0x746e6543:  info.vendor = "centaur";   break;   // aka via
         case 0x756e6547:  info.vendor = "intel";     break;
         case 0x646f6547:  info.vendor = "natsemi";   break;
         case 0x52697365:
         case 0x65736952:  info.vendor = "rise";      break;   // now owned by sis
         case 0x20536953:  info.vendor = "sis";       break;
         default:                                     break;   // vendor stays ""
         }
      if (!strcmp(info.vendor,"amd") && configure_amd())
         return 1;
      // regs[0] from leaf 0 is handed on as the largest standard function number
      if (configure_intel(regs[0]))
         return 1;
      }
   // No cpuid instruction (never execute CPUID in that case) or nothing found
   info.generic = 1;
   if (info.d_cache<1)  info.d_cache = GENERIC_DCACHE;
   if (info.i_cache<1)  info.i_cache = GENERIC_ICACHE;
   return 1;
}
/**
 * Automatic configuration for amd
 *
 * As per AMD document 2541, April 2008
 *
 * Queries extended leaf 0x80000000 for the highest extended function and, if
 * leaf 0x80000005 is available, takes the L1 sizes (KB) from the top byte of
 * two of its output registers. Cache sizes already set (command line
 * overrides) are left alone, matching configure_intel().
 *
 * Returns 1 if both info.i_cache and info.d_cache are set afterwards, else 0
 * so that the caller can try its other methods.
 */
static int configure_amd()
{
   unsigned int regs[4] = {0,0,0,0};         // cpuid output, see cpuid()

   cpuid(0x80000000,regs,"configure_amd");
   // Compare the whole value: testing only the low nibble rejects processors
   // whose highest extended leaf is e.g. 0x80000010 or 0x80000020.
   if (regs[0]<0x80000005)
      return 0;
   cpuid(0x80000005,regs,"configure_amd");   // We want the L1 info
   // NOTE(review): AMD documents ECX as the L1 data and EDX as the L1
   // instruction cache descriptor for this leaf. Which of regs[2]/regs[3]
   // receives ECX depends on the CPUID macro, which is not visible here
   // (the debug labels in cpuid() suggest regs[2] is EDX) - confirm that
   // these two assignments are not swapped.
   if (info.d_cache<1)
      info.d_cache   =  (regs[2]>>24) & 0xff;   // l1 data cache
   if (info.i_cache<1)
      info.i_cache   =  (regs[3]>>24) & 0xff;   // l1 instruction cache
   return (info.i_cache>0 && info.d_cache>0)? 1 : 0;
}
/**
 * Automatic configuration for Intel x86 chips (configure_hw() also tries this
 * for every other vendor).
 *
 * lsfn - largest standard cpuid function number (EAX of cpuid leaf 0). Leaf 2
 *        is only queried when lsfn>=2 and leaf 4 only when lsfn>3.
 *
 * Returns 1 if both info.i_cache and info.d_cache are set afterwards, else 0.
 * Cache sizes that are already set (command line overrides) are not changed.
 *
 * Notes: The "pentium4 hack" is to use the trace cache size for the instruction
 *        cache if it is larger than the instruction cache value found.
 *
 *        Recent Intel processor handbooks hint that a processor may return a
 *        cache descriptor of 0xff to say in effect "buzz-off use leaf 4". My
 *        limited testing with leaf 4 indicates that it does not return the
 *        same information as leaf 2 - so in this code leaf 4 is only used as
 *        a fallback....
 */
static int configure_intel(unsigned int lsfn)
{
   // Keep the registers in an unsigned int array (correctly aligned) and view
   // them through a char pointer when scanning for one byte descriptors.
   unsigned int p[4] = {0,0,0,0};
   const unsigned char *regs = (const unsigned char *)p;
  /**
   * As per Intel application note 485, August 2009 the following table contains
   * triples descriptor#, type (0=instruction,1=data,2=trace), size (kb)
   * This table contains only L1 instruction(0), data(1), and trace(2) items.
   */
   static const int desc[] = {
      0x06, 0,  8 , // 4-way set assoc, 32 byte line size
      0x08, 0, 16 , // 4-way set assoc, 32 byte line size
      0x09, 0, 32 , // 4-way set assoc, 64 byte line size +
      0x0a, 1,  8 , // 2 way set assoc, 32 byte line size
      0x0c, 1, 16 , // 4-way set assoc, 32 byte line size
      0x0d, 1, 16 , // 4-way set assoc, 64 byte line size +
      0x10, 1, 16 , // 4-way set assoc, 64 byte line size
      0x15, 0, 16 , // 4-way set assoc, 64 byte line size
      0x2c, 1, 32 , // 8-way set assoc, 64 byte line size
      0x30, 0, 32 , // 8-way set assoc, 64 byte line size
      0x60, 1, 16 , // 8-way set assoc, sectored cache, 64 byte line size
      0x66, 1,  8 , // 4-way set assoc, sectored cache, 64 byte line size
      0x67, 1, 16 , // 4-way set assoc, sectored cache, 64 byte line size
      0x68, 1, 32 , // 4-way set assoc, sectored cache, 64 byte line size
      0x70, 2, 12 , // 8-way set assoc
      0x71, 2, 16 , // 8-way set assoc
      0x72, 2, 32 , // 8-way set assoc
      0x73, 2, 64 , // 8-way set assoc
      0x77, 0, 16 , // 4-way set assoc, sectored cache, 64 byte line size
      0x00, 0,  0   // sentinel
      };
   int i,j,k,n,sizes[] = {0,0,0};            // KB found: instruction, data, trace

   if (lsfn>=2) {                            // leaf 2 must be supported
      cpuid(2,p,"configure_intel");
      n = p[0]&0xff;                         // number of times leaf 2 must be read
      for(i=0;i<n;i++) {
         for(j=0;j<4;j++)                    // bit 31 set: register holds no descriptors
            if (p[j] & 0x80000000) p[j] = 0;
         // Byte 0 is the low byte of p[0] (x86 is little endian), i.e. the
         // repeat count read above, not a descriptor - start at byte 1.
         for(j=1;j<(int)sizeof(p);j++) {
            if (!regs[j]) continue;
            for(k=0;desc[k]!=0;k+=3)
               if (desc[k]==regs[j]) {
                  sizes[desc[k+1]] += desc[k+2];
                  break;
                  }
            // For an unknown descriptor k is left at the sentinel: prints 0 0
            if (info.dbcpuid>0)
               printf("lookup %x %d %d\n", (unsigned int)regs[j], desc[k+1], desc[k+2]);
            }
         if ((i+1)!=n)
            cpuid(2,p,"configure_intel(2)");
         }
      }
   if (sizes[0]<sizes[2])                    // pentium4 hack
      sizes[0] = sizes[2];
   if ((sizes[0]==0||sizes[1]==0) && lsfn>3) {
      int level, type, ways, parts, lines, sets;
      for(i=0;i<15;i++) {
         p[3] = i;                           // cache index to query
         cpuid(4,p,"configure_intel(3)");
         if ((type=p[0]&0x1f)==0) break;     // No more info
         level = (p[0]>>5)&7;
         lines = p[1] & 0xfff;
         parts = (p[1]>>12) & 0x3ff;
         ways  = (p[1]>>22) & 0x3ff;
         sets  = (int)p[3];
         // every field is reported as value-1; the product is the size in bytes
         n     = (int)(((unsigned int)(ways+1)*(parts+1)*(lines+1)*(p[3]+1))/1024);
         if (info.dbcpuid>0)
            printf("type=%d,level=%d,ways=%d,parts=%d,lines=%d,sets=%d: %d\n",
               type,level,ways+1,parts+1,lines+1,sets+1,n);
         if (level==1)
            switch(type) {
               case 1:  sizes[1] = n;  break;      // data
               case 2:  sizes[0] = n;  break;      // instruction
               case 3:  sizes[2] = n;  break;      // unified (recorded, not used below)
               }
         }
      }
   if (info.i_cache<1)
      info.i_cache   = sizes[0];
   if (info.d_cache<1)
      info.d_cache   = sizes[1];
   if (info.i_cache>0 && info.d_cache>0)
      return 1;
   return 0;
}
/**
 * Execute CPUID for function fn, leaving the four result registers in p[0..3].
 *
 * When info.dbcpuid is set the results are also dumped to stdout: a heading
 * of tag and fn, then one line per register with its hex value and its four
 * bytes rendered as text (most significant byte first, '.' for bytes that
 * are not printable).
 */
static void cpuid(int fn, unsigned int *p, char * tag)
{
   static const char reg_names[] = "ABDC";
   char text[sizeof(int)+1];
   int r, pos;

   CPUID(fn,p);
   if (info.dbcpuid<=0)
      return;

   printf("%s:%d\n",tag,fn);
   for (r=0;r<4;r++) {
      unsigned int v = p[r];

      text[sizeof(int)] = 0;
      // peel bytes off the low end, filling the text from right to left
      for (pos=(int)sizeof(int)-1;pos>=0;pos--) {
         int ch = (int)(v & 0xff);

         text[pos] = isprint(ch)? ch : '.';
         v >>= 8;
         }
      printf("E%cX %10x %s\n", reg_names[r], p[r], text);
      }
}
#else
/**
 * Auto configuration for processor w/o cpuid
 *
 * Nothing can be probed here: any cache size that is still unset gets its
 * GENERIC_* default and the configuration is flagged as generic.
 * Always returns 1.
 */
static int configure_hw()
{
   const int have_both = (info.i_cache>0 && info.d_cache>0);

   if (!have_both) {
      info.generic = 1;
      if (info.i_cache<1)
         info.i_cache = GENERIC_ICACHE;
      if (info.d_cache<1)
         info.d_cache = GENERIC_DCACHE;
      }
   return 1;
}
#endif
/**
 * Configure the collection loop and seed the generator.
 *
 * cpuid debug output is enabled when params->run_level is 1 and
 * params->verbose is positive. After a successful configuration the
 * collector is run MININITRAND*CRYPTOSIZECOLLECT/NDSIZECOLLECT times and the
 * output offset is rewound.
 *
 * Returns 1 on success, 0 if configure_ndrand() fails.
 */
int ndinit(struct pparams *params, struct hperf *perf)
{
   int passes;

   info.dbcpuid = 0;
   if (params->run_level==1 && params->verbose>0)
      info.dbcpuid = 1;

   if (!configure_ndrand(params->i_cache,params->d_cache))
      return 0;

   passes = MININITRAND*CRYPTOSIZECOLLECT/NDSIZECOLLECT;
   while (passes > 0) {
      collect_ndrand(perf);
      passes--;
      }
   havege_ndpt = 0;
   return 1;
}
/**
 * Return monitor information
 */
void  ndinfo(struct hinfo *rv)
{
   memcpy((void *)rv, (void *)&info, sizeof(struct hinfo));
}
/**
 * Main access point
 *
 * Returns the next int from the output buffer, running the collector first
 * when the buffer has been used up. Terminates the process via exit(0) if
 * the collector has not been configured (andpt is still 0).
 */
int ndrand(struct hperf *perf)
{
   int slot;

   if (!andpt)
      exit(0);
   if (havege_ndpt >= NDSIZECOLLECT) {       // buffer exhausted, refill
      collect_ndrand(perf);
      havege_ndpt = 0;
      }
   slot = havege_ndpt;
   havege_ndpt = slot + 1;
   return havege_bigarray[slot];
}
