// compute.c
// A program to use as a test fixture for Gaussian distribution experiments
// with a large number of sample points.  The program contains various
// N(0,1) Generators
#include <stdio.h>
#include <math.h>

// (Rougly) How many samples to compute/take stats on before quitting
#define MAXN		1e8

// Which Gaussian Random Generator to use?
// RejectGaussian , BMGaussian, NBMGaussian, NBMGaussianDrift,
// KMGaussian, MontyPythonGaussian, Haruspex1Gaussian
#define GAUSSIAN Haruspex1Gaussian

// The following define causes the random generator to be de-stabalized 
// by only reading half the values from it before re-shuffling, this
// destroys the guarantee that every bit position of the random bytes and
// uniform distribution is exactly 50% likely within a fixed period of samples
// set 1 to cause instability, 0 causes it to remain stable.

#define UNSTABLE_RANDOM 0

// Seed value for the random number generator
//#define SEED 256
#define SEED 372

// This randomizer shuffles blocks of 256 bytes, in a pool of 4096 bytes.
// Each block has every possible byte value exactly once
//
// Data from the whole distribution is sampled every 389th byte,
// which is relatively prime to pool size and guaranteed to choose every
// byte in the array by uniformly sampling from different blocks and without
// repeating any sampled value.
//
// After every byte in the pool is read, the pool is re-shuffled and
// sampling continued from the location from which it left off.
//
// Sampling in this fashion guarantees that every bit in every position of the
// sampled bytes has exactly a 50% chance of being selected within a period
// of 4096 samples.
//
// Since the uniform distribution generator uses 5 bytes at a time,
// which is also relatively prime to *both* the pool size, and the traversal
// rate, the 5 byte alignment of the uniform generator's samples
// does not repeatedly correlate any particular byte of the sample
// with any particular block, or blocks. All blocks are equally likely.
//
// These properties, I hope, are sufficient to produce a decent randomization
// for the statistic, and without recourse to external random seeds

#define mask (4096-1) 
static unsigned char mixArray[mask+1];

// Shuffle every byte in the randomizer array at least once.
static void InitRandom(unsigned char seed) {
	static int sample = 0, p0 = -1;
	static unsigned char byte=0, swap=11;
	int i;
	
	if (p0<0) for (i=0; i<mask+1; ++i) mixArray[i]=(unsigned char)i;

	sample = (sample + seed<<8) & mask;
	swap += seed;

	for (i=0; i<mask+1; i+=3 ){
		sample  = (sample + 768) & mask;
		swap    = mixArray[ sample + (swap^0xAA) ];
		p0 = (i & ~(0xFF)) + swap;
		
		byte = mixArray[ i ];
		mixArray[ i  ] = mixArray[ p0 ];
		mixArray[ p0 ] = byte;
	}
}

// Traverse the random pool, selecting nearly orthogonal random byte values
//
#if UNSTABLE_RANDOM
# define SAMPLE_LEN mask/2
#else
# define SAMPLE_LEN (mask+1)
#endif

unsigned char RandomByte(void){
	static int p0=0;
	static int count=SAMPLE_LEN;

	p0 += 421; // Always get bytes from different bins, skip bins often...
	if ( !--count ) {
		count=SAMPLE_LEN;
		InitRandom( mixArray[p0 & mask] + 151 );
		p0 += 839; // never start in the same place for 4096 tries...
	}
	return mixArray[ p0 & mask ];
}

// Generate a random double on [0,1) from 5 random bytes
double Random01(){
	static double adjust = 1.0/(1ULL<<5*8);
	static unsigned long long result=0;
	unsigned char* p = ((unsigned char*)(void*)&result)-1;	
	int i;
	
	for( i=0; i<5; ++i) *(++p) = RandomByte();
	return (double)result * adjust;
}

// -------------------------------------------------------------------
// These are the various Gaussian distribution based random generators
// They all produce a normal distribution with mean 0, and std. deviation 1.
// (at least in theory...)

double RejectGaussian(void){
	// My own idea...
	// Generate a Gaussian by simply accepting (1-rejecting) random
	// values in proportion to the relative Gaussian probability
	// The generator produces a maximum z of 8 deviations 
	
	static unsigned char sign=0;
	static double u1=5,u2,z;

	while (1){ // loop until a non-rejected value is found
		u2 = u1;	// roll the numbers through
		u1=Random01();  // pick a number

		if ( exp( -0.5*64.*u1*u1 )>u2) {
			z =  u1*8;
			break;
		}
	}

	// Randomly generate a sign...
	sign >>=1;
	if (sign<2) sign=RandomByte();
	return (sign&1) ? -z:z;
}

double BMGaussian(void){
	// Use the "Box Muller" method to generate gaussian distribution
	double z,u1,u2;
	static double zz;
	if (zz) { z=zz; zz=0; return z; }
	u1=Random01(); u2=Random01();
	z=sqrt( -2. * log( 1.-u1 ) );
	zz=z*sin( 2*M_PI*u2 ); // Save an orthogonal value for the next pass.
	return z*cos( 2*M_PI*u2 );
}

double NBMGaussian(void){
	// Use the No-Sine "Box Muller" method to generate Gaussian distribution
	// Removes biases (if any) from OS/CPU's sin() and pi accuracy

	double xs1, xs2, z, w;
	do {
		xs1=2.0*Random01()-1;
		xs2=Random01();
		w = xs1*xs1 + xs2*xs2;
	} while (w >= 1.0 );

	z = xs1 * sqrt( (-2.0 * log( w ) )/w );
	return z;
}

double NBMGaussianDrift(void){
	// Use the No-Sine "Box Muller" method to generate a Gaussian
	// This version includes standard deviation correction to keep
	// the standard deviation closer to ideal for moderately sized
	// samples...	
	
	static double lzz=1.0;
	double xs1, xs2, z, w, drift=1.0;
	do {
		xs1=2.0*Random01()-1; // Cancel any net mean value...
		xs2=Random01();
		w = xs1*xs1 + xs2*xs2;
	} while (w >= 1.0 );

	// Compute a drift correction
	z = xs1 * sqrt( (-2.0 * log( w ) )/w ) * drift;
	w = z*z;
	drift += (w + lzz)>2.0 ? -1e12:1e12; // Too big scales down, ...
	lzz = w;
	return z;
}

double KMGaussian(void) {
	// Kinderman and Monahan method. Reference: Kinderman,
        // A.J. and Monahan, J.F., "Computer generation of random
        // variables using the ratio of uniform deviates", ACM Trans
        // Math Software, 3, (1977), pp257-260.
	
	while (1) {
            double u1 = Random01();
            double u2 = 1.0 - Random01();
            double zz,z = 1.7155277699214135*(u1-0.5)/u2;
            zz = z*z/4.0;
            if (zz <= -log(u2)) return z;
	}
}

// ------------------
// Monty Python Gaussian 
// Andrew Robinson's version June 2012.

double MontyPythonGaussianCore(void) {
// A rapid partitioning of the bell curve into optimized computation areas 
// that can be additively re-maped for fast generation of a normal distribution 
// The program as shown does not have tail correction....
// And hasn't been carefully debugged; test it before using it...
//
// See:
// http://www.physicsforums.com/attachment.php?attachmentid=48387&d=1339819476
// http://www.physicsforums.com/showthread.php?t=611422
//
static const double spin  =0.966804869573191;
static const double spin2 =0.966804869573191*2.0; // insure compute once... 
static const double iTop  =1.59576912160573;      // 1.0 / sqrt(pi/8)

// ITOP is 1/ the height of the random generator areaBox.
// Spin is a partitioning value arbitrarily chosen to make
// the areaBox have a horizontal(z) length of exactly 2,
// which means random generation may use addition or shifting
// rather than multiplication to scale a uniform random to a z of [0,2) 
// The areabox has an area equal to exactly half (the positive half) of
// a bell curve.
//
// 2.0*exp( -0.5*SPIN*SPIN ) = 2.0/ITOP = sqrt(2*pi)/2.0

// For speed of calculation reasons, bounding polynomials are used *FIRST*
// and an exact gaussian exp( 0.5*z*z ) is only made if an error is possible. 
// Other bounding functions are possible depending on machine hardware...

	double z = Random01()*2.;
	double u1, lowerN, upperN, ebar;

	if (z<=spin) return z; // Area region A1 always returns...

	if (z>spin2) { // Tail region computation for sure... GO recursive!
		return  spin2 + MontyPythonGaussianCore()*0.609743713386175;
		// A correction ought to go here, but I'm ignoring it for now.
	}

	// Otherwise, it must be in regions A2,A3,error.
	// And requires a "y" point.
	u1 = Random01();

	// Compute an estimated bound -- and if too close -- compute exactly
	ebar   = 0.008; // maximum polynomial error is 0.8% (<0.5% avg).
	lowerN =((0.27*z)-1.57009)*z+2.26458;

	// The following if operation could be re-ordered with the return's
	// depending on which is faster, a caluculation -- or a cache miss...
	if (u1      <= lowerN) return z;
	if (u1-ebar <= lowerN) {
		lowerN = exp(-0.5*z*z) * iTop; // Compute the exact value.
		if (u1<=lowerN) return z; // barely below/on the curve of A2.
	 }

	 // At this point, the data *must* be above curve a2. 
	 // curves e0, and a3 are mirrored...
	 z = spin2 - z;

	 ebar=0.00246; // Maximum polynomial error is 0.246% (<~0.15% avg.)
	 upperN  =  (( -0.2922*z + 0.9374)*z - 0.0171)*z + 0.40485;
	 
	 if (u1        > upperN) return z;
	 if (u1 + ebar > upperN) {
		upperN = 2.0 - exp( -0.5*z*z ) * iTop;
		if (u1 >= upperN) return z;
	 }

	 // There is no other possibilty, the value is an e0
	 // Return it as a tail region data point...
	 
	 return spin2 + z*1.02803051369197;
}


double MontyPythonGaussian(void){
	// My own idea...
	static unsigned char sign=0;
	double z = MontyPythonGaussianCore();

	// Randomly generate a sign...
	z = (sign&1) ? z:-1;	
	if (!(sign>>=1)) sign=RandomByte();

	return z;
}


double MontyPythonGaussian2(void){
	// My own idea...
	static double queue[4] = {0,0};
	static double result=0;
	static int pos=0;
	static unsigned char sign=0;
	double z = MontyPythonGaussianCore();

	// Randomly generate a sign...
	z = (sign&1) ? z:-1;	
	if (!(sign>>=1)) sign=RandomByte();

	result += z - queue[pos];
	queue[pos] = z;
	pos = (pos+1)&0x3;
	return result/2.0;
}

double Haruspex1Gaussian(void) {
	// Mapping techinique
	// http://www.physicsforums.com/showpost.php?p=3961012&postcount=14
	double z,y,gz,dg;

	while (1) {
		z=Random01();
		y=Random01() * 2.43;
		gz=1/(1.-z);
		dg = gz*gz;
		gz-=1.0;
		if (y< exp(-0.5*gz*gz)*dg) return gz;	
	}
}

// --------------------------------------------------------------------------
//     main(), the place where the numerical experiment is carried out
// --------------------------------------------------------------------------

int main(void) {
	long double a=0.;	// accumulation
	double tmp;	// temporary for calculation
	double n=0.;	// number of items counted
	double plus=0.;	// Number of items being positive, for skew measurement 
	double result;	// The resultant statistic to be measured

	// ---------------------------------------------
	// Initialize the all important random generator
	
	for (tmp=0; tmp < SEED; tmp+=1) InitRandom( (int)tmp );

	// ---------------------------------------------
	// calculate the statistic using n iterations 
	
	while ( (n+=1) < MAXN ) {
		// ---------------------------------------
		// Generate a sample and statistic
		
		tmp  = GAUSSIAN() + GAUSSIAN(); // Generate a Gaussian sample

		if (tmp>0) plus+=1.0;	// Keep track of skew (if any...)
		a += (tmp*tmp);

		// Compute the overall result ... but only occasionally
		if ( ((long)n & 0x1fffff)==0 ){
			result = sqrt( (double)(a/n) );

		// ---------------------------------------------------------
		// From here down is just printing the most stable estimates
		
			static char display[48]="", next[48];
			static char candidate='!';
			static int  stable=0;
			char *d=display, *nx=next;
			
		       	sprintf( next,"%16.14f", result ); //Get potential value

			// Find the (still) non-changing portion of the value
			while (*d && *nx && *d==*nx) {++d;++nx;}

			// Keep track of the next potential digit/character
			if (candidate != *nx) { // if digit was unstable, retry
				stable=0;
				candidate=*nx;
		       	} else if (!(++stable&0x3)) { // 4 tries before extends
				*d++=*nx++;	// extend display with candidate
				candidate=*nx;
			}
			*d=0; // terminate the displayable result string

			// Print the quasi-stable result
			printf("%s plus=%9.7f n=%4.2e \r",display,(plus/n),n);
			fflush(stdout); // avoid nuissance buffer delays...
		} // end of the print routine

	} // while more samples to compute, do them...
	printf("\n%16.14f\n", result);
	return 0; // Always tell OS that computation was a "success".
}// end main()