40 changed files with 15572 additions and 9055 deletions
-
2configure.ac
-
59examples/transient-noise/noi-ring51-demo.cir
-
53examples/transient-noise/noi-sc-tr.cir
-
56examples/transient-noise/noilib-demo.h
-
27examples/transient-noise/shot_ng.cir
-
5src/Makefile.am
-
4src/frontend/Makefile.am
-
298src/frontend/com_fft.c
-
1src/frontend/com_fft.h
-
8src/frontend/commands.c
-
62src/frontend/trannoise/1-f-code.c
-
61src/frontend/trannoise/1-f-code_d.c
-
846src/frontend/trannoise/FastNorm3.c
-
10src/frontend/trannoise/Makefile.am
-
532src/frontend/trannoise/wallace.c
-
7src/include/1-f-code.h
-
32src/include/FastNorm3.h
-
4src/include/bool.h
-
108src/include/fftext.h
-
13src/include/ngspice.h
-
22src/include/wallace.h
-
25src/main.c
-
4src/maths/Makefile.am
-
13src/maths/fft/Makefile.am
-
37src/maths/fft/NOTE
-
70src/maths/fft/Read Me
-
156src/maths/fft/fftext.c
-
106src/maths/fft/fftext.h
-
3175src/maths/fft/fftlib.c
-
76src/maths/fft/fftlib.h
-
297src/maths/fft/matlib.c
-
33src/maths/fft/matlib.h
-
24src/maths/misc/randnumb.c
-
3src/spicelib/devices/vsrc/vsrc.c
-
51src/spicelib/devices/vsrc/vsrcacct.c
-
1src/spicelib/devices/vsrc/vsrcask.c
-
11src/spicelib/devices/vsrc/vsrcdefs.h
-
442src/spicelib/devices/vsrc/vsrcload.c
-
7src/spicelib/devices/vsrc/vsrcpar.c
-
60visualc/vngspice.vcproj
@ -0,0 +1,59 @@ |
|||
* 51 stage Ring-Osc. BSIM3, transient noise |
|||
* will need 45 min on a i7 860 with 4 threads |
|||
|
|||
* closes the loop between inverters xinv1 and xinv5 |
|||
vin in out dc 0.5 pulse 0.5 0 0.1n 5n 1 1 1 |
|||
|
|||
vdd dd 0 dc 0 pulse 0 2.2 0 1n 1 1 1 |
|||
|
|||
vss ss 0 dc 0 |
|||
ve sub 0 dc 0 |
|||
|
|||
vpe well 0 2.2 |
|||
|
|||
* noisy inverters |
|||
xiinv2 dd ss sub well out25 out50 inv253 |
|||
xiinv1 dd ss sub well in out25 inv253 |
|||
|
|||
*very noisy inverter |
|||
xiinv5 dd ss sub well out50 out inv1_2 |
|||
*output amplifier |
|||
xiinv11 dd ss sub well out25 bufout inv1 |
|||
cout bufout ss 0.2pF |
|||
|
|||
.option itl1=500 gmin=1e-15 itl4=10 noacct |
|||
|
|||
* .dc vdd 0 2 0.01 |
|||
.tran 0.01n 500n |
|||
|
|||
.save in bufout v(t1) |
|||
|
|||
.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.nmos |
|||
.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.pmos |
|||
|
|||
.include noilib-demo.h |
|||
|
|||
.control |
|||
unset ngdebug |
|||
* first run |
|||
save bufout $ needed for restricting memory usage |
|||
rusage |
|||
tran 8p 10000n |
|||
rusage |
|||
plot bufout xlimit 90n 95n |
|||
linearize |
|||
fft bufout |
|||
* next run |
|||
reset |
|||
save bufout |
|||
alter @v.xiinv5.vn1[trnoise] = [ 0 0 0 0 ] $ no noise |
|||
tran 8p 10000n |
|||
rusage |
|||
plot bufout xlimit 90n 95n |
|||
linearize |
|||
fft bufout |
|||
plot mag(bufout) mag(sp2.bufout) xlimit 0 2G ylimit 1e-11 0.1 ylog |
|||
.endc |
|||
|
|||
|
|||
.end |
|||
@ -0,0 +1,53 @@ |
|||
* simple sample & hold, transient noise |
|||
|
|||
* switch control |
|||
* PULSE(V1 V2 TD TR TF PW PER) |
|||
vgate1 ga1 0 dc 0 pulse (0 1 0 10n 10n 90n 200n) |
|||
|
|||
Switch1 1 2 ga1 0 smodel1 |
|||
|
|||
* noisy input |
|||
* rms value white, time step, exponent < 2, rms value 1/f |
|||
vin 1 0 dc 0 trnoise 0.1m 0.2n 1 0.1m |
|||
*vin 1 0 dc 0 trnoise 0.1m 0.2n 0 0.1m |
|||
|
|||
* output |
|||
c2 2 0 10p |
|||
|
|||
* second S&H |
|||
vgate2 ga2 0 dc 0 pulse (0 1 140n 10n 10n 30n 200n) |
|||
*Buffer EXXXXXXX N+ N- NC+ NC- VALUE |
|||
e1 4 0 2 0 1 |
|||
Switch2 4 3 ga2 0 smodel2 |
|||
c3 3 0 10p |
|||
|
|||
.option itl1=500 gmin=1e-15 itl4=10 acct |
|||
|
|||
.model smodel1 sw vt=0.5 ron=100 |
|||
.model smodel2 sw vt=0.5 ron=100 |
|||
|
|||
.tran 0.4n 100u |
|||
|
|||
|
|||
.control |
|||
unset ngdebug |
|||
set filetype=ascii |
|||
rusage |
|||
run |
|||
rusage all |
|||
write noi_test.out v(1) |
|||
plot v(2) v(3) xlimit 4u 5u |
|||
plot v(ga1) v(ga2) xlimit 4u 5u |
|||
linearize |
|||
*rms v(1) |
|||
fft v(3) |
|||
plot mag(v(3)) loglog xlimit 1e4 1e8 ylimit 1e-10 1e-4 |
|||
setplot tran1 |
|||
linearize |
|||
psd 101 v(3) |
|||
plot mag(v(3)) xlimit 0 3e7 ylimit 0 10u |
|||
|
|||
.endc |
|||
|
|||
|
|||
.end |
|||
@ -0,0 +1,56 @@ |
|||
|
|||
* standard inverter made noisy |
|||
*.subckt inv1 dd ss sub well in out |
|||
*vn1 out outi dc 0 noise 0.1 0.3n 1.0 0.1 |
|||
*mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
|||
*mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
|||
*.ends inv1 |
|||
|
|||
* standard inverter |
|||
.subckt inv1 dd ss sub well in out |
|||
mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
|||
mp1 out in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
|||
.ends inv1 |
|||
|
|||
* very noisy inverter (noise on vdd and well) |
|||
.subckt inv1_1 dd ss sub well in out |
|||
vn1 dd idd dc 0 trnoise 0.05 0.05n 1 0.05 |
|||
vn2 well iwell dc 0 trnoise 0.05 0.05n 1 0.05 |
|||
mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
|||
mp1 out in idd iwell p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
|||
*Cout out 0 0.1p |
|||
.ends inv1_1 |
|||
|
|||
|
|||
* another very noisy inverter |
|||
.subckt inv1_2 dd ss sub well in out |
|||
vn1 out outi dc 0 trnoise 0.05 8p 1.0 0.001 |
|||
mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
|||
mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
|||
*Cout out 0 0.1p |
|||
.ends inv1_2 |
|||
|
|||
* another very noisy inverter with current souces parallel to transistor |
|||
.subckt inv13 dd ss sub well in outi |
|||
in1 ss outi dc 0 noise 200u 0.05n 1.0 50u |
|||
mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
|||
in2 dd outi dc 0 noise 200u 0.05n 1.0 50u |
|||
mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
|||
*Cout out 0 0.1p |
|||
.ends inv13 |
|||
|
|||
.subckt inv53 dd ss sub well in out |
|||
xinv1 dd ss sub well in 1 inv1 |
|||
xinv2 dd ss sub well 1 2 inv1 |
|||
xinv3 dd ss sub well 2 3 inv1 |
|||
xinv4 dd ss sub well 3 4 inv1 |
|||
xinv5 dd ss sub well 4 out inv1 |
|||
.ends inv53 |
|||
|
|||
.subckt inv253 dd ss sub well in out |
|||
xinv1 dd ss sub well in 1 inv53 |
|||
xinv2 dd ss sub well 1 2 inv53 |
|||
xinv3 dd ss sub well 2 3 inv53 |
|||
xinv4 dd ss sub well 3 4 inv53 |
|||
xinv5 dd ss sub well 4 out inv53 |
|||
.ends inv253 |
|||
@ -0,0 +1,27 @@ |
|||
* Shot noise test with B source, diode |
|||
* voltage on device (diode, forward) |
|||
Vdev out 0 DC 0 PULSE(0.4 0.45 10u) |
|||
* diode, forward direction, to be modeled with noise |
|||
D1 mess 0 DMOD |
|||
.model DMOD D IS=1e-14 N=1 |
|||
X1 0 mess out ishot |
|||
* device between 1 and 2 |
|||
* new output terminals of device including noise: 1 and 3 |
|||
.subckt ishot 1 2 3 |
|||
* white noise source with rms 1V |
|||
VNG 0 11 DC 0 TRNOISE(1 1n 0 0) |
|||
*measure the current i(v1) |
|||
V1 2 3 DC 0 |
|||
* calculate the shot noise |
|||
* sqrt(2*current*q*bandwidth) |
|||
BI 1 3 I=sqrt(2*abs(i(v1))*1.6e-19*1e7)*v(11) |
|||
.ends ishot |
|||
* 20000 sample points |
|||
.tran 1n 20u |
|||
.control |
|||
run |
|||
plot (-1)*i(vdev) |
|||
meas tran vdev_rms avg i(vdev) from=0u to=9.9u |
|||
meas tran vdev_rms avg i(vdev) from=10.1u to=20u |
|||
.endc |
|||
.end |
|||
@ -0,0 +1,62 @@ |
|||
/* Copyright: Holger Vogt, 2008 |
|||
Generates 1/f noise values according to: |
|||
"Discrete simulation of colored noise and stochastic |
|||
processes and 1/fa power law noise generation" |
|||
Kasdin, N.J.; |
|||
Proceedings of the IEEE |
|||
Volume 83, Issue 5, May 1995 Page(s):802 - 827 |
|||
*/ |
|||
|
|||
#include <math.h> |
|||
#include <stdio.h> |
|||
#include <stddef.h> |
|||
#include <stdlib.h> |
|||
#include <stdarg.h> // var. argumente |
|||
#include "1-f-code.h" |
|||
#include "ngspice.h" |
|||
|
|||
#include "fftext.h" |
|||
#include "wallace.h" |
|||
|
|||
|
|||
void f_alpha(int n_pts, int n_exp, float X[], float Q_d, |
|||
float alpha) |
|||
{ |
|||
int i; |
|||
float *hfa, *wfa; |
|||
float ha; |
|||
|
|||
ha = alpha/2.0f ; |
|||
// Q_d = sqrt(Q_d); /* find the deviation of the noise */ |
|||
hfa = TMALLOC(float,n_pts); |
|||
wfa = TMALLOC(float,n_pts); |
|||
hfa[0] = 1.0f; |
|||
wfa[0] = Q_d * (float)GaussWa; |
|||
/* generate the coefficients hk */ |
|||
for (i=1 ; i < n_pts; i++) { |
|||
/* generate the coefficients hk */ |
|||
hfa[i] = hfa[i-1] * (ha + (float)(i-1)) / ( (float)(i) ); |
|||
/* fill the sequence wk with white noise */ |
|||
wfa[i] = Q_d * (float)GaussWa; |
|||
} |
|||
|
|||
// for (i=0 ; i < n_pts; i++) |
|||
// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); |
|||
|
|||
/* perform the discrete Fourier transform */ |
|||
fftInit(n_exp); |
|||
rffts(hfa, n_exp, 1); |
|||
rffts(wfa, n_exp, 1) ; |
|||
|
|||
/* multiply the two complex vectors */ |
|||
rspectprod(hfa, wfa, X, n_pts); |
|||
/* inverse transform */ |
|||
riffts(X, n_exp, 1); |
|||
|
|||
free(hfa) ; |
|||
free(wfa); |
|||
/* fft tables will be freed in vsrcaccept.c and isrcaccept.c |
|||
fftFree(); */ |
|||
fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); |
|||
} |
|||
|
|||
@ -0,0 +1,61 @@ |
|||
/* Copyright: Holger Vogt, 2008 |
|||
Discrete simulation of colored noise and stochastic |
|||
processes and 1/fa power law noise generation |
|||
Kasdin, N.J.; |
|||
Proceedings of the IEEE |
|||
Volume 83, Issue 5, May 1995 Page(s):802 - 827 |
|||
*/ |
|||
|
|||
#include <math.h> |
|||
#include <stdio.h> |
|||
#include <stddef.h> |
|||
#include <stdlib.h> |
|||
#include <stdarg.h> // var. argumente |
|||
#include "1-f-code.h" |
|||
#include "ngspice.h" |
|||
|
|||
#include "fftext.h" |
|||
#include "wallace.h" |
|||
|
|||
|
|||
void f_alpha(int n_pts, int n_exp, double X[], double Q_d, |
|||
double alpha) |
|||
{ |
|||
unsigned int i; |
|||
double *hfa, *wfa; |
|||
double ha; |
|||
|
|||
ha = alpha/2.0f ; |
|||
// Q_d = sqrt(Q_d); /* find the deviation of the noise */ |
|||
hfa = TMALLOC(double,n_pts); |
|||
wfa = TMALLOC(double,n_pts); |
|||
hfa[0] = 1.0f; |
|||
wfa[0] = Q_d * GaussWa; |
|||
/* generate the coefficients hk */ |
|||
for (i=1 ; i < n_pts; i++) { |
|||
/* generate the coefficients hk */ |
|||
hfa[i] = hfa[i-1] * (ha + (double)(i-1)) / ( (double)(i) ); |
|||
/* fill the sequence wk with white noise */ |
|||
wfa[i] = Q_d * GaussWa; |
|||
} |
|||
|
|||
// for (i=0 ; i < n_pts; i++) |
|||
// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); |
|||
|
|||
/* perform the discrete Fourier transform */ |
|||
fftInit(n_exp); |
|||
rffts(hfa, n_exp, 1); |
|||
rffts(wfa, n_exp, 1) ; |
|||
|
|||
/* multiply the two complex vectors */ |
|||
rspectprod(hfa, wfa, X, n_pts); |
|||
/* inverse transform */ |
|||
riffts(X, n_exp, 1); |
|||
|
|||
free(hfa) ; |
|||
free(wfa); |
|||
/* fft tables will be freed in vsrcaccept.c and isrcaccept.c |
|||
fftFree(); */ |
|||
fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); |
|||
} |
|||
|
|||
@ -0,0 +1,846 @@ |
|||
/* This is file FastNorm3.c */ |
|||
/* SUPERCEDES FastNorm.c, FastNorm2.c. Use with FastNorm3.h */ |
|||
/* 24 June 2003 */ |
|||
|
|||
/* A package containing a very fast generator of pseudo-random |
|||
Unit NORMAL variates, and some fairly high-quality UNIFORM |
|||
generators. It also contains a straightforward implementation of |
|||
a ChiSquared and Gamma generator copied from Ahrens and Dieter. |
|||
*/ |
|||
|
|||
/* Version 3 with double transformations and controllable extension |
|||
to repeat the double transformations for higher quality at lower |
|||
speed. |
|||
Dated 17 May 20003. |
|||
Copyright Christopher Stewart Wallace. |
|||
*/ |
|||
/* |
|||
%A C. S. Wallace |
|||
%T Fast Pseudo-Random Generators for Normal and Exponential Variates. |
|||
%J ACM Trans. Math. Software |
|||
%V 22 |
|||
%N 1 |
|||
%P 119-127 |
|||
%M MAR |
|||
%D 1996 |
|||
%O TR 94/197, May 1994, Dept. Computer Science, Monash University |
|||
%K CSW, CSWallace, Monash, pseudo random number generator, algorithm, |
|||
jrnl, TOMS, numbers, normal, probability, distribution, PRNG, RNG, Gaussian, |
|||
distribution, jrnl, ACM, TOMS, TR 94 197, TR197, c1996, c199x, c19xx |
|||
*/ |
|||
/* Use of this package requires the file "FastNorm3.h" which must be |
|||
#include-ed in any C files using this package. |
|||
|
|||
The main purpose of this package is to provide a very fast source |
|||
of pseudo-random variates from the Unit Normal N(0,1) distribution, having |
|||
the density function |
|||
|
|||
f(x) = (1/sqrt(2*PI)) * exp (-0.5 * x^2) |
|||
|
|||
Variates are obtained not by calling a function, but by use of a macro |
|||
"FastNorm" defined in FastNorm3.h. In a C program, this macro may appear |
|||
anywhere a (double) expression could appear, e.g in statements like |
|||
z += FastNorm; |
|||
if (FastNorm < 1.1) ..... |
|||
q = fabs (FastNorm); etc. |
|||
|
|||
The revision history, and a reference to the method description, is given |
|||
later in this file under the heading "Revision history Fastnorm". |
|||
|
|||
Major sections of this file, such as the revision history and the |
|||
major subroutines, are all headed by a line containing a row of minus signs (-) |
|||
and the name of the section or subroutine. |
|||
|
|||
The generators included are: |
|||
a Uniform source of integers, unsigned integers and doubles. |
|||
Chi-sq(N) (based on Ahrens and Dieter) |
|||
Gamma(N) (= 0.5 * Chi-sq(2N)) |
|||
Normal (a very fast routine) |
|||
*/ |
|||
|
|||
/* ----------------- inclusions and some definitions ------------ */ |
|||
#include <math.h> |
|||
#ifndef NOSPICE |
|||
#include "ngspice.h" |
|||
#endif |
|||
#include "FastNorm3.h" |
|||
|
|||
|
|||
/* --------------- (Uniform) c7rand, irandm, urandm ---------- */ |
|||
/* |
|||
c A random number generator called as a function by |
|||
c c7rand (iseed) or irandm (iseed) or urandm (iseed) |
|||
|
|||
c The parameter should be a pointer to a 2-element Sw vector. |
|||
c The first call gives a double uniform in 0 .. 1. |
|||
c The second gives an Sw integer uniform in 0 .. 2**31-1 |
|||
c The third gives an Sw integer with 32 bits, so unif in |
|||
c -2**31 .. 2**31-1 if used in 32-bit signed arithmetic. |
|||
c All update iseed[] in exactly the same way. |
|||
c iseed[] must be a 2-element Sw vector. |
|||
c The initial value of iseed[1] may be any 32-bit integer. |
|||
c The initial value of iseed[0] may be any 32-bit integer except -1. |
|||
c |
|||
c The period of the random sequence is 2**32 * (2**32-1) |
|||
c Its quality is quite good. It is based on the mixed multiplicative |
|||
c congruential (Lehmer) generator |
|||
x[n+1] = (69069 * x[n] + odd constant) MOD 2^32 |
|||
c but avoids most of the well-known defects of this type of generator |
|||
c by, in effect, generating x[n+k] from x[n] as defined by the |
|||
c sequence above, where k is chosen randomly in 1 ... 128 with the |
|||
c help of a subsidiary Tauseworth-type generator. |
|||
c For the positve integer generator irandm, the less |
|||
c significant digits are more random than is usual for a Lehmer |
|||
c generator. The last n<31 digits do not repeat with a period of 2^n. |
|||
c This is also true of the unsigned integer generator urandm, but less |
|||
c so. |
|||
|
|||
c This is an implementation in C of the algorithm described in |
|||
c Technical Report "A Long-Period Pseudo-Random Generator" |
|||
c TR89/123, Computer Science, Monash University, |
|||
c Clayton, Vic 3168 AUSTRALIA |
|||
c by |
|||
c |
|||
c C.S.Wallace csw@cs.monash.edu.au |
|||
|
|||
c The table mt[0:127] is defined by mt[i] = 69069 ** (128-i) |
|||
*/ |
|||
|
|||
#define MASK ((Sw) 0x12DD4922) |
|||
/* or in decimal, 316492066 */ |
|||
#define SCALE ((double) 1.0 / (1024.0 * 1024.0 * 1024.0 * 2.0)) |
|||
/* i.e. 2 to power -31 */ |
|||
|
|||
static Sw mt [128] = { |
|||
902906369, |
|||
2030498053, |
|||
-473499623, |
|||
1640834941, |
|||
723406961, |
|||
1993558325, |
|||
-257162999, |
|||
-1627724755, |
|||
913952737, |
|||
278845029, |
|||
1327502073, |
|||
-1261253155, |
|||
981676113, |
|||
-1785280363, |
|||
1700077033, |
|||
366908557, |
|||
-1514479167, |
|||
-682799163, |
|||
141955545, |
|||
-830150595, |
|||
317871153, |
|||
1542036469, |
|||
-946413879, |
|||
-1950779155, |
|||
985397153, |
|||
626515237, |
|||
530871481, |
|||
783087261, |
|||
-1512358895, |
|||
1031357269, |
|||
-2007710807, |
|||
-1652747955, |
|||
-1867214463, |
|||
928251525, |
|||
1243003801, |
|||
-2132510467, |
|||
1874683889, |
|||
-717013323, |
|||
218254473, |
|||
-1628774995, |
|||
-2064896159, |
|||
69678053, |
|||
281568889, |
|||
-2104168611, |
|||
-165128239, |
|||
1536495125, |
|||
-39650967, |
|||
546594317, |
|||
-725987007, |
|||
1392966981, |
|||
1044706649, |
|||
687331773, |
|||
-2051306575, |
|||
1544302965, |
|||
-758494647, |
|||
-1243934099, |
|||
-75073759, |
|||
293132965, |
|||
-1935153095, |
|||
118929437, |
|||
807830417, |
|||
-1416222507, |
|||
-1550074071, |
|||
-84903219, |
|||
1355292929, |
|||
-380482555, |
|||
-1818444007, |
|||
-204797315, |
|||
170442609, |
|||
-1636797387, |
|||
868931593, |
|||
-623503571, |
|||
1711722209, |
|||
381210981, |
|||
-161547783, |
|||
-272740131, |
|||
-1450066095, |
|||
2116588437, |
|||
1100682473, |
|||
358442893, |
|||
-1529216831, |
|||
2116152005, |
|||
-776333095, |
|||
1265240893, |
|||
-482278607, |
|||
1067190005, |
|||
333444553, |
|||
86502381, |
|||
753481377, |
|||
39000101, |
|||
1779014585, |
|||
219658653, |
|||
-920253679, |
|||
2029538901, |
|||
1207761577, |
|||
-1515772851, |
|||
-236195711, |
|||
442620293, |
|||
423166617, |
|||
-1763648515, |
|||
-398436623, |
|||
-1749358155, |
|||
-538598519, |
|||
-652439379, |
|||
430550625, |
|||
-1481396507, |
|||
2093206905, |
|||
-1934691747, |
|||
-962631983, |
|||
1454463253, |
|||
-1877118871, |
|||
-291917555, |
|||
-1711673279, |
|||
201201733, |
|||
-474645415, |
|||
-96764739, |
|||
-1587365199, |
|||
1945705589, |
|||
1303896393, |
|||
1744831853, |
|||
381957665, |
|||
2135332261, |
|||
-55996615, |
|||
-1190135011, |
|||
1790562961, |
|||
-1493191723, |
|||
475559465, |
|||
69069 |
|||
}; |
|||
|
|||
double c7rand (Sw *is) |
|||
{ |
|||
Sw it, leh; |
|||
|
|||
it = is [0]; |
|||
leh = is [1]; |
|||
/* Do a 7-place right cyclic shift of it */ |
|||
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
|||
if (!(it & 0x80000000)) it = it ^ MASK; |
|||
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
|||
is [0] = it; is [1] = leh; |
|||
if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; |
|||
return (SCALE * leh); |
|||
} |
|||
|
|||
|
|||
|
|||
Sw irandm (Sw *is) |
|||
{ |
|||
Sw it, leh; |
|||
|
|||
it = is [0]; |
|||
leh = is [1]; |
|||
/* Do a 7-place right cyclic shift of it */ |
|||
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
|||
if (!(it & 0x80000000)) it = it ^ MASK; |
|||
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
|||
is [0] = it; is [1] = leh; |
|||
if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; |
|||
return (leh); |
|||
} |
|||
|
|||
|
|||
unsigned int urandm (Sw *is) |
|||
{ |
|||
Sw it, leh; |
|||
|
|||
it = is [0]; |
|||
leh = is [1]; |
|||
/* Do a 7-place right cyclic shift of it */ |
|||
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
|||
if (!(it & 0x80000000)) it = it ^ MASK; |
|||
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
|||
is [0] = it; is [1] = leh; |
|||
return (leh); |
|||
} |
|||
|
|||
|
|||
/* --------------- (Chi-squared) adchi ----------------------- */ |
|||
/* Simple implementation of Ahrens and Dieter method for a chi-sq |
|||
random variate of order a >> 1. Uses c7rand, maths library */ |
|||
/* 13 July 1998 */ |
|||
/* Slightly faster if 'a' is the same as on previous call */ |
|||
/* This routine is no longer used in the fastnorm code, but is included |
|||
because it may be useful */ |
|||
|
|||
|
|||
static double gorder, gm, rt2gm, aold; |
|||
|
|||
double adchi (double a, int *is) |
|||
{ |
|||
double x, y, z, sq; |
|||
|
|||
if (a != aold) { |
|||
aold = a; gorder = 0.5 * a; |
|||
gm = gorder - 1.0; |
|||
rt2gm = sqrt (aold - 1.0); |
|||
} |
|||
|
|||
polar: |
|||
x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); |
|||
sq = x*x + z*z; |
|||
if ((sq > 1.0) || (sq < 0.25)) goto polar; |
|||
y = x / z; |
|||
x = rt2gm * y + gm; |
|||
if (x < 0.0) goto polar; |
|||
|
|||
z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); |
|||
if (c7rand(is) > z) goto polar; |
|||
|
|||
return (2.0 * x); |
|||
} |
|||
|
|||
/* -------------------- (Gamma) rgamma (g, is) ----------- */ |
|||
|
|||
double rgamma (double g, int *is) |
|||
{ |
|||
double x, y, z, sq; |
|||
|
|||
if (g != gorder) { |
|||
gorder = g; |
|||
gm = gorder - 1.0; aold = 2.0 * gorder; |
|||
rt2gm = sqrt (aold - 1.0); |
|||
} |
|||
|
|||
polar: |
|||
x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); |
|||
sq = x*x + z*z; |
|||
if ((sq > 1.0) || (sq < 0.25)) goto polar; |
|||
y = x / z; |
|||
x = rt2gm * y + gm; |
|||
if (x < 0.0) goto polar; |
|||
|
|||
z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); |
|||
if (c7rand(is) > z) goto polar; |
|||
|
|||
return (x); |
|||
} |
|||
|
|||
|
|||
/* ------------------ Revision history Fastnorm ------------- */ |
|||
/* Items in this revision history appear in chronological order, |
|||
so the most recent revsion appears last. |
|||
Revision items are separated by a line of '+' characters. |
|||
|
|||
++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|||
This is a revised version of the algorithm decribed in |
|||
|
|||
ACM Transactions on Mathematical Software, Vol 22, No 1 |
|||
March 1996, pp 119-127. |
|||
|
|||
A fast generator of pseudo-random variates from the unit Normal |
|||
distribution. It keeps a pool of about 1000 variates, and generates new |
|||
ones by picking 4 from the pool, rotating the 4-vector with these as its |
|||
components, and replacing the old variates with the components of the |
|||
rotated vector. |
|||
|
|||
The program should initialize the generator by calling initnorm(seed) |
|||
with seed a Sw integer seed value. Different seed values will give |
|||
different sequences of Normals. Seed may be any 32-bit integer. |
|||
BUT SEE REVISION of 17 May 2003 for initnorm() parameters. |
|||
The revised initnorm requires two integer parameters, iseed and |
|||
quoll, the latter specifying a tradeoff between speed and |
|||
quality. |
|||
Then, wherever the program needs a new Normal variate, it should |
|||
use the macro FastNorm, e.g. in statements like: |
|||
x = FastNorm; (Sets x to a random Normal value) |
|||
or |
|||
x += a + FastNorm * b; (Adds a normal with mean a, SD b, to x) |
|||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|||
Changed basic formula, which was: |
|||
t = (p+q+r+s)*0.5; p = p-t; q = t-q; r = t-r; s = t-s; |
|||
This gives sum of new p+q+r+s = 2p(old) which may not be a great |
|||
choice. The new version is: |
|||
t = (p+q+r+s)*0.5; p = p-t; q = q-t; r = t-r; s = t-s; |
|||
which gives new p+q+r+s = p+q-r-s (old) which may be better. |
|||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|||
|
|||
Revision 14 November 1998 |
|||
The older version "FastNorm" which was available via ftp was found |
|||
to have a defect which could affect some applications. |
|||
|
|||
Dr Christine Rueb, (Max Planck Institut fur Infomatik, |
|||
Im Stadtwald W 66123 Saabrucken, F.G.R., |
|||
(rueb@mpi-sb.mpg.de) |
|||
|
|||
found that if a large number N of consecutive variates were summed to give |
|||
a variate S with nominally N(0,N) distribution, the variance of S was in some |
|||
cases too small. The effect was noticed with N=400, and was particularly strong |
|||
for N=1023 if the first several (about 128) variates from FastNorm were |
|||
discarded. Dr. Rueb traced the effect to an unexpected tendency of FastNorm |
|||
to concentrate values with an anomolous correlation into the first 128 |
|||
elements of the variate pool. |
|||
With the help of her analysis, the algorithm has been revised in a |
|||
way which appears to overcome the problem, at the cost of about a 19% |
|||
reduction in speed (which still leaves the method very fast.) |
|||
|
|||
IT MUST BE RECOGNISED THAT THIS ALGORITHM IS NOVEL |
|||
AND WHILE IT PASSES A NUMBER OF STANDARD TESTS FOR DISTRIBUTIONAL |
|||
FORM, LACK OF SERIAL CORRELATION ETC., IT MAY STILL HAVE DEFECTS. |
|||
|
|||
RECALL THE NUMBER OF YEARS WHICH IT TOOK FOR THE LIMITATIONS OF |
|||
THE LEHMER GENERATOR FOR UNIFORM VARIATES TO BECOME APPARENT !!! |
|||
|
|||
UNTIL MORE EXPERIENCE IS GAINED WITH THIS TYPE OF GENERATOR, IT |
|||
WOULD BE WISE IN ANY CRITICAL APPLICATION TO COMPARE RESULTS |
|||
OBTAINED USING IT WITH RESULTS OBTAINED USING A "STANDARD" FORM |
|||
OF GENERATOR OF NORMAL VARIATES COUPLED WITH A WELL-DOCUMENTED |
|||
GENERATOR OF UNIFORM VARIATES. |
|||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|||
|
|||
Revision 1 April 2003. |
|||
Trying a scanning process proposed by R.P.Brent. It needs 2 pool |
|||
vectors, as it cannot update in-situ, but may be more robust. |
|||
It is a bit slower on a 133 Mhz PC but just as fast on a newer PC |
|||
(moggie) at about 16 ns per call in the 'speed.c' test. |
|||
The extreme-value defect is the same on old and new versions. |
|||
If one finds a value 'A' such that a batch of B genuine Normal variates has |
|||
probability 0.2 of containing a variate with abolute value greater than A, |
|||
then the probability that both of two consecive batches of B will contain |
|||
such a value should be 0.2 times 0.2, or 0.04. Instead, both versions give |
|||
the extreme value prob. as 0.200 (over a million batches) but give the |
|||
consective-pair prob as 0.050 for batch size B = 1024. |
|||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|||
|
|||
Revision 17 May 2003. |
|||
The fundamental defect of the method, namely an inadequate 'mixing' |
|||
of squared value ('energy') between one generation of the pool and the next, |
|||
cannot readily be removed. In going from one pool to the next, the energy |
|||
in an old variate is shared among just 4 variates in the new pool. Hence it |
|||
takes many generations before the energy of some original variate can be |
|||
distributed across the whole pool. The number of generations needed cannot |
|||
be less than the log to base 4 of the pool size, or 5 for a pool size of |
|||
1024. In fact, the pseudo-random indexing of the pool means that rather |
|||
more generations are needed on average. |
|||
The defect is readily revealed by the following test. One picks a |
|||
"batch size" comparable to the pool size, say 500 or 1000. One then |
|||
computes a value A such that a batch will with probability 0.2 contain one |
|||
or more variates with absolute value exceeding A. |
|||
One then draws batches from FastNorm, |
|||
and tests each batch to see if it contains such an extreme value. |
|||
Over many batches, one counts the frequency of such 'extreme' batches, |
|||
and finds (with FastNorm2) that it is indeed about 0.2. However, when one counts |
|||
the frequency with which succesive batches are both extreme, one finds it to |
|||
be higher than the proper value (0.2)^2 = 0.04. For batch sizes round the pool |
|||
size, it can be as high as 0.05. That is, although the frequncy of extreme |
|||
values is about right, their occurrence in the stream is correlated over a |
|||
scale of the order of the pool size. |
|||
The same correlation effect is seen in the average 4th moment of |
|||
successive batches. |
|||
Since this inter-generational correlation cannot be avoided, the |
|||
this revision seeks to reduce it by performing at least two simple |
|||
rotations of the pool at each generation. Obviously, some speed is lost, |
|||
but the correlations are reduced. |
|||
To allow the user to trade off speed and quality, the initialization |
|||
function initnorm() now provides a QUALITY parameter 'quoll' which controls |
|||
how many double-rotations are done for each generation. |
|||
See the comments in initnorm() for more detail. |
|||
++++++++++ End of revision notes +++++++++ */ |
|||
|
|||
|
|||
|
|||
/* ----------------- Some test results ------------------------ */ |
|||
/* |
|||
General form: |
|||
Some simple tests were conducted by transforming FastNorm variates |
|||
in several ways to yield a variable nominally uniformly distributed in 0 ... 1. |
|||
Uniformity of the derived variate was then tested by a ChiSquared test on a |
|||
100-cell histogram with cell counts around 10000. These tests are crude, but |
|||
showed no untoward results on the present version. |
|||
Transformations included: |
|||
y = 0.5 * (1.0 + erf (n1 / sqrt(2)) |
|||
|
|||
y = 0.5 * (n1 / (n1^2 + n2^2 + n3^2) - 1) |
|||
|
|||
y = exp (-0.5 * (n1^2 + n2^2)) |
|||
|
|||
y = (n1^2 + n2^2) / (n1^2 + n2^2 + n3^2 + n4^2) |
|||
|
|||
where n1, n2 etc are successive Normal variates. |
|||
It may be noted that some of these are sensitive to serial correlation if |
|||
present. |
|||
|
|||
Fourth moment of batches: |
|||
Extensive tests for correlation among the fourth moments of successive |
|||
batches of variates were made, with batch sizes comparabe to or (worst case) |
|||
equal to the size of the variate pool (4096 in this revision). |
|||
With 'quality' 1, significant correlation appears after 10^6 batches |
|||
of worst-case size. |
|||
With quality 2, no significant correlation is evident after 10^7 |
|||
batches. A just-significant correlation appears after 3.6*10^7 batches. |
|||
As this requires some 1.4*10^11 deviates to be drawn, it may be irrelevent |
|||
for many applications. The observed correlation coefficent was 0.0008. |
|||
With quality 3, results are OK after 10^8 batches, or more than |
|||
4*10^11 variates. |
|||
No tests have been done with quality 4 as yet. |
|||
|
|||
Speed: |
|||
Speed tests were done on a PC running RedHat Linux, using "-O" |
|||
compiler optimization. The test loop was |
|||
for (i = 0; i < 500000000; i++) { |
|||
a += FastNorm; a -= FastNorm; |
|||
} |
|||
Thus the test makes 10^9 uses of FastNorm. The time taken, (which |
|||
includes time for a call in 'initnorm' and the loop overhead) depends on |
|||
the 'quality' set by initnorm. |
|||
Quality 1: 21.5 sec |
|||
Quality 2: 32.1 sec |
|||
Quality 3: 42.5 sec |
|||
Quality 4: 53.1 sec |
|||
|
|||
By way of comparison, the same 10^9 call loop was timed with the Unix library |
|||
"random()" routine substituted for FastNorm, and the variable 'a' defined as |
|||
integer rather than double. Also, since most use of a Uniform generator such |
|||
as "random()" requires that the returned integer be scaled into a floating- |
|||
point number in 0 ... 1, the timing was repeated with |
|||
"a += random" ('a' integer) replaced by "a += Scale*random()" where |
|||
'a' is double and Scale = 2^(-31). The times obtained were: |
|||
Random (integer): 44.1 sec |
|||
Random (double) : 47.7 sec |
|||
|
|||
It can be seen that FastNorm (even at quality 3) is faster than a |
|||
commonly-used Uniform generator. To some extent, this result may vary on |
|||
different computers and compilers. Since FastNorm (at least for qualities |
|||
above 1) no doubt does more arithmetic per variate than "random()", much of |
|||
its speed advantage must come from its replacement of a function call per |
|||
variate by a macro which makes only one function call every 4095 variates. |
|||
Computers with lower 'call' overheads than the PC used here might show |
|||
differnt results. |
|||
Incidently, the Uniform generator 'c7rand()' included in this |
|||
package, which returns a double uniform in 0 ... 1, and is of fairly high |
|||
quality, gives a time in the same test of 36.8 sec, a little faster than |
|||
'random()'. |
|||
*/ |
|||
|
|||
|
|||
/* ----------------- globals ------------------------- */ |
|||
/* A pool must have a length which is a multiple of 4. |
|||
* During regeneration of a new pool, the pool is treated as 4 |
|||
* consecutive vectors, each of length VL. |
|||
*/ |
|||
|
|||
#define VE 10 |
|||
#define VL (1 << VE) |
|||
#define VM (VL-1) |
|||
#define WL (4*VL) |
|||
#define WM (WL-1) |
|||
|
|||
Sw gaussfaze; |
|||
Sf *gausssave; |
|||
Sf GScale; |
|||
/* GScale,fastnorm,gaussfaze, -save must be visible to callers*/ |
|||
static Sf chic1, chic2; /* Constants used in getting ChiSq_WL */ |
|||
Sw gslew; /* Counts generations */ |
|||
static Sw qual; /* Sets number of double transforms per generation. */ |
|||
static Sw c7g [2]; /* seed values for c7rand */ |
|||
|
|||
Sf wk1 [WL], wk2 [WL]; /* Pools of variates. */ |
|||
|
|||
|
|||
/* ------------------ regen ---------------------- */ |
|||
/* Takes variates from wk1[], transforms to wk[2], then back to wk1[]. |
|||
*/ |
|||
void regen () |
|||
{ |
|||
Sw i, j, k, m; |
|||
Sf p, q, r, s, t; |
|||
Sw topv[6], ord[4], *top; |
|||
Sf *ppt[4], *ptn; |
|||
|
|||
/* Choose 4 random start points in the wk1[] vector |
|||
I want them all different. */ |
|||
|
|||
top = topv + 1; |
|||
/* Set limiting values in top[-1], top[4] */ |
|||
top[-1] = VL; top[4] = 0; |
|||
reran1: |
|||
m = irandm (c7g); /* positive 32-bit random */ |
|||
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
|||
m = m >> (31 - 2*VE); |
|||
top[0] = m & VM; m = m >> VE; top[1] = m & VM; |
|||
m = irandm (c7g); /* positive 32-bit random */ |
|||
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
|||
m = m >> (31 - 2*VE); |
|||
top[2] = m & VM; m = m >> VE; top[3] = m & VM; |
|||
for (i = 0; i < 4; i++) ord[i] = i; |
|||
/* Sort in decreasing size */ |
|||
for (i = 2; i >= 0; i--) { |
|||
for (j = 0; j <= i; j++) { |
|||
if (top[j] < top[j+1]) { |
|||
k = top[j]; top[j] = top[j+1]; |
|||
top[j+1] = k; |
|||
k = ord[j]; ord[j] = ord[j+1]; |
|||
ord[j+1] = k; |
|||
} |
|||
} |
|||
} |
|||
/* Ensure all different */ |
|||
for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } |
|||
|
|||
/* Set pt pointers to their start values for the first chunk. */ |
|||
for (i = 0; i < 4; i++) { |
|||
j = ord[i]; |
|||
ppt[j] = wk2 + j * VL + top[i]; |
|||
} |
|||
|
|||
/* Set ptn to point into wk1 */ |
|||
ptn = wk1; |
|||
|
|||
/* Now ready to do five chunks. The length of chunk i is |
|||
top[i-1] - top[i] (I hope) |
|||
At the end of chunk i, pointer ord[i] should have reached the end |
|||
of its part, and need to be wrapped down to the start of its part. |
|||
*/ |
|||
i = 0; |
|||
|
|||
chunk: |
|||
j = top[i] - top[i-1]; /* Minus the chunk length */ |
|||
for (; j < 0; j++) { |
|||
p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; |
|||
t = (p + q + r + s) * 0.5; |
|||
*ppt[0]++ = t - p; |
|||
*ppt[1]++ = t - q; |
|||
*ppt[2]++ = r - t; |
|||
*ppt[3]++ = s - t; |
|||
} |
|||
/* This should end the chunk. See if all done */ |
|||
if (i == 4) goto passdone; |
|||
|
|||
/* The pointer for part ord[i] should have passed its end */ |
|||
j = ord[i]; |
|||
#ifdef dddd |
|||
printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-wk2); |
|||
#endif |
|||
ppt[j] -= VL; |
|||
i++; |
|||
goto chunk; |
|||
|
|||
passdone: |
|||
/* wk1[] values have been transformed and placed in wk2[] |
|||
Transform from wk2 to wk1 with a simple shuffle */ |
|||
m = (irandm (c7g) >> (29 - VE)) & WM; |
|||
j = 0; |
|||
for (i = 0; i < 4; i++) ppt[i] = wk1 + i * VL; |
|||
for (i = 0; i < VL; i++) { |
|||
p = wk2[j^m]; j++; |
|||
s = wk2[j^m]; j++; |
|||
q = wk2[j^m]; j++; |
|||
r = wk2[j^m]; j++; |
|||
t = (p + q + r + s) * 0.5; |
|||
*ppt[0]++ = t - p; |
|||
*ppt[1]++ = q - t; |
|||
*ppt[2]++ = t - r; |
|||
*ppt[3]++ = s - t; |
|||
} |
|||
|
|||
/* We have a new lot of variates in wk1 */ |
|||
return; |
|||
} |
|||
|
|||
|
|||
/* ------------------- renormalize --------------------------- */ |
|||
/* Rescales wk1[] so sum of squares = WL */ |
|||
/* Returns the original sum-of-squares */ |
|||
Sf renormalize (void) |
|||
{ |
|||
Sf ts, vv; |
|||
Sw i; |
|||
|
|||
ts = 0.0; |
|||
for (i = 0; i < WL; i++) { |
|||
ts += wk1[i] * wk1[i]; |
|||
} |
|||
vv = sqrt (WL / ts); |
|||
for (i = 0; i < WL; i++) wk1[i] *= vv; |
|||
return (ts); |
|||
} |
|||
|
|||
|
|||
/* ------------------------ BoxMuller ---------------------- */ |
|||
/* Fills block gvec of length ll with proper normals */ |
|||
void boxmuller (Sf *gvec, Sw ll) |
|||
{ |
|||
Sw i; |
|||
Sf tx, ty, tr, tz; |
|||
|
|||
/* Here, replace the whole pool with conventional Normal variates */ |
|||
i = 0; |
|||
nextpair: |
|||
tx = 2.0 * c7rand(c7g) - 1.0; /* Uniform in -1..1 */ |
|||
ty = 2.0 * c7rand(c7g) - 1.0; /* Uniform in -1..1 */ |
|||
tr = tx * tx + ty * ty; |
|||
if ((tr > 1.0) || (tr < 0.25)) goto nextpair; |
|||
tz = -2.0 * log (c7rand(c7g)); /* Sum of squares */ |
|||
tz = sqrt ( tz / tr ); |
|||
gvec [i++] = tx * tz; gvec [i++] = ty * tz; |
|||
if (i < ll) goto nextpair; |
|||
/* Horrid, but good enough */ |
|||
return; |
|||
} |
|||
|
|||
|
|||
/* ------------------------- initnorm ---------------------- */ |
|||
/* To initialize, given a seed integer and a quality level. |
|||
The seed can be any integer. The quality level quoll should be |
|||
between 1 and 4. Quoll = 1 gives high speed, but leaves some |
|||
correlation between the 4th moments of successive batches of values. |
|||
Higher values of quoll give lower speed but less correlation. |
|||
|
|||
If called with quoll = 0, initnorm performs a check that the |
|||
most crucial routine (regen) is performing correctly. In this |
|||
case, the value of 'iseed' is ignored. Initnorm will report the |
|||
results of the test, which compares pool values with check17 and |
|||
check98, which are defined below. |
|||
When a check call is made, a proper call on initnorm must then |
|||
be made before using the FastNorm macro. A check call does not |
|||
properly initialize the routines even if it succeeds. |
|||
*/ |
|||
static Sf check17 = 0.1255789; |
|||
static Sf check98 = -0.7113293; |
|||
|
|||
void initnorm (Sw seed, Sw quoll) |
|||
{ |
|||
Sw i; |
|||
|
|||
/* At one stage, we need to generate a random variable Z such that |
|||
(WL * Z*Z) has a Chi-squared-WL density. Now, a var with |
|||
an approximate Chi-sq-K distn can be got as |
|||
(A + B*n)**2 where n has unit Normal distn, |
|||
A**2 = K * sqrt (1 - 1/K), A**2 + B**2 = K. (For large K) |
|||
So we form Z as (1/sqrt(WL)) * (A + B*n) |
|||
or chic1 + chic2 * n where |
|||
chic1 = A / sqrt(WL), chic2 = B / sqrt(WL). |
|||
Hence |
|||
chic1 = sqrt (A*A / WL) = sqrt ( sqrt (1 - 1/WL)), |
|||
chic2 = sqrt (1 - chic1*chic1) |
|||
*/ |
|||
|
|||
chic1 = sqrt ( sqrt (1.0 - 1.0 / WL)); |
|||
chic2 = sqrt (1.0 - chic1 * chic1); |
|||
|
|||
/* Set regen counter "gslew" which will affect renormalizations. |
|||
Since pools are OK already, we wont't need to renorm for a |
|||
while */ |
|||
gslew = 1; |
|||
/* Finally, set "gaussfaze" to return all of wk1 |
|||
* except the last entry at WL-1 */ |
|||
gaussfaze = WL-1; |
|||
gausssave = wk1; |
|||
|
|||
/* If quoll = 0, do a check on installation */ |
|||
if (quoll == 0) goto docheck; |
|||
qual = quoll; |
|||
/* Check sensible values for quoll, say 1 to 4 */ |
|||
if ((quoll < 0) || (quoll > 4)) { |
|||
printf ("From initnorm(): quoll parameter %d out of\ |
|||
range 1 to 4\n", quoll); |
|||
return; |
|||
} |
|||
c7g[0] = seed; c7g[1] = -3337792; |
|||
|
|||
/* Fill wk1[] with good normals */ |
|||
boxmuller (wk1, WL); |
|||
/* Scale so sum-of-squares = WL */ |
|||
GScale = sqrt (renormalize () / WL); |
|||
/* We have set |
|||
GScale to restore the original ChiSq_WL sum-of-squares */ |
|||
return; |
|||
|
|||
docheck: |
|||
/* Set a simple pattern in wk1[] and test results of regen */ |
|||
for (i = 0; i < WL; i++) wk1[i] = wk2[i] = 0.0; |
|||
wk1[0] = sqrt ((double) WL); |
|||
c7g[0] = 1234567; c7g[1] = 9876543; |
|||
for (i = 0; i < 60; i++) regen(); |
|||
/* Check a couple of values */ |
|||
if ((fabs (wk1[17] - check17) > 0.00001) || |
|||
(fabs (wk1[98] - check98) > 0.00001)) { |
|||
printf ("\nInitnorm check failed.\n"); |
|||
printf ("Expected %8.5f got %10.7f\n", check17, wk1[17]); |
|||
printf ("Expected %8.5f got %10.7f\n", check98, wk1[98]); |
|||
} |
|||
else printf ("\nInitnorm check OK\n"); |
|||
return; |
|||
} |
|||
|
|||
|
|||
/* ---------------------- fastnorm -------------------------- */ |
|||
/* If gslew shows time is ripe, renormalizes the pool |
|||
fastnorm() returns the value GScale*gausssave[0]. |
|||
*/ |
|||
|
|||
Sf fastnorm () |
|||
{ |
|||
Sf sos; |
|||
Sw n1; |
|||
|
|||
if (! (gslew & 0xFFFF)) { |
|||
sos = renormalize (); |
|||
} |
|||
|
|||
/* The last entry of gausssave, at WL-1, will not have been used. |
|||
Use it to get an approx. to sqrt (ChiSq_WL / WL). |
|||
See initnorm() code for details */ |
|||
GScale = chic1 + chic2 * GScale * gausssave [WL-1]; |
|||
for (n1 = 0; n1 < qual; n1++) regen (); |
|||
gslew++; |
|||
|
|||
gaussfaze = WL - 1; |
|||
|
|||
return (GScale * gausssave [0]); |
|||
} |
|||
|
|||
|
|||
/* --------------------- (test) main ------------------------- */ |
|||
#ifdef Main |
|||
#include "FastNorm3.h" |
|||
int main() |
|||
{ |
|||
Sf x; Sw i; |
|||
initnorm (0, 0); |
|||
initnorm (77, 2); |
|||
printf ("SoS %20.6f\n", renormalize()); |
|||
// for (i = 0; i < 2000000; i++) x = FastNorm; |
|||
for (i = 0; i < 200; i++) { |
|||
x = FastNorm; |
|||
printf("%d\t%f\n", i, x); |
|||
} |
|||
printf ("SoS %20.6f\n", renormalize()); |
|||
exit (1); |
|||
} |
|||
#endif |
|||
@ -0,0 +1,10 @@ |
|||
noinst_LTLIBRARIES = libtrannoise.la |
|||
|
|||
libtrannoise_la_SOURCES = \
|
|||
FastNorm3.c \
|
|||
1-f-code.c \
|
|||
wallace.c |
|||
|
|||
AM_CPPFLAGS = -I$(top_srcdir)/src/include -I$(top_srcdir)/src/frontend |
|||
|
|||
MAINTAINERCLEANFILES = Makefile.in |
|||
@ -0,0 +1,532 @@ |
|||
/* Wallace generator for normally distributed random variates |
|||
Copyright: Holger Vogt, 2008 |
|||
|
|||
*/ |
|||
|
|||
//#define FASTNORM_ORIG |
|||
|
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#ifdef _MSC_VER |
|||
#include <process.h> |
|||
#define getpid _getpid |
|||
#else |
|||
#include <unistd.h> |
|||
#endif |
|||
#include <math.h> |
|||
#include "wallace.h" |
|||
#include "FastNorm3.h" |
|||
|
|||
#ifdef HasMain |
|||
#include <sys/timeb.h> |
|||
#else |
|||
#ifndef NOSPICE |
|||
#include "ngspice.h" |
|||
#endif |
|||
#endif |
|||
|
|||
#define POOLSIZE 4096 |
|||
#define LPOOLSIZE 12 |
|||
#define NOTRANS 3 /* number of (dual) transformations */ |
|||
|
|||
#define VE 10 |
|||
#define VL (1 << VE) |
|||
#define VM (VL-1) |
|||
#define WL (4*VL) |
|||
#define WM (WL-1) |
|||
|
|||
double *outgauss; /* output vector for user access */ |
|||
unsigned int variate_used; /* actual index of variate called by user */ |
|||
double ScaleGauss; |
|||
|
|||
static double *pool1; |
|||
static double *pool2; |
|||
static unsigned int *addrif, *addrib; |
|||
static unsigned n = POOLSIZE; |
|||
static double chi1, chi2; /* chi^2 correction values */ |
|||
static unsigned int newpools; |
|||
|
|||
extern double drand(void); |
|||
extern unsigned int CombLCGTausInt(void); |
|||
extern void TausSeed(void); |
|||
extern unsigned int CombLCGTausInt2(void); |
|||
|
|||
|
|||
void PolarGauss(double* py1, double* py2) |
|||
{ |
|||
double x1, x2, w; |
|||
|
|||
do { |
|||
x1 = drand(); |
|||
x2 = drand(); |
|||
w = x1 * x1 + x2 * x2; |
|||
} while (( w > 1.0 ) || ( w < 0.25)); |
|||
|
|||
w = sqrt( (-2.0 * log( w ) ) / w ); |
|||
|
|||
*py1 = (double)(x1 * w); |
|||
*py2 = (double)(x2 * w); |
|||
} |
|||
|
|||
|
|||
|
|||
|
|||
|
|||
void initw(void) |
|||
{ |
|||
unsigned i; |
|||
double totsqr, nomsqr; |
|||
unsigned long int coa, cob, s; |
|||
|
|||
/* initialize the uniform generator */ |
|||
srand(getpid()); |
|||
// srand(17); |
|||
TausSeed(); |
|||
|
|||
ScaleGauss = 1.; |
|||
newpools = 1; |
|||
|
|||
/* set up the two pools */ |
|||
pool1 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); |
|||
pool2 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); |
|||
addrif = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); |
|||
addrib = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); |
|||
|
|||
/* fill the first pool with normally distributed values */ |
|||
PolarGauss(&pool1[0], &pool1[1]); |
|||
for (i = 1; i < n>>1; i++) { |
|||
PolarGauss(&pool1[i<<1], &pool1[(i<<1) + 1]); |
|||
} |
|||
/* normalize pool content */ |
|||
/* totsqr = totsum = 0.0; |
|||
for (i = 0; i < n; i++) { |
|||
totsqr += pool1[i] * pool1[i]; |
|||
totsum += pool1[i]; |
|||
} |
|||
totsum = totsum/n; |
|||
for (i = 0; i < n; i++) { |
|||
totsqr += (pool1[i] - totsum) * (pool1[i] - totsum); |
|||
} |
|||
nomsqr = sqrt(n / totsqr); |
|||
for (i = 0; i < n; i++) |
|||
pool1[i] = (pool1[i] - totsum) * nomsqr; |
|||
*/ |
|||
totsqr = 0.0; |
|||
for (i = 0; i < n; i++) |
|||
totsqr += pool1[i] * pool1[i]; |
|||
nomsqr = sqrt(n / totsqr); |
|||
for (i = 0; i < n; i++) |
|||
pool1[i] *= nomsqr; |
|||
|
|||
/* calculate ch^2 value */ |
|||
chi1 = sqrt ( sqrt (1.0 - 1.0/n)); |
|||
chi2 = sqrt ( 1.0 - chi1*chi1); |
|||
|
|||
/* first scaling, based on unused pool1[n-2] */ |
|||
ScaleGauss = chi1 + chi2 * ScaleGauss * pool1[n-2]; |
|||
/* access to first pool */ |
|||
outgauss = pool1; |
|||
/* set data counter, we return n-2 values here */ |
|||
variate_used = n - 2; |
|||
|
|||
/* generate random reading addresses using a LCG */ |
|||
s = 0; |
|||
coa = 241; |
|||
cob = 59; |
|||
for (i=0; i < (n + NOTRANS); i++) { |
|||
// addrif[i] = s = (s * coa + cob) % ( n ); |
|||
coa = CombLCGTausInt(); |
|||
addrif[i] = coa >> (32 - LPOOLSIZE); |
|||
// printf ("Random add:\t%ld\n" , s); |
|||
} |
|||
s = 0; |
|||
coa = 193; |
|||
cob = 15; |
|||
for (i=0; i < (n + NOTRANS); i++) { |
|||
// addrib[i] = s = (s * coa + cob) % ( n ); |
|||
coa = CombLCGTausInt(); |
|||
addrib[i] = coa >> (32 - LPOOLSIZE); |
|||
// printf ("Random add:\t%ld\n" , addrib[i]); |
|||
} |
|||
|
|||
// printf("norm for orig. Gauss: %e, chi^2 scale: %e\n", nomsqr, ScaleGauss); |
|||
// NewWa(); |
|||
} |
|||
|
|||
/* original FastNorm3.c code */ |
|||
#ifdef FASTNORM_ORIG |
|||
float NewWa () |
|||
{ |
|||
int i, j, k, m; |
|||
float p, q, r, s, t; |
|||
int topv[6], ord[4], *top; |
|||
float *ppt[4], *ptn; |
|||
|
|||
float nulval, endval; |
|||
float totsqr, nomsqr; |
|||
nulval = ScaleGauss * pool1[0]; |
|||
endval = pool1[n-1]; |
|||
|
|||
/* Choose 4 random start points in the wk1[] vector |
|||
I want them all different. */ |
|||
|
|||
top = topv + 1; |
|||
/* Set limiting values in top[-1], top[4] */ |
|||
top[-1] = VL; top[4] = 0; |
|||
reran1: |
|||
m = CombLCGTausInt(); /* positive 32-bit random */ |
|||
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
|||
m = m >> (31 - 2*VE); |
|||
top[0] = m & VM; m = m >> VE; top[1] = m & VM; |
|||
m = CombLCGTausInt(); /* positive 32-bit random */ |
|||
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
|||
m = m >> (31 - 2*VE); |
|||
top[2] = m & VM; m = m >> VE; top[3] = m & VM; |
|||
for (i = 0; i < 4; i++) ord[i] = i; |
|||
/* Sort in decreasing size */ |
|||
for (i = 2; i >= 0; i--) { |
|||
for (j = 0; j <= i; j++) { |
|||
if (top[j] < top[j+1]) { |
|||
k = top[j]; top[j] = top[j+1]; |
|||
top[j+1] = k; |
|||
k = ord[j]; ord[j] = ord[j+1]; |
|||
ord[j+1] = k; |
|||
} |
|||
} |
|||
} |
|||
/* Ensure all different */ |
|||
for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } |
|||
|
|||
/* Set pt pointers to their start values for the first chunk. */ |
|||
for (i = 0; i < 4; i++) { |
|||
j = ord[i]; |
|||
ppt[j] = pool2 + j * VL + top[i]; |
|||
} |
|||
|
|||
/* Set ptn to point into wk1 */ |
|||
ptn = pool1; |
|||
|
|||
/* Now ready to do five chunks. The length of chunk i is |
|||
top[i-1] - top[i] (I hope) |
|||
At the end of chunk i, pointer ord[i] should have reached the end |
|||
of its part, and need to be wrapped down to the start of its part. |
|||
*/ |
|||
i = 0; |
|||
|
|||
chunk: |
|||
j = top[i] - top[i-1]; /* Minus the chunk length */ |
|||
for (; j < 0; j++) { |
|||
p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; |
|||
t = (p + q + r + s) * 0.5; |
|||
*ppt[0]++ = t - p; |
|||
*ppt[1]++ = t - q; |
|||
*ppt[2]++ = r - t; |
|||
*ppt[3]++ = s - t; |
|||
} |
|||
/* This should end the chunk. See if all done */ |
|||
if (i == 4) goto passdone; |
|||
|
|||
/* The pointer for part ord[i] should have passed its end */ |
|||
j = ord[i]; |
|||
#ifdef dddd |
|||
printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-pool2); |
|||
#endif |
|||
ppt[j] -= VL; |
|||
i++; |
|||
goto chunk; |
|||
|
|||
passdone: |
|||
/* wk1[] values have been transformed and placed in wk2[] |
|||
Transform from wk2 to wk1 with a simple shuffle */ |
|||
m = (CombLCGTausInt2() >> (29 - VE)) & WM; |
|||
j = 0; |
|||
for (i = 0; i < 4; i++) ppt[i] = pool1 + i * VL; |
|||
for (i = 0; i < VL; i++) { |
|||
p = pool2[j^m]; j++; |
|||
s = pool2[j^m]; j++; |
|||
q = pool2[j^m]; j++; |
|||
r = pool2[j^m]; j++; |
|||
t = (p + q + r + s) * 0.5; |
|||
*ppt[0]++ = t - p; |
|||
*ppt[1]++ = q - t; |
|||
*ppt[2]++ = t - r; |
|||
*ppt[3]++ = s - t; |
|||
} |
|||
|
|||
/* renormalize again if number of pools beyond limit */ |
|||
if (!(newpools & 0xFFFF)) { |
|||
totsqr = 0.0; |
|||
for (i = 0; i < n; i++) |
|||
totsqr += pool1[i] * pool1[i]; |
|||
nomsqr = sqrt(n / totsqr); |
|||
for (i = 0; i < n; i++) |
|||
pool1[i] *= nomsqr; |
|||
} |
|||
|
|||
outgauss = pool1; |
|||
/* reset data counter */ |
|||
variate_used = n - 1; |
|||
|
|||
/* set counter counting nomber of pools made */ |
|||
newpools++; |
|||
|
|||
/* new scale factor using ch^2 correction, |
|||
using pool1[n-1] from last pool */ |
|||
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
|||
|
|||
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
|||
|
|||
return nulval; /* use old scale */ |
|||
|
|||
} |
|||
|
|||
#else |
|||
|
|||
/* Simplified code according to an algorithm published by C. S. Wallace: |
|||
"Fast Pseudorandom Generators for Normal and Exponential Variates", |
|||
ACM Transactions on Mathmatical Software, Vol. 22, No. 1, March 1996, pp. 119-127. |
|||
Transform pool1 to pool2 and back to pool1 NOTRANS times |
|||
by orthogonal 4 x 4 Hadamard-Matrix. |
|||
Mixing of values is very important: Any value in the pool should contribute to |
|||
every value in the new pools, at least after several passes (number of passes |
|||
is set by NOTRANS to 2 or 3). |
|||
4 values are read in a continuous sequence from the total of POOLSIZE values. |
|||
Values are stored in steps modulo POOLSIZE/4. |
|||
During backward transformation the values are shuffled by a random number jj. |
|||
*/ |
|||
|
|||
double NewWa(void) |
|||
{ |
|||
double nulval, endval; |
|||
double bl1, bl2, bl3, bl4; /* the four values to be transformed */ |
|||
double bsum; |
|||
double totsqr, nomsqr; |
|||
unsigned int i, j, jj, m, mm, mmm; |
|||
|
|||
nulval = ScaleGauss * pool1[0]; |
|||
endval = pool1[n-1]; |
|||
m = n >> 2; |
|||
// printf("New pool after next value\n"); |
|||
|
|||
/* generate new pool by transformation |
|||
Transformation is repeated NOTRANS times */ |
|||
for (i=0; i < NOTRANS; i++) { |
|||
mm = m << 1; |
|||
mmm = mm + m; |
|||
/* forward transformation */ |
|||
// for (j=0; j < n; j += 4) { |
|||
for (j=0; j < m; j++) { |
|||
bl1 = pool1[j]; |
|||
bl2 = pool1[j+m]; |
|||
bl3 = pool1[j+mm]; |
|||
bl4 = pool1[j+mmm]; |
|||
/* Hadamard-Matrix */ |
|||
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; |
|||
jj = j<<2; |
|||
pool2[jj] = bl1 - bsum; |
|||
pool2[jj+1] = bl2 - bsum; |
|||
pool2[jj+2] = bsum - bl3; |
|||
pool2[jj+3] = bsum - bl4; |
|||
} |
|||
/* backward transformation */ |
|||
jj = (CombLCGTausInt2() >> (31 - LPOOLSIZE)) & (n - 1); |
|||
for (j=0; j < m; j++) { |
|||
bl1 = pool2[j^jj]; |
|||
bl2 = pool2[(j+m)^jj]; |
|||
bl3 = pool2[(j+mm)^jj]; |
|||
bl4 = pool2[(j+mmm)^jj]; |
|||
/* Hadamard-Matrix */ |
|||
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; |
|||
jj = j<<2; |
|||
pool1[jj] = bl1 - bsum; |
|||
pool1[jj+1] = bl2 - bsum; |
|||
pool1[jj+2] = bsum - bl3; |
|||
pool1[jj+3] = bsum - bl4; |
|||
} |
|||
} |
|||
|
|||
/* renormalize again if number of pools beyond limit */ |
|||
if (!(newpools & 0xFFFF)) { |
|||
totsqr = 0.0; |
|||
for (i = 0; i < n; i++) |
|||
totsqr += pool1[i] * pool1[i]; |
|||
nomsqr = sqrt(n / totsqr); |
|||
for (i = 0; i < n; i++) |
|||
pool1[i] *= nomsqr; |
|||
} |
|||
|
|||
outgauss = pool1; |
|||
/* reset data counter */ |
|||
variate_used = n - 1; |
|||
|
|||
/* set counter counting nomber of pools made */ |
|||
newpools++; |
|||
|
|||
/* new scale factor using ch^2 correction, |
|||
using pool1[n-1] from previous pool */ |
|||
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
|||
|
|||
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
|||
|
|||
return nulval; /* use old scale */ |
|||
// return pool1[0]; /* use new scale */ |
|||
} |
|||
|
|||
#endif |
|||
|
|||
|
|||
#ifdef FASTNORMTEST |
|||
float NewWa_not(void) |
|||
{ |
|||
float nulval, endval; |
|||
float bl1, bl2, bl3, bl4; /* the four values to be transformed */ |
|||
float bsum; |
|||
float totsqr, nomsqr; |
|||
unsigned int i, j, jj; |
|||
nulval = ScaleGauss * pool1[0]; |
|||
endval = pool1[n-1]; |
|||
|
|||
// printf("New pool after next value\n"); |
|||
|
|||
/* generate new pool by transformation |
|||
Transformation is repeated NOTRANS times */ |
|||
for (i=0; i < NOTRANS; i++) { |
|||
|
|||
/* forward transformation */ |
|||
for (j=0; j < n; j += 4) { |
|||
jj = j + i; |
|||
bl1 = pool1[addrif[jj]]; |
|||
bl2 = pool1[addrif[jj+1]]; |
|||
bl3 = pool1[addrif[jj+2]]; |
|||
bl4 = pool1[addrif[jj+3]]; |
|||
/* s = (s*coa + cob) & (n - 1); |
|||
bl1 = pool1[s]; |
|||
s = (s*coa + cob) & (n - 1); |
|||
bl2 = pool1[s + 1]; |
|||
s = (s*coa + cob) & (n - 1); |
|||
bl3 = pool1[s + 2]; |
|||
s = (s*coa + cob) & (n - 1); |
|||
bl4 = pool1[s + 3]; */ |
|||
/* jj = j + i; |
|||
bl1 = pool1[addrif[jj]]; |
|||
bl2 = pool1[addrif[jj+1]]; |
|||
bl3 = pool1[addrif[jj+2]]; |
|||
bl4 = pool1[addrif[jj+3]]; */ |
|||
/* bl1 = pool1[j]; |
|||
bl2 = pool1[j+1]; |
|||
bl3 = pool1[j+2]; |
|||
bl4 = pool1[j+3]; */ |
|||
/* Hadamard-Matrix */ |
|||
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; |
|||
/* pool2[j] = bl1 - bsum; |
|||
pool2[j+1] = bl2 - bsum; |
|||
pool2[j+2] = bsum - bl3; |
|||
pool2[j+3] = bsum - bl4; */ |
|||
pool2[addrib[jj]] = bl1 - bsum; |
|||
pool2[addrib[jj+1]] = bl2 - bsum; |
|||
pool2[addrib[jj+2]] = bsum - bl3; |
|||
pool2[addrib[jj+3]] = bsum - bl4; |
|||
} |
|||
/* backward transformation */ |
|||
for (j=0; j < n; j += 4) { |
|||
bl1 = pool2[j]; |
|||
bl2 = pool2[j+1]; |
|||
bl3 = pool2[j+2]; |
|||
bl4 = pool2[j+3]; |
|||
/* bl1 = pool2[addrib[j]]; |
|||
bl2 = pool2[addrib[j+1]]; |
|||
bl3 = pool2[addrib[j+2]]; |
|||
bl4 = pool2[addrib[j+3]]; */ |
|||
/* Hadamard-Matrix */ |
|||
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; |
|||
pool1[j] = bl1 - bsum; |
|||
pool1[j+1] = bl2 - bsum; |
|||
pool1[j+2] = bsum - bl3; |
|||
pool1[j+3] = bsum - bl4; |
|||
} |
|||
} |
|||
|
|||
|
|||
/* renormalize again if number of pools beyond limit */ |
|||
if (!(newpools & 0xFFFF)) { |
|||
totsqr = 0.0; |
|||
for (i = 0; i < n; i++) |
|||
totsqr += pool1[i] * pool1[i]; |
|||
nomsqr = sqrt(n / totsqr); |
|||
for (i = 0; i < n; i++) |
|||
pool1[i] *= nomsqr; |
|||
} |
|||
|
|||
outgauss = pool1; |
|||
/* reset data counter */ |
|||
variate_used = n - 1; |
|||
|
|||
/* set counter counting nomber of pools made */ |
|||
newpools++; |
|||
|
|||
/* new scale factor using ch^2 correction, |
|||
using pool1[n-1] from last pool */ |
|||
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
|||
|
|||
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
|||
|
|||
return nulval; /* use old scale */ |
|||
// return pool1[0]; /* use new scale */ |
|||
} |
|||
#endif |
|||
|
|||
/* --------------------- (test) main ------------------------- */ |
|||
/* gcc -Wall -g -DHasMain -I../../include wallace.c CombTaus.o -o watest.exe */ |
|||
#ifdef HasMain |
|||
#include "wallace.h" |
|||
|
|||
struct timeb timenow; |
|||
struct timeb timebegin; |
|||
int sec, msec; |
|||
|
|||
void timediff(struct timeb *now, struct timeb *begin, int *sec, int *msec) |
|||
{ |
|||
|
|||
*msec = now->millitm - begin->millitm; |
|||
*sec = now->time - begin->time; |
|||
if (*msec < 0) { |
|||
*msec += 1000; |
|||
(*sec)--; |
|||
} |
|||
return; |
|||
|
|||
} |
|||
|
|||
|
|||
int main() |
|||
{ |
|||
float x; |
|||
unsigned int i; |
|||
long int count; |
|||
|
|||
initw(); |
|||
ftime(&timebegin); |
|||
count = 100000000; |
|||
for (i = 0; i < count; i++) { |
|||
x = GaussWa; |
|||
// printf("%d\t%f\n", i, x); |
|||
} |
|||
ftime(&timenow); |
|||
timediff(&timenow, &timebegin, &sec, &msec); |
|||
printf("WallaceHV: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); |
|||
|
|||
initnorm(0, 0); |
|||
initnorm(77, 3); |
|||
ftime(&timebegin); |
|||
count = 100000000; |
|||
for (i = 0; i < count; i++) { |
|||
x = FastNorm; |
|||
// printf("%d\t%f\n", i, x); |
|||
} |
|||
ftime(&timenow); |
|||
timediff(&timenow, &timebegin, &sec, &msec); |
|||
printf("FastNorm3: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); |
|||
|
|||
return (1); |
|||
} |
|||
#endif |
|||
@ -0,0 +1,7 @@ |
|||
|
|||
|
|||
|
|||
void f_alpha(int n_pts, int n_exp, float X[], float Q_d, |
|||
float alpha); |
|||
|
|||
void rvfft(float X[], unsigned long int n); |
|||
@ -0,0 +1,32 @@ |
|||
/* Last revised 28-1-1999 */ |
|||
/* This is the header file FastNorm3.h to be included in code files |
|||
using FastNorm3.c */ |
|||
/* I M P O R T A N T ! ! ! ! ! |
|||
|
|||
The definition below should be altered to ensure that integer |
|||
arithmetic is done on 32-bit words. It may need to be changed from int to |
|||
long on some platforms. The 32-bit requirement arises from the use of |
|||
a Uniform pseudo-random generator in part of the code, which assumes 32-bit |
|||
twos-complement arithmetic. In dire need, replace this generator with |
|||
another more suitable for the platform. The rest of the code assumes only |
|||
that signed integers up to a bit less than 2^31 can be handled. |
|||
*/ |
|||
|
|||
#define Sw int /* MUST define Sw as a 32-bit integer or longer */ |
|||
#define Sf double |
|||
|
|||
extern int gaussfaze; |
|||
extern int gaussmask; |
|||
extern double *gausssave; |
|||
extern double GScale; |
|||
|
|||
#define FastNorm ((--gaussfaze)?GScale*gausssave[gaussfaze]:fastnorm()) |
|||
|
|||
void initnorm(Sw seed, Sw quoll); |
|||
Sf fastnorm (void); |
|||
Sf c7rand(Sw*); |
|||
Sw irandm(Sw*); |
|||
unsigned Sw urandm(Sw*); |
|||
double adchi (double a, int *is); |
|||
double rgamma (double g, int *is); |
|||
Sf renormalize(void); |
|||
@ -0,0 +1,108 @@ |
|||
/******************************************************************* |
|||
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
|||
for you (including mallocs and free's). Call the init routine for each fft size you will |
|||
be using. Then you can call the fft routines below which will make the fftlib library |
|||
call with the appropriate tables passed. When you are done with all fft's you can call |
|||
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
|||
with the same size, the extra calls will be ignored. So, you could make a macro to call |
|||
fftInit every time you call ffts. For example you could have someting like: |
|||
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
|||
*******************************************************************/ |
|||
|
|||
int fftInit(long M); |
|||
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
|||
/* INPUTS */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* OUTPUTS */ |
|||
/* private cosine and bit reversed tables */ |
|||
|
|||
void fftFree(); |
|||
// release storage for all private cosine and bit reversed tables |
|||
|
|||
void ffts(float *data, long M, long Rows); |
|||
/* Compute in-place complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void iffts(float *data, long M, long Rows); |
|||
/* Compute in-place inverse complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void rffts(float *data, long M, long Rows); |
|||
/* Compute in-place real fft on the rows of the input array */ |
|||
/* The result is the complex spectra of the positive frequencies */ |
|||
/* except the location for the first complex number contains the real */ |
|||
/* values for DC and Nyquest */ |
|||
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
|||
/* INPUTS */ |
|||
/* *ioptr = real input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array in the following order */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
|
|||
void riffts(float *data, long M, long Rows); |
|||
/* Compute in-place real ifft on the rows of the input array */ |
|||
/* data order as from rffts */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array in the following order */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = real output data array */ |
|||
|
|||
void rspectprod(float *data1, float *data2, float *outdata, long N); |
|||
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
|||
// two real values seperately from the complex ones. This routine does it correctly. |
|||
// the result can be stored in-place over one of the inputs |
|||
/* INPUTS */ |
|||
/* *data1 = input data array first spectra */ |
|||
/* *data2 = input data array second spectra */ |
|||
/* N = fft input size for both data1 and data2 */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array spectra */ |
|||
|
|||
|
|||
/* The following is FYI |
|||
|
|||
|
|||
Note that most of the fft routines require full matrices, ie Rsiz==Ncols |
|||
This is how I like to define a real matrix: |
|||
struct matrix { // real matrix |
|||
float *d; // pointer to data |
|||
long Nrows; // number of rows in the matrix |
|||
long Ncols; // number of columns in the matrix (can be less than Rsiz) |
|||
long Rsiz; // number of floats from one row to the next |
|||
}; |
|||
typedef struct matrix matrix; |
|||
|
|||
|
|||
|
|||
CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make |
|||
arrays that start exactly on a cache line start. |
|||
First we CACHEFILLMALLOC a void * (use this void * when free'ing), |
|||
then we set our array pointer equal to the properly cast CEILCACHELINE of this void * |
|||
example: |
|||
aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); |
|||
a = (float *) CEILCACHELINE(ainit); |
|||
... main body of code ... |
|||
free(aInit); |
|||
|
|||
To disable this alignment, set CACHELINESIZE to 1 |
|||
#define CACHELINESIZE 32 // Bytes per cache line |
|||
#define CACHELINEFILL (CACHELINESIZE-1) |
|||
#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) |
|||
#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) |
|||
*/ |
|||
|
|||
@ -0,0 +1,22 @@ |
|||
/* Wallace generator for normally distributed random variates |
|||
Copyright Holger Vogt, 2008 |
|||
|
|||
Calling sequence: |
|||
initw(void); initialize using srand(seed) |
|||
double x = GaussWa; returns normally distributed random variate |
|||
|
|||
*/ |
|||
|
|||
|
|||
|
|||
extern double *outgauss; /*output vector for user access */ |
|||
extern unsigned int variate_used; /* actual index of variate called by user */ |
|||
extern double ScaleGauss; /* scale factor, including chi square correction */ |
|||
|
|||
double NewWa(void); /* generate new pool, return outgauss[0] */ |
|||
|
|||
#define GaussWa ((--variate_used)?(outgauss[variate_used]*ScaleGauss):NewWa()) |
|||
|
|||
void initw(void); /* initialization of Wallace generator */ |
|||
|
|||
void PolarGauss(double* py1, double* py2); |
|||
@ -1,6 +1,6 @@ |
|||
## Process this file with automake
|
|||
|
|||
SUBDIRS = cmaths ni sparse poly deriv misc |
|||
DIST_SUBDIRS = cmaths ni sparse poly deriv misc |
|||
SUBDIRS = cmaths ni sparse poly deriv misc fft |
|||
DIST_SUBDIRS = cmaths ni sparse poly deriv misc fft |
|||
|
|||
MAINTAINERCLEANFILES = Makefile.in |
|||
@ -0,0 +1,13 @@ |
|||
## Process this file with automake to produce Makefile.in
|
|||
|
|||
noinst_LTLIBRARIES = libmathfft.la |
|||
|
|||
libmathfft_la_SOURCES = \
|
|||
fftext.c \
|
|||
fftlib.c \
|
|||
matlib.c |
|||
|
|||
|
|||
|
|||
AM_CPPFLAGS = -I$(top_srcdir)/src/include |
|||
MAINTAINERCLEANFILES = Makefile.in |
|||
@ -0,0 +1,37 @@ |
|||
Subject: FFT for RISC 2.0 |
|||
To: macgifts@sumex-aim.stanford.edu |
|||
Enclosure: FFTs-for-RISC-2.sit |
|||
|
|||
Enclosed is a stuffit archive of version 2.0 of my 'C' source code fft library. |
|||
|
|||
Very-Fast Fourier Transform routines. Routines are provided for real and complex |
|||
forward and inverse 1d and 2d fourier transforms and 3d complex forward and inverse ffts. |
|||
I coded these to optimize execution speed on Risc processors like the PowerPC. |
|||
All fft sizes must still be a power of two. |
|||
Test programs based on the Numerical Recipes in C routines are provided. |
|||
Also included are some simple applications with source code which time the FFTs. |
|||
See the enclosed read me file for more information. |
|||
|
|||
Revision version 2.0: |
|||
Rewrote code to rely more on compiler optimization (and be less ugly.) |
|||
Removed restrictions on too small or too large ffts. |
|||
Provided a library extension that manages memory for cosine and bit |
|||
reversed counter tables. |
|||
Added 2d and 3d complex and 2d real ffts. |
|||
Speeded routines for data too large to fit in primary cache. |
|||
Changed most testing from Matlab to Numerical Recipes based (because its cheaper.) |
|||
Changed call parameters (watch out.) |
|||
Revision version 1.21: |
|||
line 126 of rfftTest.c corrected. |
|||
Revisions version 1.2: |
|||
I now store the Nyquest point of the real transform where the 0 for the DC term's |
|||
imaginary part used to be. !! WATCH OUT FOR THIS IF YOU USE rfft !! |
|||
Added the real inverse Fourier transform. |
|||
|
|||
Revisions version 1.1: |
|||
Re-arranged to put fft routines in a shared library and changed source file name to fftlib.c. |
|||
Removed some ugly optimizations that are no longer needed for CodeWarrier. |
|||
|
|||
This code is public domain, do anything you want to with it. |
|||
|
|||
[Moderators- This file should replace ffts-for-risc-121-c.hqx and can be included on any CD] |
|||
@ -0,0 +1,70 @@ |
|||
This directory contains a public domain FFT library which was optimized |
|||
for speed on RISC processors such as the PowerPC. All ffts |
|||
use single precision floats, for double precision just use a |
|||
global search and replace to change float to double in all |
|||
source files. |
|||
Codewarrier Pro 1.0 project files are also supplied. |
|||
|
|||
** Warning ** Perform rigorous testing to |
|||
your own standards before using this code. |
|||
|
|||
(John Green) green_jt@vsdec.npt.nuwc.navy.mil |
|||
|
|||
files: |
|||
fftTiming |
|||
Application to time complex ffts |
|||
|
|||
rfftTiming |
|||
Application to time real ffts |
|||
|
|||
// Directory: fft libraries |
|||
|
|||
files: |
|||
|
|||
fftext.c |
|||
Library of in-place fast fourier transforms. Contains forward |
|||
and inverse complex and real transforms. The real fft's expect the |
|||
frequency domain data to have the real part of the fsamp/2 bin (which |
|||
has a 0 imaginary part) to be stored in the location for the imaginary |
|||
part of the DC bin (the DC bin of real data is also strictly real.) |
|||
You must first call an initialization routine fftInit before calling |
|||
the fft computation routines ffts, iffts, rffts and riffts. |
|||
The init routines malloc the memory to store the cosine and |
|||
bit reversed counter tables as well as initializing their values. |
|||
|
|||
fftlib.c |
|||
Lower level library of in-place fast fourier transforms. Same as fftext.c but you |
|||
need to manage the mallocs for the cosine and bit reversed tables yourself. |
|||
|
|||
|
|||
fft2d.c |
|||
Library of 2d and 3d complex and 2d real in-place fast fourier transforms. |
|||
The init routine fft2dInit must be called before using the 2d routines and |
|||
fft3dInit must be called before using the 3d routines. These init routines |
|||
will also call the appropriate 1d init routines in fftext.c |
|||
|
|||
matlib.c |
|||
Matrix transpose routines used by fft2d.c and complex vector multiply |
|||
for forming the product of two spectra. |
|||
|
|||
dxpose.c |
|||
Double precision matrix transpose for quick single precision complex transposing |
|||
|
|||
// Directory: timing code |
|||
This directory contains the source to fftTiming and rfftTiming |
|||
|
|||
// Directory: Numerical Recipes testing |
|||
This directory contains files used to test the various fft routines using |
|||
the Numerical Recipes in C routines as a baseline. These routines can be purchased |
|||
in PeeCee (after expanding you can move them to a Mac) format from: |
|||
http://cfata2.harvard.edu/numerical-recipes/ |
|||
Unfortunately Numerical Recipes defines its forward and inverse fft's backwards. |
|||
For complex fft's I just use their inverse fft as a forward one, but for real ffts |
|||
their forward fft followed by my inverse fft reverses the data. They also have ugly matrix |
|||
and tensor data types and start their indices with one, Fortran style, but these are |
|||
minor annoyances. |
|||
|
|||
// Directory: Matlab testing |
|||
This directory contains files to test fast 1d and 2d convolution with Matlab used to |
|||
verify the results. An example of using Matlab to test the fft library routines is |
|||
also given for the 2d real fft. |
|||
@ -0,0 +1,156 @@ |
|||
/******************************************************************* |
|||
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
|||
for you (including mallocs and free's). Call the init routine for each fft size you will |
|||
be using. Then you can call the fft routines below which will make the fftlib library |
|||
call with the appropriate tables passed. When you are done with all fft's you can call |
|||
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
|||
with the same size, the extra calls will be ignored. So, you could make a macro to call |
|||
fftInit every time you call ffts. For example you could have someting like: |
|||
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
|||
*******************************************************************/ |
|||
#include <stdlib.h> |
|||
#include "fftlib.h" |
|||
#include "matlib.h" |
|||
#include "fftext.h" |
|||
|
|||
// pointers to storage of Utbl's and BRLow's |
|||
static float *UtblArray[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
|||
static short *BRLowArray[8*sizeof(long)/2] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
|||
|
|||
int fftInit(long M){ |
|||
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
|||
/* INPUTS */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* OUTPUTS */ |
|||
/* private cosine and bit reversed tables */ |
|||
|
|||
int theError = 1; |
|||
/*** I did NOT test cases with M>27 ***/ |
|||
if ((M >= 0) && (M < 8*sizeof(long))){ |
|||
theError = 0; |
|||
if (UtblArray[M] == 0){ // have we not inited this size fft yet? |
|||
// init cos table |
|||
UtblArray[M] = (float *) malloc( (POW2(M)/4+1)*sizeof(float) ); |
|||
if (UtblArray[M] == 0) |
|||
theError = 2; |
|||
else{ |
|||
fftCosInit(M, UtblArray[M]); |
|||
} |
|||
if (M > 1){ |
|||
if (BRLowArray[M/2] == 0){ // init bit reversed table for cmplx fft |
|||
BRLowArray[M/2] = (short *) malloc( POW2(M/2-1)*sizeof(short) ); |
|||
if (BRLowArray[M/2] == 0) |
|||
theError = 2; |
|||
else{ |
|||
fftBRInit(M, BRLowArray[M/2]); |
|||
} |
|||
} |
|||
} |
|||
if (M > 2){ |
|||
if (BRLowArray[(M-1)/2] == 0){ // init bit reversed table for real fft |
|||
BRLowArray[(M-1)/2] = (short *) malloc( POW2((M-1)/2-1)*sizeof(short) ); |
|||
if (BRLowArray[(M-1)/2] == 0) |
|||
theError = 2; |
|||
else{ |
|||
fftBRInit(M-1, BRLowArray[(M-1)/2]); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
}; |
|||
return theError; |
|||
} |
|||
|
|||
void fftFree(void){ |
|||
// release storage for all private cosine and bit reversed tables |
|||
long i1; |
|||
for (i1=8*sizeof(long)/2-1; i1>=0; i1--){ |
|||
if (BRLowArray[i1] != 0){ |
|||
free(BRLowArray[i1]); |
|||
BRLowArray[i1] = 0; |
|||
}; |
|||
}; |
|||
for (i1=8*sizeof(long)-1; i1>=0; i1--){ |
|||
if (UtblArray[i1] != 0){ |
|||
free(UtblArray[i1]); |
|||
UtblArray[i1] = 0; |
|||
}; |
|||
}; |
|||
} |
|||
|
|||
/************************************************* |
|||
The following calls are easier than calling to fftlib directly. |
|||
Just make sure fftlib has been called for each M first. |
|||
**************************************************/ |
|||
|
|||
void ffts(float *data, long M, long Rows){ |
|||
/* Compute in-place complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
ffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); |
|||
} |
|||
|
|||
void iffts(float *data, long M, long Rows){ |
|||
/* Compute in-place inverse complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
iffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); |
|||
} |
|||
|
|||
void rffts(float *data, long M, long Rows){ |
|||
/* Compute in-place real fft on the rows of the input array */ |
|||
/* The result is the complex spectra of the positive frequencies */ |
|||
/* except the location for the first complex number contains the real */ |
|||
/* values for DC and Nyquest */ |
|||
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
|||
/* INPUTS */ |
|||
/* *ioptr = real input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array in the following order */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
rffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); |
|||
} |
|||
|
|||
void riffts(float *data, long M, long Rows){ |
|||
/* Compute in-place real ifft on the rows of the input array */ |
|||
/* data order as from rffts */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array in the following order */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = real output data array */ |
|||
riffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); |
|||
} |
|||
|
|||
void rspectprod(float *data1, float *data2, float *outdata, long N){ |
|||
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
|||
// two real values seperately from the complex ones. This routine does it correctly. |
|||
// the result can be stored in-place over one of the inputs |
|||
/* INPUTS */ |
|||
/* *data1 = input data array first spectra */ |
|||
/* *data2 = input data array second spectra */ |
|||
/* N = fft input size for both data1 and data2 */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array spectra */ |
|||
if(N>1){ |
|||
outdata[0] = data1[0] * data2[0]; // multiply the zero freq values |
|||
outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values |
|||
cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values |
|||
} |
|||
else{ |
|||
outdata[0] = data1[0] * data2[0]; |
|||
} |
|||
} |
|||
@ -0,0 +1,106 @@ |
|||
/******************************************************************* |
|||
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
|||
for you (including mallocs and free's). Call the init routine for each fft size you will |
|||
be using. Then you can call the fft routines below which will make the fftlib library |
|||
call with the appropriate tables passed. When you are done with all fft's you can call |
|||
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
|||
with the same size, the extra calls will be ignored. So, you could make a macro to call |
|||
fftInit every time you call ffts. For example you could have someting like: |
|||
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
|||
*******************************************************************/ |
|||
|
|||
int fftInit(long M); |
|||
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
|||
/* INPUTS */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* OUTPUTS */ |
|||
/* private cosine and bit reversed tables */ |
|||
|
|||
void fftFree(void); |
|||
// release storage for all private cosine and bit reversed tables |
|||
|
|||
void ffts(float *data, long M, long Rows); |
|||
/* Compute in-place complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void iffts(float *data, long M, long Rows); |
|||
/* Compute in-place inverse complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void rffts(float *data, long M, long Rows); |
|||
/* Compute in-place real fft on the rows of the input array */ |
|||
/* The result is the complex spectra of the positive frequencies */ |
|||
/* except the location for the first complex number contains the real */ |
|||
/* values for DC and Nyquest */ |
|||
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
|||
/* INPUTS */ |
|||
/* *ioptr = real input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array in the following order */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
|
|||
void riffts(float *data, long M, long Rows); |
|||
/* Compute in-place real ifft on the rows of the input array */ |
|||
/* data order as from rffts */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array in the following order */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = real output data array */ |
|||
|
|||
void rspectprod(float *data1, float *data2, float *outdata, long N); |
|||
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
|||
// two real values seperately from the complex ones. This routine does it correctly. |
|||
// the result can be stored in-place over one of the inputs |
|||
/* INPUTS */ |
|||
/* *data1 = input data array first spectra */ |
|||
/* *data2 = input data array second spectra */ |
|||
/* N = fft input size for both data1 and data2 */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array spectra */ |
|||
|
|||
|
|||
// The following is FYI |
|||
|
|||
|
|||
//Note that most of the fft routines require full matrices, ie Rsiz==Ncols |
|||
//This is how I like to define a real matrix: |
|||
//struct matrix { // real matrix |
|||
// float *d; // pointer to data |
|||
// long Nrows; // number of rows in the matrix |
|||
// long Ncols; // number of columns in the matrix (can be less than Rsiz) |
|||
// long Rsiz; // number of floats from one row to the next |
|||
//}; |
|||
//typedef struct matrix matrix; |
|||
|
|||
|
|||
|
|||
// CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make |
|||
// arrays that start exactly on a cache line start. |
|||
// First we CACHEFILLMALLOC a void * (use this void * when free'ing), |
|||
// then we set our array pointer equal to the properly cast CEILCACHELINE of this void * |
|||
// example: |
|||
// aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); |
|||
// a = (float *) CEILCACHELINE(ainit); |
|||
// ... main body of code ... |
|||
// free(aInit); |
|||
// |
|||
// To disable this alignment, set CACHELINESIZE to 1 |
|||
//#define CACHELINESIZE 32 // Bytes per cache line |
|||
//#define CACHELINEFILL (CACHELINESIZE-1) |
|||
//#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) |
|||
//#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) |
|||
3175
src/maths/fft/fftlib.c
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,76 @@ |
|||
#define MYRECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2) |
|||
|
|||
/* some useful conversions between a number and its power of 2 */ |
|||
#define LOG2(a) (MYRECIPLN2*log(a)) // floating point logarithm base 2 |
|||
#define POW2(m) ((unsigned long) 1 << (m)) // integer power of 2 for m<32 |
|||
|
|||
/******************************************************************* |
|||
lower level fft stuff called by routines in fftext.c and fft2d.c |
|||
*******************************************************************/ |
|||
|
|||
void fftCosInit(long M, float *Utbl); |
|||
/* Compute Utbl, the cosine table for ffts */ |
|||
/* of size (pow(2,M)/4 +1) */ |
|||
/* INPUTS */ |
|||
/* M = log2 of fft size */ |
|||
/* OUTPUTS */ |
|||
/* *Utbl = cosine table */ |
|||
|
|||
void fftBRInit(long M, short *BRLow); |
|||
/* Compute BRLow, the bit reversed table for ffts */ |
|||
/* of size pow(2,M/2 -1) */ |
|||
/* INPUTS */ |
|||
/* M = log2 of fft size */ |
|||
/* OUTPUTS */ |
|||
/* *BRLow = bit reversed counter table */ |
|||
|
|||
void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
|||
/* Compute in-place complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
|||
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
|||
/* *Utbl = cosine table */ |
|||
/* *BRLow = bit reversed counter table */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
|||
/* Compute in-place inverse complex fft on the rows of the input array */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array */ |
|||
/* M = log2 of fft size */ |
|||
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
|||
/* *Utbl = cosine table */ |
|||
/* *BRLow = bit reversed counter table */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array */ |
|||
|
|||
void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
|||
/* Compute in-place real fft on the rows of the input array */ |
|||
/* The result is the complex spectra of the positive frequencies */ |
|||
/* except the location for the first complex number contains the real */ |
|||
/* values for DC and Nyquest */ |
|||
/* INPUTS */ |
|||
/* *ioptr = real input data array */ |
|||
/* M = log2 of fft size */ |
|||
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
|||
/* *Utbl = cosine table */ |
|||
/* *BRLow = bit reversed counter table */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = output data array in the following order */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
|
|||
|
|||
void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
|||
/* Compute in-place real ifft on the rows of the input array */ |
|||
/* data order as from rffts1 */ |
|||
/* INPUTS */ |
|||
/* *ioptr = input data array in the following order */ |
|||
/* M = log2 of fft size */ |
|||
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
|||
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
|||
/* *Utbl = cosine table */ |
|||
/* *BRLow = bit reversed counter table */ |
|||
/* OUTPUTS */ |
|||
/* *ioptr = real output data array */ |
|||
@ -0,0 +1,297 @@ |
|||
/* a few routines from a vector/matrix library */ |
|||
#include "matlib.h" |
|||
|
|||
void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ |
|||
/* not in-place matrix transpose */ |
|||
/* INPUTS */ |
|||
/* *indata = input data array */ |
|||
/* iRsiz = offset to between rows of input data array */ |
|||
/* oRsiz = offset to between rows of output data array */ |
|||
/* Nrows = number of rows in input data array */ |
|||
/* Ncols = number of columns in input data array */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array */ |
|||
|
|||
float *irow; /* pointer to input row start */ |
|||
float *ocol; /* pointer to output col start */ |
|||
float *idata; /* pointer to input data */ |
|||
float *odata; /* pointer to output data */ |
|||
long RowCnt; /* row counter */ |
|||
long ColCnt; /* col counter */ |
|||
float T0; /* data storage */ |
|||
float T1; /* data storage */ |
|||
float T2; /* data storage */ |
|||
float T3; /* data storage */ |
|||
float T4; /* data storage */ |
|||
float T5; /* data storage */ |
|||
float T6; /* data storage */ |
|||
float T7; /* data storage */ |
|||
const long inRsizd1 = iRsiz; |
|||
const long inRsizd2 = 2*iRsiz; |
|||
const long inRsizd3 = inRsizd2+iRsiz; |
|||
const long inRsizd4 = 4*iRsiz; |
|||
const long inRsizd5 = inRsizd3+inRsizd2; |
|||
const long inRsizd6 = inRsizd4+inRsizd2; |
|||
const long inRsizd7 = inRsizd4+inRsizd3; |
|||
const long inRsizd8 = 8*iRsiz; |
|||
|
|||
ocol = outdata; |
|||
irow = indata; |
|||
for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){ |
|||
idata = irow; |
|||
odata = ocol; |
|||
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
|||
T0 = *idata; |
|||
T1 = *(idata+inRsizd1); |
|||
T2 = *(idata+inRsizd2); |
|||
T3 = *(idata+inRsizd3); |
|||
T4 = *(idata+inRsizd4); |
|||
T5 = *(idata+inRsizd5); |
|||
T6 = *(idata+inRsizd6); |
|||
T7 = *(idata+inRsizd7); |
|||
*odata = T0; |
|||
*(odata+1) = T1; |
|||
*(odata+2) = T2; |
|||
*(odata+3) = T3; |
|||
*(odata+4) = T4; |
|||
*(odata+5) = T5; |
|||
*(odata+6) = T6; |
|||
*(odata+7) = T7; |
|||
idata++; |
|||
odata += oRsiz; |
|||
} |
|||
irow += inRsizd8; |
|||
ocol += 8; |
|||
} |
|||
if (Nrows%8 != 0){ |
|||
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
|||
idata = irow++; |
|||
odata = ocol; |
|||
ocol += oRsiz; |
|||
for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){ |
|||
T0 = *idata; |
|||
*odata++ = T0; |
|||
idata += iRsiz; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ |
|||
/* not in-place complex float matrix transpose */ |
|||
/* INPUTS */ |
|||
/* *indata = input data array */ |
|||
/* iRsiz = offset to between rows of input data array */ |
|||
/* oRsiz = offset to between rows of output data array */ |
|||
/* Nrows = number of rows in input data array */ |
|||
/* Ncols = number of columns in input data array */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array */ |
|||
|
|||
float *irow; /* pointer to input row start */ |
|||
float *ocol; /* pointer to output col start */ |
|||
float *idata; /* pointer to input data */ |
|||
float *odata; /* pointer to output data */ |
|||
long RowCnt; /* row counter */ |
|||
long ColCnt; /* col counter */ |
|||
float T0r; /* data storage */ |
|||
float T0i; /* data storage */ |
|||
float T1r; /* data storage */ |
|||
float T1i; /* data storage */ |
|||
float T2r; /* data storage */ |
|||
float T2i; /* data storage */ |
|||
float T3r; /* data storage */ |
|||
float T3i; /* data storage */ |
|||
const long inRsizd1 = 2*iRsiz; |
|||
const long inRsizd1i = 2*iRsiz + 1; |
|||
const long inRsizd2 = 4*iRsiz; |
|||
const long inRsizd2i = 4*iRsiz + 1; |
|||
const long inRsizd3 = inRsizd2+inRsizd1; |
|||
const long inRsizd3i = inRsizd2+inRsizd1 + 1; |
|||
const long inRsizd4 = 8*iRsiz; |
|||
|
|||
ocol = outdata; |
|||
irow = indata; |
|||
for (RowCnt=Nrows/4; RowCnt>0; RowCnt--){ |
|||
idata = irow; |
|||
odata = ocol; |
|||
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
|||
T0r = *idata; |
|||
T0i = *(idata +1); |
|||
T1r = *(idata+inRsizd1); |
|||
T1i = *(idata+inRsizd1i); |
|||
T2r = *(idata+inRsizd2); |
|||
T2i = *(idata+inRsizd2i); |
|||
T3r = *(idata+inRsizd3); |
|||
T3i = *(idata+inRsizd3i); |
|||
*odata = T0r; |
|||
*(odata+1) = T0i; |
|||
*(odata+2) = T1r; |
|||
*(odata+3) = T1i; |
|||
*(odata+4) = T2r; |
|||
*(odata+5) = T2i; |
|||
*(odata+6) = T3r; |
|||
*(odata+7) = T3i; |
|||
idata+=2; |
|||
odata += 2*oRsiz; |
|||
} |
|||
irow += inRsizd4; |
|||
ocol += 8; |
|||
} |
|||
if (Nrows%4 != 0){ |
|||
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
|||
idata = irow; |
|||
odata = ocol; |
|||
for (RowCnt=Nrows%4; RowCnt>0; RowCnt--){ |
|||
T0r = *idata; |
|||
T0i = *(idata+1); |
|||
*odata = T0r; |
|||
*(odata+1) = T0i; |
|||
odata+=2; |
|||
idata += 2*iRsiz; |
|||
} |
|||
irow+=2; |
|||
ocol += 2*oRsiz; |
|||
} |
|||
} |
|||
} |
|||
|
|||
void cvprod(float *a, float *b, float *out, long N){ |
|||
/* complex vector product, can be in-place */ |
|||
/* product of complex vector *a times complex vector *b */ |
|||
/* INPUTS */ |
|||
/* N vector length */ |
|||
/* *a complex vector length N complex numbers */ |
|||
/* *b complex vector length N complex numbers */ |
|||
/* OUTPUTS */ |
|||
/* *out complex vector length N */ |
|||
|
|||
long OutCnt; /* output counter */ |
|||
float A0R; /* A storage */ |
|||
float A0I; /* A storage */ |
|||
float A1R; /* A storage */ |
|||
float A1I; /* A storage */ |
|||
float A2R; /* A storage */ |
|||
float A2I; /* A storage */ |
|||
float A3R; /* A storage */ |
|||
float A3I; /* A storage */ |
|||
float B0R; /* B storage */ |
|||
float B0I; /* B storage */ |
|||
float B1R; /* B storage */ |
|||
float B1I; /* B storage */ |
|||
float B2R; /* B storage */ |
|||
float B2I; /* B storage */ |
|||
float B3R; /* B storage */ |
|||
float B3I; /* B storage */ |
|||
float T0R; /* TMP storage */ |
|||
float T0I; /* TMP storage */ |
|||
float T1R; /* TMP storage */ |
|||
float T1I; /* TMP storage */ |
|||
float T2R; /* TMP storage */ |
|||
float T2I; /* TMP storage */ |
|||
float T3R; /* TMP storage */ |
|||
float T3I; /* TMP storage */ |
|||
|
|||
if (N>=4){ |
|||
A0R = *a; |
|||
B0R = *b; |
|||
A0I = *(a +1); |
|||
B0I = *(b +1); |
|||
A1R = *(a +2); |
|||
B1R = *(b +2); |
|||
A1I = *(a +3); |
|||
B1I = *(b +3); |
|||
A2R = *(a +4); |
|||
B2R = *(b +4); |
|||
A2I = *(a +5); |
|||
B2I = *(b +5); |
|||
A3R = *(a +6); |
|||
B3R = *(b +6); |
|||
A3I = *(a +7); |
|||
B3I = *(b +7); |
|||
T0R = A0R * B0R; |
|||
T0I = (A0R * B0I); |
|||
T1R = A1R * B1R; |
|||
T1I = (A1R * B1I); |
|||
T2R = A2R * B2R; |
|||
T2I = (A2R * B2I); |
|||
T3R = A3R * B3R; |
|||
T3I = (A3R * B3I); |
|||
T0R -= (A0I * B0I); |
|||
T0I = A0I * B0R + T0I; |
|||
T1R -= (A1I * B1I); |
|||
T1I = A1I * B1R + T1I; |
|||
T2R -= (A2I * B2I); |
|||
T2I = A2I * B2R + T2I; |
|||
T3R -= (A3I * B3I); |
|||
T3I = A3I * B3R + T3I; |
|||
for (OutCnt=N/4-1; OutCnt > 0; OutCnt--){ |
|||
a += 8; |
|||
b += 8; |
|||
A0R = *a; |
|||
B0R = *b; |
|||
A0I = *(a +1); |
|||
B0I = *(b +1); |
|||
A1R = *(a +2); |
|||
B1R = *(b +2); |
|||
A1I = *(a +3); |
|||
B1I = *(b +3); |
|||
A2R = *(a +4); |
|||
B2R = *(b +4); |
|||
A2I = *(a +5); |
|||
B2I = *(b +5); |
|||
A3R = *(a +6); |
|||
B3R = *(b +6); |
|||
A3I = *(a +7); |
|||
B3I = *(b +7); |
|||
*out = T0R; |
|||
*(out +1) = T0I; |
|||
*(out +2) = T1R; |
|||
*(out +3) = T1I; |
|||
*(out +4) = T2R; |
|||
*(out +5) = T2I; |
|||
*(out +6) = T3R; |
|||
*(out +7) = T3I; |
|||
T0R = A0R * B0R; |
|||
T0I = (A0R * B0I); |
|||
T1R = A1R * B1R; |
|||
T1I = (A1R * B1I); |
|||
T2R = A2R * B2R; |
|||
T2I = (A2R * B2I); |
|||
T3R = A3R * B3R; |
|||
T3I = (A3R * B3I); |
|||
T0R -= (A0I * B0I); |
|||
T0I = A0I * B0R + T0I; |
|||
T1R -= (A1I * B1I); |
|||
T1I = A1I * B1R + T1I; |
|||
T2R -= (A2I * B2I); |
|||
T2I = A2I * B2R + T2I; |
|||
T3R -= (A3I * B3I); |
|||
T3I = A3I * B3R + T3I; |
|||
out += 8; |
|||
} |
|||
a += 8; |
|||
b += 8; |
|||
*out = T0R; |
|||
*(out +1) = T0I; |
|||
*(out +2) = T1R; |
|||
*(out +3) = T1I; |
|||
*(out +4) = T2R; |
|||
*(out +5) = T2I; |
|||
*(out +6) = T3R; |
|||
*(out +7) = T3I; |
|||
out += 8; |
|||
} |
|||
for (OutCnt=N%4; OutCnt > 0; OutCnt--){ |
|||
A0R = *a++; |
|||
B0R = *b++; |
|||
A0I = *a++; |
|||
B0I = *b++; |
|||
T0R = A0R * B0R; |
|||
T0I = (A0R * B0I); |
|||
T0R -= (A0I * B0I); |
|||
T0I = A0I * B0R + T0I; |
|||
*out++ = T0R; |
|||
*out++ = T0I; |
|||
} |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
/* a few routines from a vector/matrix library */ |
|||
|
|||
void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); |
|||
/* not in-place matrix transpose */ |
|||
/* INPUTS */ |
|||
/* *indata = input data array */ |
|||
/* iRsiz = offset to between rows of input data array */ |
|||
/* oRsiz = offset to between rows of output data array */ |
|||
/* Nrows = number of rows in input data array */ |
|||
/* Ncols = number of columns in input data array */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array */ |
|||
|
|||
void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); |
|||
/* not in-place complex matrix transpose */ |
|||
/* INPUTS */ |
|||
/* *indata = input data array */ |
|||
/* iRsiz = offset to between rows of input data array */ |
|||
/* oRsiz = offset to between rows of output data array */ |
|||
/* Nrows = number of rows in input data array */ |
|||
/* Ncols = number of columns in input data array */ |
|||
/* OUTPUTS */ |
|||
/* *outdata = output data array */ |
|||
|
|||
void cvprod(float *a, float *b, float *out, long N); |
|||
/* complex vector product, can be in-place */ |
|||
/* product of complex vector *a times complex vector *b */ |
|||
/* INPUTS */ |
|||
/* N vector length */ |
|||
/* *a complex vector length N complex numbers */ |
|||
/* *b complex vector length N complex numbers */ |
|||
/* OUTPUTS */ |
|||
/* *out complex vector length N */ |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue