40 changed files with 15572 additions and 9055 deletions
-
2configure.ac
-
59examples/transient-noise/noi-ring51-demo.cir
-
53examples/transient-noise/noi-sc-tr.cir
-
56examples/transient-noise/noilib-demo.h
-
27examples/transient-noise/shot_ng.cir
-
5src/Makefile.am
-
4src/frontend/Makefile.am
-
298src/frontend/com_fft.c
-
1src/frontend/com_fft.h
-
12src/frontend/commands.c
-
62src/frontend/trannoise/1-f-code.c
-
61src/frontend/trannoise/1-f-code_d.c
-
846src/frontend/trannoise/FastNorm3.c
-
10src/frontend/trannoise/Makefile.am
-
532src/frontend/trannoise/wallace.c
-
7src/include/1-f-code.h
-
32src/include/FastNorm3.h
-
4src/include/bool.h
-
108src/include/fftext.h
-
13src/include/ngspice.h
-
22src/include/wallace.h
-
25src/main.c
-
4src/maths/Makefile.am
-
13src/maths/fft/Makefile.am
-
37src/maths/fft/NOTE
-
70src/maths/fft/Read Me
-
156src/maths/fft/fftext.c
-
106src/maths/fft/fftext.h
-
3175src/maths/fft/fftlib.c
-
76src/maths/fft/fftlib.h
-
297src/maths/fft/matlib.c
-
33src/maths/fft/matlib.h
-
24src/maths/misc/randnumb.c
-
3src/spicelib/devices/vsrc/vsrc.c
-
51src/spicelib/devices/vsrc/vsrcacct.c
-
1src/spicelib/devices/vsrc/vsrcask.c
-
13src/spicelib/devices/vsrc/vsrcdefs.h
-
440src/spicelib/devices/vsrc/vsrcload.c
-
7src/spicelib/devices/vsrc/vsrcpar.c
-
17882visualc/vngspice.vcproj
@ -0,0 +1,59 @@ |
|||||
|
* 51 stage Ring-Osc. BSIM3, transient noise |
||||
|
* will need 45 min on a i7 860 with 4 threads |
||||
|
|
||||
|
* closes the loop between inverters xinv1 and xinv5 |
||||
|
vin in out dc 0.5 pulse 0.5 0 0.1n 5n 1 1 1 |
||||
|
|
||||
|
vdd dd 0 dc 0 pulse 0 2.2 0 1n 1 1 1 |
||||
|
|
||||
|
vss ss 0 dc 0 |
||||
|
ve sub 0 dc 0 |
||||
|
|
||||
|
vpe well 0 2.2 |
||||
|
|
||||
|
* noisy inverters |
||||
|
xiinv2 dd ss sub well out25 out50 inv253 |
||||
|
xiinv1 dd ss sub well in out25 inv253 |
||||
|
|
||||
|
*very noisy inverter |
||||
|
xiinv5 dd ss sub well out50 out inv1_2 |
||||
|
*output amplifier |
||||
|
xiinv11 dd ss sub well out25 bufout inv1 |
||||
|
cout bufout ss 0.2pF |
||||
|
|
||||
|
.option itl1=500 gmin=1e-15 itl4=10 noacct |
||||
|
|
||||
|
* .dc vdd 0 2 0.01 |
||||
|
.tran 0.01n 500n |
||||
|
|
||||
|
.save in bufout v(t1) |
||||
|
|
||||
|
.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.nmos |
||||
|
.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.pmos |
||||
|
|
||||
|
.include noilib-demo.h |
||||
|
|
||||
|
.control |
||||
|
unset ngdebug |
||||
|
* first run |
||||
|
save bufout $ needed for restricting memory usage |
||||
|
rusage |
||||
|
tran 8p 10000n |
||||
|
rusage |
||||
|
plot bufout xlimit 90n 95n |
||||
|
linearize |
||||
|
fft bufout |
||||
|
* next run |
||||
|
reset |
||||
|
save bufout |
||||
|
alter @v.xiinv5.vn1[trnoise] = [ 0 0 0 0 ] $ no noise |
||||
|
tran 8p 10000n |
||||
|
rusage |
||||
|
plot bufout xlimit 90n 95n |
||||
|
linearize |
||||
|
fft bufout |
||||
|
plot mag(bufout) mag(sp2.bufout) xlimit 0 2G ylimit 1e-11 0.1 ylog |
||||
|
.endc |
||||
|
|
||||
|
|
||||
|
.end |
||||
@ -0,0 +1,53 @@ |
|||||
|
* simple sample & hold, transient noise |
||||
|
|
||||
|
* switch control |
||||
|
* PULSE(V1 V2 TD TR TF PW PER) |
||||
|
vgate1 ga1 0 dc 0 pulse (0 1 0 10n 10n 90n 200n) |
||||
|
|
||||
|
Switch1 1 2 ga1 0 smodel1 |
||||
|
|
||||
|
* noisy input |
||||
|
* rms value white, time step, exponent < 2, rms value 1/f |
||||
|
vin 1 0 dc 0 trnoise 0.1m 0.2n 1 0.1m |
||||
|
*vin 1 0 dc 0 trnoise 0.1m 0.2n 0 0.1m |
||||
|
|
||||
|
* output |
||||
|
c2 2 0 10p |
||||
|
|
||||
|
* second S&H |
||||
|
vgate2 ga2 0 dc 0 pulse (0 1 140n 10n 10n 30n 200n) |
||||
|
*Buffer EXXXXXXX N+ N- NC+ NC- VALUE |
||||
|
e1 4 0 2 0 1 |
||||
|
Switch2 4 3 ga2 0 smodel2 |
||||
|
c3 3 0 10p |
||||
|
|
||||
|
.option itl1=500 gmin=1e-15 itl4=10 acct |
||||
|
|
||||
|
.model smodel1 sw vt=0.5 ron=100 |
||||
|
.model smodel2 sw vt=0.5 ron=100 |
||||
|
|
||||
|
.tran 0.4n 100u |
||||
|
|
||||
|
|
||||
|
.control |
||||
|
unset ngdebug |
||||
|
set filetype=ascii |
||||
|
rusage |
||||
|
run |
||||
|
rusage all |
||||
|
write noi_test.out v(1) |
||||
|
plot v(2) v(3) xlimit 4u 5u |
||||
|
plot v(ga1) v(ga2) xlimit 4u 5u |
||||
|
linearize |
||||
|
*rms v(1) |
||||
|
fft v(3) |
||||
|
plot mag(v(3)) loglog xlimit 1e4 1e8 ylimit 1e-10 1e-4 |
||||
|
setplot tran1 |
||||
|
linearize |
||||
|
psd 101 v(3) |
||||
|
plot mag(v(3)) xlimit 0 3e7 ylimit 0 10u |
||||
|
|
||||
|
.endc |
||||
|
|
||||
|
|
||||
|
.end |
||||
@ -0,0 +1,56 @@ |
|||||
|
|
||||
|
* standard inverter made noisy |
||||
|
*.subckt inv1 dd ss sub well in out |
||||
|
*vn1 out outi dc 0 noise 0.1 0.3n 1.0 0.1 |
||||
|
*mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
||||
|
*mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
||||
|
*.ends inv1 |
||||
|
|
||||
|
* standard inverter |
||||
|
.subckt inv1 dd ss sub well in out |
||||
|
mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
||||
|
mp1 out in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
||||
|
.ends inv1 |
||||
|
|
||||
|
* very noisy inverter (noise on vdd and well) |
||||
|
.subckt inv1_1 dd ss sub well in out |
||||
|
vn1 dd idd dc 0 trnoise 0.05 0.05n 1 0.05 |
||||
|
vn2 well iwell dc 0 trnoise 0.05 0.05n 1 0.05 |
||||
|
mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
||||
|
mp1 out in idd iwell p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
||||
|
*Cout out 0 0.1p |
||||
|
.ends inv1_1 |
||||
|
|
||||
|
|
||||
|
* another very noisy inverter |
||||
|
.subckt inv1_2 dd ss sub well in out |
||||
|
vn1 out outi dc 0 trnoise 0.05 8p 1.0 0.001 |
||||
|
mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
||||
|
mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
||||
|
*Cout out 0 0.1p |
||||
|
.ends inv1_2 |
||||
|
|
||||
|
* another very noisy inverter with current souces parallel to transistor |
||||
|
.subckt inv13 dd ss sub well in outi |
||||
|
in1 ss outi dc 0 noise 200u 0.05n 1.0 50u |
||||
|
mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u |
||||
|
in2 dd outi dc 0 noise 200u 0.05n 1.0 50u |
||||
|
mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u |
||||
|
*Cout out 0 0.1p |
||||
|
.ends inv13 |
||||
|
|
||||
|
.subckt inv53 dd ss sub well in out |
||||
|
xinv1 dd ss sub well in 1 inv1 |
||||
|
xinv2 dd ss sub well 1 2 inv1 |
||||
|
xinv3 dd ss sub well 2 3 inv1 |
||||
|
xinv4 dd ss sub well 3 4 inv1 |
||||
|
xinv5 dd ss sub well 4 out inv1 |
||||
|
.ends inv53 |
||||
|
|
||||
|
.subckt inv253 dd ss sub well in out |
||||
|
xinv1 dd ss sub well in 1 inv53 |
||||
|
xinv2 dd ss sub well 1 2 inv53 |
||||
|
xinv3 dd ss sub well 2 3 inv53 |
||||
|
xinv4 dd ss sub well 3 4 inv53 |
||||
|
xinv5 dd ss sub well 4 out inv53 |
||||
|
.ends inv253 |
||||
@ -0,0 +1,27 @@ |
|||||
|
* Shot noise test with B source, diode |
||||
|
* voltage on device (diode, forward) |
||||
|
Vdev out 0 DC 0 PULSE(0.4 0.45 10u) |
||||
|
* diode, forward direction, to be modeled with noise |
||||
|
D1 mess 0 DMOD |
||||
|
.model DMOD D IS=1e-14 N=1 |
||||
|
X1 0 mess out ishot |
||||
|
* device between 1 and 2 |
||||
|
* new output terminals of device including noise: 1 and 3 |
||||
|
.subckt ishot 1 2 3 |
||||
|
* white noise source with rms 1V |
||||
|
VNG 0 11 DC 0 TRNOISE(1 1n 0 0) |
||||
|
*measure the current i(v1) |
||||
|
V1 2 3 DC 0 |
||||
|
* calculate the shot noise |
||||
|
* sqrt(2*current*q*bandwidth) |
||||
|
BI 1 3 I=sqrt(2*abs(i(v1))*1.6e-19*1e7)*v(11) |
||||
|
.ends ishot |
||||
|
* 20000 sample points |
||||
|
.tran 1n 20u |
||||
|
.control |
||||
|
run |
||||
|
plot (-1)*i(vdev) |
||||
|
meas tran vdev_rms avg i(vdev) from=0u to=9.9u |
||||
|
meas tran vdev_rms avg i(vdev) from=10.1u to=20u |
||||
|
.endc |
||||
|
.end |
||||
@ -0,0 +1,62 @@ |
|||||
|
/* Copyright: Holger Vogt, 2008 |
||||
|
Generates 1/f noise values according to: |
||||
|
"Discrete simulation of colored noise and stochastic |
||||
|
processes and 1/fa power law noise generation" |
||||
|
Kasdin, N.J.; |
||||
|
Proceedings of the IEEE |
||||
|
Volume 83, Issue 5, May 1995 Page(s):802 - 827 |
||||
|
*/ |
||||
|
|
||||
|
#include <math.h> |
||||
|
#include <stdio.h> |
||||
|
#include <stddef.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <stdarg.h> // var. argumente |
||||
|
#include "1-f-code.h" |
||||
|
#include "ngspice.h" |
||||
|
|
||||
|
#include "fftext.h" |
||||
|
#include "wallace.h" |
||||
|
|
||||
|
|
||||
|
void f_alpha(int n_pts, int n_exp, float X[], float Q_d, |
||||
|
float alpha) |
||||
|
{ |
||||
|
int i; |
||||
|
float *hfa, *wfa; |
||||
|
float ha; |
||||
|
|
||||
|
ha = alpha/2.0f ; |
||||
|
// Q_d = sqrt(Q_d); /* find the deviation of the noise */ |
||||
|
hfa = TMALLOC(float,n_pts); |
||||
|
wfa = TMALLOC(float,n_pts); |
||||
|
hfa[0] = 1.0f; |
||||
|
wfa[0] = Q_d * (float)GaussWa; |
||||
|
/* generate the coefficients hk */ |
||||
|
for (i=1 ; i < n_pts; i++) { |
||||
|
/* generate the coefficients hk */ |
||||
|
hfa[i] = hfa[i-1] * (ha + (float)(i-1)) / ( (float)(i) ); |
||||
|
/* fill the sequence wk with white noise */ |
||||
|
wfa[i] = Q_d * (float)GaussWa; |
||||
|
} |
||||
|
|
||||
|
// for (i=0 ; i < n_pts; i++) |
||||
|
// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); |
||||
|
|
||||
|
/* perform the discrete Fourier transform */ |
||||
|
fftInit(n_exp); |
||||
|
rffts(hfa, n_exp, 1); |
||||
|
rffts(wfa, n_exp, 1) ; |
||||
|
|
||||
|
/* multiply the two complex vectors */ |
||||
|
rspectprod(hfa, wfa, X, n_pts); |
||||
|
/* inverse transform */ |
||||
|
riffts(X, n_exp, 1); |
||||
|
|
||||
|
free(hfa) ; |
||||
|
free(wfa); |
||||
|
/* fft tables will be freed in vsrcaccept.c and isrcaccept.c |
||||
|
fftFree(); */ |
||||
|
fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); |
||||
|
} |
||||
|
|
||||
@ -0,0 +1,61 @@ |
|||||
|
/* Copyright: Holger Vogt, 2008 |
||||
|
Discrete simulation of colored noise and stochastic |
||||
|
processes and 1/fa power law noise generation |
||||
|
Kasdin, N.J.; |
||||
|
Proceedings of the IEEE |
||||
|
Volume 83, Issue 5, May 1995 Page(s):802 - 827 |
||||
|
*/ |
||||
|
|
||||
|
#include <math.h> |
||||
|
#include <stdio.h> |
||||
|
#include <stddef.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <stdarg.h> // var. argumente |
||||
|
#include "1-f-code.h" |
||||
|
#include "ngspice.h" |
||||
|
|
||||
|
#include "fftext.h" |
||||
|
#include "wallace.h" |
||||
|
|
||||
|
|
||||
|
void f_alpha(int n_pts, int n_exp, double X[], double Q_d, |
||||
|
double alpha) |
||||
|
{ |
||||
|
unsigned int i; |
||||
|
double *hfa, *wfa; |
||||
|
double ha; |
||||
|
|
||||
|
ha = alpha/2.0f ; |
||||
|
// Q_d = sqrt(Q_d); /* find the deviation of the noise */ |
||||
|
hfa = TMALLOC(double,n_pts); |
||||
|
wfa = TMALLOC(double,n_pts); |
||||
|
hfa[0] = 1.0f; |
||||
|
wfa[0] = Q_d * GaussWa; |
||||
|
/* generate the coefficients hk */ |
||||
|
for (i=1 ; i < n_pts; i++) { |
||||
|
/* generate the coefficients hk */ |
||||
|
hfa[i] = hfa[i-1] * (ha + (double)(i-1)) / ( (double)(i) ); |
||||
|
/* fill the sequence wk with white noise */ |
||||
|
wfa[i] = Q_d * GaussWa; |
||||
|
} |
||||
|
|
||||
|
// for (i=0 ; i < n_pts; i++) |
||||
|
// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); |
||||
|
|
||||
|
/* perform the discrete Fourier transform */ |
||||
|
fftInit(n_exp); |
||||
|
rffts(hfa, n_exp, 1); |
||||
|
rffts(wfa, n_exp, 1) ; |
||||
|
|
||||
|
/* multiply the two complex vectors */ |
||||
|
rspectprod(hfa, wfa, X, n_pts); |
||||
|
/* inverse transform */ |
||||
|
riffts(X, n_exp, 1); |
||||
|
|
||||
|
free(hfa) ; |
||||
|
free(wfa); |
||||
|
/* fft tables will be freed in vsrcaccept.c and isrcaccept.c |
||||
|
fftFree(); */ |
||||
|
fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); |
||||
|
} |
||||
|
|
||||
@ -0,0 +1,846 @@ |
|||||
|
/* This is file FastNorm3.c */ |
||||
|
/* SUPERCEDES FastNorm.c, FastNorm2.c. Use with FastNorm3.h */ |
||||
|
/* 24 June 2003 */ |
||||
|
|
||||
|
/* A package containing a very fast generator of pseudo-random |
||||
|
Unit NORMAL variates, and some fairly high-quality UNIFORM |
||||
|
generators. It also contains a straightforward implementation of |
||||
|
a ChiSquared and Gamma generator copied from Ahrens and Dieter. |
||||
|
*/ |
||||
|
|
||||
|
/* Version 3 with double transformations and controllable extension |
||||
|
to repeat the double transformations for higher quality at lower |
||||
|
speed. |
||||
|
Dated 17 May 20003. |
||||
|
Copyright Christopher Stewart Wallace. |
||||
|
*/ |
||||
|
/* |
||||
|
%A C. S. Wallace |
||||
|
%T Fast Pseudo-Random Generators for Normal and Exponential Variates. |
||||
|
%J ACM Trans. Math. Software |
||||
|
%V 22 |
||||
|
%N 1 |
||||
|
%P 119-127 |
||||
|
%M MAR |
||||
|
%D 1996 |
||||
|
%O TR 94/197, May 1994, Dept. Computer Science, Monash University |
||||
|
%K CSW, CSWallace, Monash, pseudo random number generator, algorithm, |
||||
|
jrnl, TOMS, numbers, normal, probability, distribution, PRNG, RNG, Gaussian, |
||||
|
distribution, jrnl, ACM, TOMS, TR 94 197, TR197, c1996, c199x, c19xx |
||||
|
*/ |
||||
|
/* Use of this package requires the file "FastNorm3.h" which must be |
||||
|
#include-ed in any C files using this package. |
||||
|
|
||||
|
The main purpose of this package is to provide a very fast source |
||||
|
of pseudo-random variates from the Unit Normal N(0,1) distribution, having |
||||
|
the density function |
||||
|
|
||||
|
f(x) = (1/sqrt(2*PI)) * exp (-0.5 * x^2) |
||||
|
|
||||
|
Variates are obtained not by calling a function, but by use of a macro |
||||
|
"FastNorm" defined in FastNorm3.h. In a C program, this macro may appear |
||||
|
anywhere a (double) expression could appear, e.g in statements like |
||||
|
z += FastNorm; |
||||
|
if (FastNorm < 1.1) ..... |
||||
|
q = fabs (FastNorm); etc. |
||||
|
|
||||
|
The revision history, and a reference to the method description, is given |
||||
|
later in this file under the heading "Revision history Fastnorm". |
||||
|
|
||||
|
Major sections of this file, such as the revision history and the |
||||
|
major subroutines, are all headed by a line containing a row of minus signs (-) |
||||
|
and the name of the section or subroutine. |
||||
|
|
||||
|
The generators included are: |
||||
|
a Uniform source of integers, unsigned integers and doubles. |
||||
|
Chi-sq(N) (based on Ahrens and Dieter) |
||||
|
Gamma(N) (= 0.5 * Chi-sq(2N)) |
||||
|
Normal (a very fast routine) |
||||
|
*/ |
||||
|
|
||||
|
/* ----------------- inclusions and some definitions ------------ */ |
||||
|
#include <math.h> |
||||
|
#ifndef NOSPICE |
||||
|
#include "ngspice.h" |
||||
|
#endif |
||||
|
#include "FastNorm3.h" |
||||
|
|
||||
|
|
||||
|
/* --------------- (Uniform) c7rand, irandm, urandm ---------- */ |
||||
|
/* |
||||
|
c A random number generator called as a function by |
||||
|
c c7rand (iseed) or irandm (iseed) or urandm (iseed) |
||||
|
|
||||
|
c The parameter should be a pointer to a 2-element Sw vector. |
||||
|
c The first call gives a double uniform in 0 .. 1. |
||||
|
c The second gives an Sw integer uniform in 0 .. 2**31-1 |
||||
|
c The third gives an Sw integer with 32 bits, so unif in |
||||
|
c -2**31 .. 2**31-1 if used in 32-bit signed arithmetic. |
||||
|
c All update iseed[] in exactly the same way. |
||||
|
c iseed[] must be a 2-element Sw vector. |
||||
|
c The initial value of iseed[1] may be any 32-bit integer. |
||||
|
c The initial value of iseed[0] may be any 32-bit integer except -1. |
||||
|
c |
||||
|
c The period of the random sequence is 2**32 * (2**32-1) |
||||
|
c Its quality is quite good. It is based on the mixed multiplicative |
||||
|
c congruential (Lehmer) generator |
||||
|
x[n+1] = (69069 * x[n] + odd constant) MOD 2^32 |
||||
|
c but avoids most of the well-known defects of this type of generator |
||||
|
c by, in effect, generating x[n+k] from x[n] as defined by the |
||||
|
c sequence above, where k is chosen randomly in 1 ... 128 with the |
||||
|
c help of a subsidiary Tauseworth-type generator. |
||||
|
c For the positve integer generator irandm, the less |
||||
|
c significant digits are more random than is usual for a Lehmer |
||||
|
c generator. The last n<31 digits do not repeat with a period of 2^n. |
||||
|
c This is also true of the unsigned integer generator urandm, but less |
||||
|
c so. |
||||
|
|
||||
|
c This is an implementation in C of the algorithm described in |
||||
|
c Technical Report "A Long-Period Pseudo-Random Generator" |
||||
|
c TR89/123, Computer Science, Monash University, |
||||
|
c Clayton, Vic 3168 AUSTRALIA |
||||
|
c by |
||||
|
c |
||||
|
c C.S.Wallace csw@cs.monash.edu.au |
||||
|
|
||||
|
c The table mt[0:127] is defined by mt[i] = 69069 ** (128-i) |
||||
|
*/ |
||||
|
|
||||
|
#define MASK ((Sw) 0x12DD4922) |
||||
|
/* or in decimal, 316492066 */ |
||||
|
#define SCALE ((double) 1.0 / (1024.0 * 1024.0 * 1024.0 * 2.0)) |
||||
|
/* i.e. 2 to power -31 */ |
||||
|
|
||||
|
static Sw mt [128] = { |
||||
|
902906369, |
||||
|
2030498053, |
||||
|
-473499623, |
||||
|
1640834941, |
||||
|
723406961, |
||||
|
1993558325, |
||||
|
-257162999, |
||||
|
-1627724755, |
||||
|
913952737, |
||||
|
278845029, |
||||
|
1327502073, |
||||
|
-1261253155, |
||||
|
981676113, |
||||
|
-1785280363, |
||||
|
1700077033, |
||||
|
366908557, |
||||
|
-1514479167, |
||||
|
-682799163, |
||||
|
141955545, |
||||
|
-830150595, |
||||
|
317871153, |
||||
|
1542036469, |
||||
|
-946413879, |
||||
|
-1950779155, |
||||
|
985397153, |
||||
|
626515237, |
||||
|
530871481, |
||||
|
783087261, |
||||
|
-1512358895, |
||||
|
1031357269, |
||||
|
-2007710807, |
||||
|
-1652747955, |
||||
|
-1867214463, |
||||
|
928251525, |
||||
|
1243003801, |
||||
|
-2132510467, |
||||
|
1874683889, |
||||
|
-717013323, |
||||
|
218254473, |
||||
|
-1628774995, |
||||
|
-2064896159, |
||||
|
69678053, |
||||
|
281568889, |
||||
|
-2104168611, |
||||
|
-165128239, |
||||
|
1536495125, |
||||
|
-39650967, |
||||
|
546594317, |
||||
|
-725987007, |
||||
|
1392966981, |
||||
|
1044706649, |
||||
|
687331773, |
||||
|
-2051306575, |
||||
|
1544302965, |
||||
|
-758494647, |
||||
|
-1243934099, |
||||
|
-75073759, |
||||
|
293132965, |
||||
|
-1935153095, |
||||
|
118929437, |
||||
|
807830417, |
||||
|
-1416222507, |
||||
|
-1550074071, |
||||
|
-84903219, |
||||
|
1355292929, |
||||
|
-380482555, |
||||
|
-1818444007, |
||||
|
-204797315, |
||||
|
170442609, |
||||
|
-1636797387, |
||||
|
868931593, |
||||
|
-623503571, |
||||
|
1711722209, |
||||
|
381210981, |
||||
|
-161547783, |
||||
|
-272740131, |
||||
|
-1450066095, |
||||
|
2116588437, |
||||
|
1100682473, |
||||
|
358442893, |
||||
|
-1529216831, |
||||
|
2116152005, |
||||
|
-776333095, |
||||
|
1265240893, |
||||
|
-482278607, |
||||
|
1067190005, |
||||
|
333444553, |
||||
|
86502381, |
||||
|
753481377, |
||||
|
39000101, |
||||
|
1779014585, |
||||
|
219658653, |
||||
|
-920253679, |
||||
|
2029538901, |
||||
|
1207761577, |
||||
|
-1515772851, |
||||
|
-236195711, |
||||
|
442620293, |
||||
|
423166617, |
||||
|
-1763648515, |
||||
|
-398436623, |
||||
|
-1749358155, |
||||
|
-538598519, |
||||
|
-652439379, |
||||
|
430550625, |
||||
|
-1481396507, |
||||
|
2093206905, |
||||
|
-1934691747, |
||||
|
-962631983, |
||||
|
1454463253, |
||||
|
-1877118871, |
||||
|
-291917555, |
||||
|
-1711673279, |
||||
|
201201733, |
||||
|
-474645415, |
||||
|
-96764739, |
||||
|
-1587365199, |
||||
|
1945705589, |
||||
|
1303896393, |
||||
|
1744831853, |
||||
|
381957665, |
||||
|
2135332261, |
||||
|
-55996615, |
||||
|
-1190135011, |
||||
|
1790562961, |
||||
|
-1493191723, |
||||
|
475559465, |
||||
|
69069 |
||||
|
}; |
||||
|
|
||||
|
double c7rand (Sw *is) |
||||
|
{ |
||||
|
Sw it, leh; |
||||
|
|
||||
|
it = is [0]; |
||||
|
leh = is [1]; |
||||
|
/* Do a 7-place right cyclic shift of it */ |
||||
|
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
||||
|
if (!(it & 0x80000000)) it = it ^ MASK; |
||||
|
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
||||
|
is [0] = it; is [1] = leh; |
||||
|
if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; |
||||
|
return (SCALE * leh); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
|
||||
|
Sw irandm (Sw *is) |
||||
|
{ |
||||
|
Sw it, leh; |
||||
|
|
||||
|
it = is [0]; |
||||
|
leh = is [1]; |
||||
|
/* Do a 7-place right cyclic shift of it */ |
||||
|
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
||||
|
if (!(it & 0x80000000)) it = it ^ MASK; |
||||
|
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
||||
|
is [0] = it; is [1] = leh; |
||||
|
if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; |
||||
|
return (leh); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
unsigned int urandm (Sw *is) |
||||
|
{ |
||||
|
Sw it, leh; |
||||
|
|
||||
|
it = is [0]; |
||||
|
leh = is [1]; |
||||
|
/* Do a 7-place right cyclic shift of it */ |
||||
|
it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); |
||||
|
if (!(it & 0x80000000)) it = it ^ MASK; |
||||
|
leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; |
||||
|
is [0] = it; is [1] = leh; |
||||
|
return (leh); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* --------------- (Chi-squared) adchi ----------------------- */ |
||||
|
/* Simple implementation of Ahrens and Dieter method for a chi-sq |
||||
|
random variate of order a >> 1. Uses c7rand, maths library */ |
||||
|
/* 13 July 1998 */ |
||||
|
/* Slightly faster if 'a' is the same as on previous call */ |
||||
|
/* This routine is no longer used in the fastnorm code, but is included |
||||
|
because it may be useful */ |
||||
|
|
||||
|
|
||||
|
static double gorder, gm, rt2gm, aold; |
||||
|
|
||||
|
double adchi (double a, int *is) |
||||
|
{ |
||||
|
double x, y, z, sq; |
||||
|
|
||||
|
if (a != aold) { |
||||
|
aold = a; gorder = 0.5 * a; |
||||
|
gm = gorder - 1.0; |
||||
|
rt2gm = sqrt (aold - 1.0); |
||||
|
} |
||||
|
|
||||
|
polar: |
||||
|
x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); |
||||
|
sq = x*x + z*z; |
||||
|
if ((sq > 1.0) || (sq < 0.25)) goto polar; |
||||
|
y = x / z; |
||||
|
x = rt2gm * y + gm; |
||||
|
if (x < 0.0) goto polar; |
||||
|
|
||||
|
z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); |
||||
|
if (c7rand(is) > z) goto polar; |
||||
|
|
||||
|
return (2.0 * x); |
||||
|
} |
||||
|
|
||||
|
/* -------------------- (Gamma) rgamma (g, is) ----------- */ |
||||
|
|
||||
|
double rgamma (double g, int *is) |
||||
|
{ |
||||
|
double x, y, z, sq; |
||||
|
|
||||
|
if (g != gorder) { |
||||
|
gorder = g; |
||||
|
gm = gorder - 1.0; aold = 2.0 * gorder; |
||||
|
rt2gm = sqrt (aold - 1.0); |
||||
|
} |
||||
|
|
||||
|
polar: |
||||
|
x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); |
||||
|
sq = x*x + z*z; |
||||
|
if ((sq > 1.0) || (sq < 0.25)) goto polar; |
||||
|
y = x / z; |
||||
|
x = rt2gm * y + gm; |
||||
|
if (x < 0.0) goto polar; |
||||
|
|
||||
|
z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); |
||||
|
if (c7rand(is) > z) goto polar; |
||||
|
|
||||
|
return (x); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* ------------------ Revision history Fastnorm ------------- */ |
||||
|
/* Items in this revision history appear in chronological order, |
||||
|
so the most recent revsion appears last. |
||||
|
Revision items are separated by a line of '+' characters. |
||||
|
|
||||
|
++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
||||
|
This is a revised version of the algorithm decribed in |
||||
|
|
||||
|
ACM Transactions on Mathematical Software, Vol 22, No 1 |
||||
|
March 1996, pp 119-127. |
||||
|
|
||||
|
A fast generator of pseudo-random variates from the unit Normal |
||||
|
distribution. It keeps a pool of about 1000 variates, and generates new |
||||
|
ones by picking 4 from the pool, rotating the 4-vector with these as its |
||||
|
components, and replacing the old variates with the components of the |
||||
|
rotated vector. |
||||
|
|
||||
|
The program should initialize the generator by calling initnorm(seed) |
||||
|
with seed a Sw integer seed value. Different seed values will give |
||||
|
different sequences of Normals. Seed may be any 32-bit integer. |
||||
|
BUT SEE REVISION of 17 May 2003 for initnorm() parameters. |
||||
|
The revised initnorm requires two integer parameters, iseed and |
||||
|
quoll, the latter specifying a tradeoff between speed and |
||||
|
quality. |
||||
|
Then, wherever the program needs a new Normal variate, it should |
||||
|
use the macro FastNorm, e.g. in statements like: |
||||
|
x = FastNorm; (Sets x to a random Normal value) |
||||
|
or |
||||
|
x += a + FastNorm * b; (Adds a normal with mean a, SD b, to x) |
||||
|
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
||||
|
Changed basic formula, which was: |
||||
|
t = (p+q+r+s)*0.5; p = p-t; q = t-q; r = t-r; s = t-s; |
||||
|
This gives sum of new p+q+r+s = 2p(old) which may not be a great |
||||
|
choice. The new version is: |
||||
|
t = (p+q+r+s)*0.5; p = p-t; q = q-t; r = t-r; s = t-s; |
||||
|
which gives new p+q+r+s = p+q-r-s (old) which may be better. |
||||
|
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
||||
|
|
||||
|
Revision 14 November 1998 |
||||
|
The older version "FastNorm" which was available via ftp was found |
||||
|
to have a defect which could affect some applications. |
||||
|
|
||||
|
Dr Christine Rueb, (Max Planck Institut fur Infomatik, |
||||
|
Im Stadtwald W 66123 Saabrucken, F.G.R., |
||||
|
(rueb@mpi-sb.mpg.de) |
||||
|
|
||||
|
found that if a large number N of consecutive variates were summed to give |
||||
|
a variate S with nominally N(0,N) distribution, the variance of S was in some |
||||
|
cases too small. The effect was noticed with N=400, and was particularly strong |
||||
|
for N=1023 if the first several (about 128) variates from FastNorm were |
||||
|
discarded. Dr. Rueb traced the effect to an unexpected tendency of FastNorm |
||||
|
to concentrate values with an anomolous correlation into the first 128 |
||||
|
elements of the variate pool. |
||||
|
With the help of her analysis, the algorithm has been revised in a |
||||
|
way which appears to overcome the problem, at the cost of about a 19% |
||||
|
reduction in speed (which still leaves the method very fast.) |
||||
|
|
||||
|
IT MUST BE RECOGNISED THAT THIS ALGORITHM IS NOVEL |
||||
|
AND WHILE IT PASSES A NUMBER OF STANDARD TESTS FOR DISTRIBUTIONAL |
||||
|
FORM, LACK OF SERIAL CORRELATION ETC., IT MAY STILL HAVE DEFECTS. |
||||
|
|
||||
|
RECALL THE NUMBER OF YEARS WHICH IT TOOK FOR THE LIMITATIONS OF |
||||
|
THE LEHMER GENERATOR FOR UNIFORM VARIATES TO BECOME APPARENT !!! |
||||
|
|
||||
|
UNTIL MORE EXPERIENCE IS GAINED WITH THIS TYPE OF GENERATOR, IT |
||||
|
WOULD BE WISE IN ANY CRITICAL APPLICATION TO COMPARE RESULTS |
||||
|
OBTAINED USING IT WITH RESULTS OBTAINED USING A "STANDARD" FORM |
||||
|
OF GENERATOR OF NORMAL VARIATES COUPLED WITH A WELL-DOCUMENTED |
||||
|
GENERATOR OF UNIFORM VARIATES. |
||||
|
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
||||
|
|
||||
|
Revision 1 April 2003. |
||||
|
Trying a scanning process proposed by R.P.Brent. It needs 2 pool |
||||
|
vectors, as it cannot update in-situ, but may be more robust. |
||||
|
It is a bit slower on a 133 Mhz PC but just as fast on a newer PC |
||||
|
(moggie) at about 16 ns per call in the 'speed.c' test. |
||||
|
The extreme-value defect is the same on old and new versions. |
||||
|
If one finds a value 'A' such that a batch of B genuine Normal variates has |
||||
|
probability 0.2 of containing a variate with abolute value greater than A, |
||||
|
then the probability that both of two consecive batches of B will contain |
||||
|
such a value should be 0.2 times 0.2, or 0.04. Instead, both versions give |
||||
|
the extreme value prob. as 0.200 (over a million batches) but give the |
||||
|
consective-pair prob as 0.050 for batch size B = 1024. |
||||
|
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
||||
|
|
||||
|
Revision 17 May 2003. |
||||
|
The fundamental defect of the method, namely an inadequate 'mixing' |
||||
|
of squared value ('energy') between one generation of the pool and the next, |
||||
|
cannot readily be removed. In going from one pool to the next, the energy |
||||
|
in an old variate is shared among just 4 variates in the new pool. Hence it |
||||
|
takes many generations before the energy of some original variate can be |
||||
|
distributed across the whole pool. The number of generations needed cannot |
||||
|
be less than the log to base 4 of the pool size, or 5 for a pool size of |
||||
|
1024. In fact, the pseudo-random indexing of the pool means that rather |
||||
|
more generations are needed on average. |
||||
|
The defect is readily revealed by the following test. One picks a |
||||
|
"batch size" comparable to the pool size, say 500 or 1000. One then |
||||
|
computes a value A such that a batch will with probability 0.2 contain one |
||||
|
or more variates with absolute value exceeding A. |
||||
|
One then draws batches from FastNorm, |
||||
|
and tests each batch to see if it contains such an extreme value. |
||||
|
Over many batches, one counts the frequency of such 'extreme' batches, |
||||
|
and finds (with FastNorm2) that it is indeed about 0.2. However, when one counts |
||||
|
the frequency with which succesive batches are both extreme, one finds it to |
||||
|
be higher than the proper value (0.2)^2 = 0.04. For batch sizes round the pool |
||||
|
size, it can be as high as 0.05. That is, although the frequncy of extreme |
||||
|
values is about right, their occurrence in the stream is correlated over a |
||||
|
scale of the order of the pool size. |
||||
|
The same correlation effect is seen in the average 4th moment of |
||||
|
successive batches. |
||||
|
Since this inter-generational correlation cannot be avoided, the |
||||
|
this revision seeks to reduce it by performing at least two simple |
||||
|
rotations of the pool at each generation. Obviously, some speed is lost, |
||||
|
but the correlations are reduced. |
||||
|
To allow the user to trade off speed and quality, the initialization |
||||
|
function initnorm() now provides a QUALITY parameter 'quoll' which controls |
||||
|
how many double-rotations are done for each generation. |
||||
|
See the comments in initnorm() for more detail. |
||||
|
++++++++++ End of revision notes +++++++++ */ |
||||
|
|
||||
|
|
||||
|
|
||||
|
/* ----------------- Some test results ------------------------ */ |
||||
|
/* |
||||
|
General form: |
||||
|
Some simple tests were conducted by transforming FastNorm variates |
||||
|
in several ways to yield a variable nominally uniformly distributed in 0 ... 1. |
||||
|
Uniformity of the derived variate was then tested by a ChiSquared test on a |
||||
|
100-cell histogram with cell counts around 10000. These tests are crude, but |
||||
|
showed no untoward results on the present version. |
||||
|
Transformations included: |
||||
|
y = 0.5 * (1.0 + erf (n1 / sqrt(2)) |
||||
|
|
||||
|
y = 0.5 * (n1 / (n1^2 + n2^2 + n3^2) - 1) |
||||
|
|
||||
|
y = exp (-0.5 * (n1^2 + n2^2)) |
||||
|
|
||||
|
y = (n1^2 + n2^2) / (n1^2 + n2^2 + n3^2 + n4^2) |
||||
|
|
||||
|
where n1, n2 etc are successive Normal variates. |
||||
|
It may be noted that some of these are sensitive to serial correlation if |
||||
|
present. |
||||
|
|
||||
|
Fourth moment of batches: |
||||
|
Extensive tests for correlation among the fourth moments of successive |
||||
|
batches of variates were made, with batch sizes comparabe to or (worst case) |
||||
|
equal to the size of the variate pool (4096 in this revision). |
||||
|
With 'quality' 1, significant correlation appears after 10^6 batches |
||||
|
of worst-case size. |
||||
|
With quality 2, no significant correlation is evident after 10^7 |
||||
|
batches. A just-significant correlation appears after 3.6*10^7 batches. |
||||
|
As this requires some 1.4*10^11 deviates to be drawn, it may be irrelevent |
||||
|
for many applications. The observed correlation coefficent was 0.0008. |
||||
|
With quality 3, results are OK after 10^8 batches, or more than |
||||
|
4*10^11 variates. |
||||
|
No tests have been done with quality 4 as yet. |
||||
|
|
||||
|
Speed: |
||||
|
Speed tests were done on a PC running RedHat Linux, using "-O" |
||||
|
compiler optimization. The test loop was |
||||
|
for (i = 0; i < 500000000; i++) { |
||||
|
a += FastNorm; a -= FastNorm; |
||||
|
} |
||||
|
Thus the test makes 10^9 uses of FastNorm. The time taken, (which |
||||
|
includes time for a call in 'initnorm' and the loop overhead) depends on |
||||
|
the 'quality' set by initnorm. |
||||
|
Quality 1: 21.5 sec |
||||
|
Quality 2: 32.1 sec |
||||
|
Quality 3: 42.5 sec |
||||
|
Quality 4: 53.1 sec |
||||
|
|
||||
|
By way of comparison, the same 10^9 call loop was timed with the Unix library |
||||
|
"random()" routine substituted for FastNorm, and the variable 'a' defined as |
||||
|
integer rather than double. Also, since most use of a Uniform generator such |
||||
|
as "random()" requires that the returned integer be scaled into a floating- |
||||
|
point number in 0 ... 1, the timing was repeated with |
||||
|
"a += random" ('a' integer) replaced by "a += Scale*random()" where |
||||
|
'a' is double and Scale = 2^(-31). The times obtained were: |
||||
|
Random (integer): 44.1 sec |
||||
|
Random (double) : 47.7 sec |
||||
|
|
||||
|
It can be seen that FastNorm (even at quality 3) is faster than a |
||||
|
commonly-used Uniform generator. To some extent, this result may vary on |
||||
|
different computers and compilers. Since FastNorm (at least for qualities |
||||
|
above 1) no doubt does more arithmetic per variate than "random()", much of |
||||
|
its speed advantage must come from its replacement of a function call per |
||||
|
variate by a macro which makes only one function call every 4095 variates. |
||||
|
Computers with lower 'call' overheads than the PC used here might show |
||||
|
differnt results. |
||||
|
Incidently, the Uniform generator 'c7rand()' included in this |
||||
|
package, which returns a double uniform in 0 ... 1, and is of fairly high |
||||
|
quality, gives a time in the same test of 36.8 sec, a little faster than |
||||
|
'random()'. |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
/* ----------------- globals ------------------------- */ |
||||
|
/* A pool must have a length which is a multiple of 4. |
||||
|
* During regeneration of a new pool, the pool is treated as 4 |
||||
|
* consecutive vectors, each of length VL. |
||||
|
*/ |
||||
|
|
||||
|
#define VE 10 |
||||
|
#define VL (1 << VE) |
||||
|
#define VM (VL-1) |
||||
|
#define WL (4*VL) |
||||
|
#define WM (WL-1) |
||||
|
|
||||
|
Sw gaussfaze; |
||||
|
Sf *gausssave; |
||||
|
Sf GScale; |
||||
|
/* GScale,fastnorm,gaussfaze, -save must be visible to callers*/ |
||||
|
static Sf chic1, chic2; /* Constants used in getting ChiSq_WL */ |
||||
|
Sw gslew; /* Counts generations */ |
||||
|
static Sw qual; /* Sets number of double transforms per generation. */ |
||||
|
static Sw c7g [2]; /* seed values for c7rand */ |
||||
|
|
||||
|
Sf wk1 [WL], wk2 [WL]; /* Pools of variates. */ |
||||
|
|
||||
|
|
||||
|
/* ------------------ regen ---------------------- */ |
||||
|
/* Takes variates from wk1[], transforms to wk[2], then back to wk1[]. |
||||
|
*/ |
||||
|
void regen () |
||||
|
{ |
||||
|
Sw i, j, k, m; |
||||
|
Sf p, q, r, s, t; |
||||
|
Sw topv[6], ord[4], *top; |
||||
|
Sf *ppt[4], *ptn; |
||||
|
|
||||
|
/* Choose 4 random start points in the wk1[] vector |
||||
|
I want them all different. */ |
||||
|
|
||||
|
top = topv + 1; |
||||
|
/* Set limiting values in top[-1], top[4] */ |
||||
|
top[-1] = VL; top[4] = 0; |
||||
|
reran1: |
||||
|
m = irandm (c7g); /* positive 32-bit random */ |
||||
|
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
||||
|
m = m >> (31 - 2*VE); |
||||
|
top[0] = m & VM; m = m >> VE; top[1] = m & VM; |
||||
|
m = irandm (c7g); /* positive 32-bit random */ |
||||
|
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
||||
|
m = m >> (31 - 2*VE); |
||||
|
top[2] = m & VM; m = m >> VE; top[3] = m & VM; |
||||
|
for (i = 0; i < 4; i++) ord[i] = i; |
||||
|
/* Sort in decreasing size */ |
||||
|
for (i = 2; i >= 0; i--) { |
||||
|
for (j = 0; j <= i; j++) { |
||||
|
if (top[j] < top[j+1]) { |
||||
|
k = top[j]; top[j] = top[j+1]; |
||||
|
top[j+1] = k; |
||||
|
k = ord[j]; ord[j] = ord[j+1]; |
||||
|
ord[j+1] = k; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
/* Ensure all different */ |
||||
|
for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } |
||||
|
|
||||
|
/* Set pt pointers to their start values for the first chunk. */ |
||||
|
for (i = 0; i < 4; i++) { |
||||
|
j = ord[i]; |
||||
|
ppt[j] = wk2 + j * VL + top[i]; |
||||
|
} |
||||
|
|
||||
|
/* Set ptn to point into wk1 */ |
||||
|
ptn = wk1; |
||||
|
|
||||
|
/* Now ready to do five chunks. The length of chunk i is |
||||
|
top[i-1] - top[i] (I hope) |
||||
|
At the end of chunk i, pointer ord[i] should have reached the end |
||||
|
of its part, and need to be wrapped down to the start of its part. |
||||
|
*/ |
||||
|
i = 0; |
||||
|
|
||||
|
chunk: |
||||
|
j = top[i] - top[i-1]; /* Minus the chunk length */ |
||||
|
for (; j < 0; j++) { |
||||
|
p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; |
||||
|
t = (p + q + r + s) * 0.5; |
||||
|
*ppt[0]++ = t - p; |
||||
|
*ppt[1]++ = t - q; |
||||
|
*ppt[2]++ = r - t; |
||||
|
*ppt[3]++ = s - t; |
||||
|
} |
||||
|
/* This should end the chunk. See if all done */ |
||||
|
if (i == 4) goto passdone; |
||||
|
|
||||
|
/* The pointer for part ord[i] should have passed its end */ |
||||
|
j = ord[i]; |
||||
|
#ifdef dddd |
||||
|
printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-wk2); |
||||
|
#endif |
||||
|
ppt[j] -= VL; |
||||
|
i++; |
||||
|
goto chunk; |
||||
|
|
||||
|
passdone: |
||||
|
/* wk1[] values have been transformed and placed in wk2[] |
||||
|
Transform from wk2 to wk1 with a simple shuffle */ |
||||
|
m = (irandm (c7g) >> (29 - VE)) & WM; |
||||
|
j = 0; |
||||
|
for (i = 0; i < 4; i++) ppt[i] = wk1 + i * VL; |
||||
|
for (i = 0; i < VL; i++) { |
||||
|
p = wk2[j^m]; j++; |
||||
|
s = wk2[j^m]; j++; |
||||
|
q = wk2[j^m]; j++; |
||||
|
r = wk2[j^m]; j++; |
||||
|
t = (p + q + r + s) * 0.5; |
||||
|
*ppt[0]++ = t - p; |
||||
|
*ppt[1]++ = q - t; |
||||
|
*ppt[2]++ = t - r; |
||||
|
*ppt[3]++ = s - t; |
||||
|
} |
||||
|
|
||||
|
/* We have a new lot of variates in wk1 */ |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* ------------------- renormalize --------------------------- */ |
||||
|
/* Rescales wk1[] so sum of squares = WL */ |
||||
|
/* Returns the original sum-of-squares */ |
||||
|
Sf renormalize (void) |
||||
|
{ |
||||
|
Sf ts, vv; |
||||
|
Sw i; |
||||
|
|
||||
|
ts = 0.0; |
||||
|
for (i = 0; i < WL; i++) { |
||||
|
ts += wk1[i] * wk1[i]; |
||||
|
} |
||||
|
vv = sqrt (WL / ts); |
||||
|
for (i = 0; i < WL; i++) wk1[i] *= vv; |
||||
|
return (ts); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* ------------------------ BoxMuller ---------------------- */ |
||||
|
/* Fills block gvec of length ll with proper normals */ |
||||
|
void boxmuller (Sf *gvec, Sw ll) |
||||
|
{ |
||||
|
Sw i; |
||||
|
Sf tx, ty, tr, tz; |
||||
|
|
||||
|
/* Here, replace the whole pool with conventional Normal variates */ |
||||
|
i = 0; |
||||
|
nextpair: |
||||
|
tx = 2.0 * c7rand(c7g) - 1.0; /* Uniform in -1..1 */ |
||||
|
ty = 2.0 * c7rand(c7g) - 1.0; /* Uniform in -1..1 */ |
||||
|
tr = tx * tx + ty * ty; |
||||
|
if ((tr > 1.0) || (tr < 0.25)) goto nextpair; |
||||
|
tz = -2.0 * log (c7rand(c7g)); /* Sum of squares */ |
||||
|
tz = sqrt ( tz / tr ); |
||||
|
gvec [i++] = tx * tz; gvec [i++] = ty * tz; |
||||
|
if (i < ll) goto nextpair; |
||||
|
/* Horrid, but good enough */ |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* ------------------------- initnorm ---------------------- */ |
||||
|
/* To initialize, given a seed integer and a quality level. |
||||
|
The seed can be any integer. The quality level quoll should be |
||||
|
between 1 and 4. Quoll = 1 gives high speed, but leaves some |
||||
|
correlation between the 4th moments of successive batches of values. |
||||
|
Higher values of quoll give lower speed but less correlation. |
||||
|
|
||||
|
If called with quoll = 0, initnorm performs a check that the |
||||
|
most crucial routine (regen) is performing correctly. In this |
||||
|
case, the value of 'iseed' is ignored. Initnorm will report the |
||||
|
results of the test, which compares pool values with check17 and |
||||
|
check98, which are defined below. |
||||
|
When a check call is made, a proper call on initnorm must then |
||||
|
be made before using the FastNorm macro. A check call does not |
||||
|
properly initialize the routines even if it succeeds. |
||||
|
*/ |
||||
|
static Sf check17 = 0.1255789; |
||||
|
static Sf check98 = -0.7113293; |
||||
|
|
||||
|
void initnorm (Sw seed, Sw quoll) |
||||
|
{ |
||||
|
Sw i; |
||||
|
|
||||
|
/* At one stage, we need to generate a random variable Z such that |
||||
|
(WL * Z*Z) has a Chi-squared-WL density. Now, a var with |
||||
|
an approximate Chi-sq-K distn can be got as |
||||
|
(A + B*n)**2 where n has unit Normal distn, |
||||
|
A**2 = K * sqrt (1 - 1/K), A**2 + B**2 = K. (For large K) |
||||
|
So we form Z as (1/sqrt(WL)) * (A + B*n) |
||||
|
or chic1 + chic2 * n where |
||||
|
chic1 = A / sqrt(WL), chic2 = B / sqrt(WL). |
||||
|
Hence |
||||
|
chic1 = sqrt (A*A / WL) = sqrt ( sqrt (1 - 1/WL)), |
||||
|
chic2 = sqrt (1 - chic1*chic1) |
||||
|
*/ |
||||
|
|
||||
|
chic1 = sqrt ( sqrt (1.0 - 1.0 / WL)); |
||||
|
chic2 = sqrt (1.0 - chic1 * chic1); |
||||
|
|
||||
|
/* Set regen counter "gslew" which will affect renormalizations. |
||||
|
Since pools are OK already, we wont't need to renorm for a |
||||
|
while */ |
||||
|
gslew = 1; |
||||
|
/* Finally, set "gaussfaze" to return all of wk1 |
||||
|
* except the last entry at WL-1 */ |
||||
|
gaussfaze = WL-1; |
||||
|
gausssave = wk1; |
||||
|
|
||||
|
/* If quoll = 0, do a check on installation */ |
||||
|
if (quoll == 0) goto docheck; |
||||
|
qual = quoll; |
||||
|
/* Check sensible values for quoll, say 1 to 4 */ |
||||
|
if ((quoll < 0) || (quoll > 4)) { |
||||
|
printf ("From initnorm(): quoll parameter %d out of\ |
||||
|
range 1 to 4\n", quoll); |
||||
|
return; |
||||
|
} |
||||
|
c7g[0] = seed; c7g[1] = -3337792; |
||||
|
|
||||
|
/* Fill wk1[] with good normals */ |
||||
|
boxmuller (wk1, WL); |
||||
|
/* Scale so sum-of-squares = WL */ |
||||
|
GScale = sqrt (renormalize () / WL); |
||||
|
/* We have set |
||||
|
GScale to restore the original ChiSq_WL sum-of-squares */ |
||||
|
return; |
||||
|
|
||||
|
docheck: |
||||
|
/* Set a simple pattern in wk1[] and test results of regen */ |
||||
|
for (i = 0; i < WL; i++) wk1[i] = wk2[i] = 0.0; |
||||
|
wk1[0] = sqrt ((double) WL); |
||||
|
c7g[0] = 1234567; c7g[1] = 9876543; |
||||
|
for (i = 0; i < 60; i++) regen(); |
||||
|
/* Check a couple of values */ |
||||
|
if ((fabs (wk1[17] - check17) > 0.00001) || |
||||
|
(fabs (wk1[98] - check98) > 0.00001)) { |
||||
|
printf ("\nInitnorm check failed.\n"); |
||||
|
printf ("Expected %8.5f got %10.7f\n", check17, wk1[17]); |
||||
|
printf ("Expected %8.5f got %10.7f\n", check98, wk1[98]); |
||||
|
} |
||||
|
else printf ("\nInitnorm check OK\n"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* ---------------------- fastnorm -------------------------- */ |
||||
|
/* If gslew shows time is ripe, renormalizes the pool |
||||
|
fastnorm() returns the value GScale*gausssave[0]. |
||||
|
*/ |
||||
|
|
||||
|
Sf fastnorm () |
||||
|
{ |
||||
|
Sf sos; |
||||
|
Sw n1; |
||||
|
|
||||
|
if (! (gslew & 0xFFFF)) { |
||||
|
sos = renormalize (); |
||||
|
} |
||||
|
|
||||
|
/* The last entry of gausssave, at WL-1, will not have been used. |
||||
|
Use it to get an approx. to sqrt (ChiSq_WL / WL). |
||||
|
See initnorm() code for details */ |
||||
|
GScale = chic1 + chic2 * GScale * gausssave [WL-1]; |
||||
|
for (n1 = 0; n1 < qual; n1++) regen (); |
||||
|
gslew++; |
||||
|
|
||||
|
gaussfaze = WL - 1; |
||||
|
|
||||
|
return (GScale * gausssave [0]); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* --------------------- (test) main ------------------------- */ |
||||
|
#ifdef Main |
||||
|
#include "FastNorm3.h" |
||||
|
int main() |
||||
|
{ |
||||
|
Sf x; Sw i; |
||||
|
initnorm (0, 0); |
||||
|
initnorm (77, 2); |
||||
|
printf ("SoS %20.6f\n", renormalize()); |
||||
|
// for (i = 0; i < 2000000; i++) x = FastNorm; |
||||
|
for (i = 0; i < 200; i++) { |
||||
|
x = FastNorm; |
||||
|
printf("%d\t%f\n", i, x); |
||||
|
} |
||||
|
printf ("SoS %20.6f\n", renormalize()); |
||||
|
exit (1); |
||||
|
} |
||||
|
#endif |
||||
@ -0,0 +1,10 @@ |
|||||
|
noinst_LTLIBRARIES = libtrannoise.la |
||||
|
|
||||
|
libtrannoise_la_SOURCES = \
|
||||
|
FastNorm3.c \
|
||||
|
1-f-code.c \
|
||||
|
wallace.c |
||||
|
|
||||
|
AM_CPPFLAGS = -I$(top_srcdir)/src/include -I$(top_srcdir)/src/frontend |
||||
|
|
||||
|
MAINTAINERCLEANFILES = Makefile.in |
||||
@ -0,0 +1,532 @@ |
|||||
|
/* Wallace generator for normally distributed random variates |
||||
|
Copyright: Holger Vogt, 2008 |
||||
|
|
||||
|
*/ |
||||
|
|
||||
|
//#define FASTNORM_ORIG |
||||
|
|
||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#ifdef _MSC_VER |
||||
|
#include <process.h> |
||||
|
#define getpid _getpid |
||||
|
#else |
||||
|
#include <unistd.h> |
||||
|
#endif |
||||
|
#include <math.h> |
||||
|
#include "wallace.h" |
||||
|
#include "FastNorm3.h" |
||||
|
|
||||
|
#ifdef HasMain |
||||
|
#include <sys/timeb.h> |
||||
|
#else |
||||
|
#ifndef NOSPICE |
||||
|
#include "ngspice.h" |
||||
|
#endif |
||||
|
#endif |
||||
|
|
||||
|
#define POOLSIZE 4096 |
||||
|
#define LPOOLSIZE 12 |
||||
|
#define NOTRANS 3 /* number of (dual) transformations */ |
||||
|
|
||||
|
#define VE 10 |
||||
|
#define VL (1 << VE) |
||||
|
#define VM (VL-1) |
||||
|
#define WL (4*VL) |
||||
|
#define WM (WL-1) |
||||
|
|
||||
|
double *outgauss; /* output vector for user access */ |
||||
|
unsigned int variate_used; /* actual index of variate called by user */ |
||||
|
double ScaleGauss; |
||||
|
|
||||
|
static double *pool1; |
||||
|
static double *pool2; |
||||
|
static unsigned int *addrif, *addrib; |
||||
|
static unsigned n = POOLSIZE; |
||||
|
static double chi1, chi2; /* chi^2 correction values */ |
||||
|
static unsigned int newpools; |
||||
|
|
||||
|
extern double drand(void); |
||||
|
extern unsigned int CombLCGTausInt(void); |
||||
|
extern void TausSeed(void); |
||||
|
extern unsigned int CombLCGTausInt2(void); |
||||
|
|
||||
|
|
||||
|
void PolarGauss(double* py1, double* py2) |
||||
|
{ |
||||
|
double x1, x2, w; |
||||
|
|
||||
|
do { |
||||
|
x1 = drand(); |
||||
|
x2 = drand(); |
||||
|
w = x1 * x1 + x2 * x2; |
||||
|
} while (( w > 1.0 ) || ( w < 0.25)); |
||||
|
|
||||
|
w = sqrt( (-2.0 * log( w ) ) / w ); |
||||
|
|
||||
|
*py1 = (double)(x1 * w); |
||||
|
*py2 = (double)(x2 * w); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
|
||||
|
|
||||
|
|
||||
|
void initw(void) |
||||
|
{ |
||||
|
unsigned i; |
||||
|
double totsqr, nomsqr; |
||||
|
unsigned long int coa, cob, s; |
||||
|
|
||||
|
/* initialize the uniform generator */ |
||||
|
srand(getpid()); |
||||
|
// srand(17); |
||||
|
TausSeed(); |
||||
|
|
||||
|
ScaleGauss = 1.; |
||||
|
newpools = 1; |
||||
|
|
||||
|
/* set up the two pools */ |
||||
|
pool1 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); |
||||
|
pool2 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); |
||||
|
addrif = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); |
||||
|
addrib = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); |
||||
|
|
||||
|
/* fill the first pool with normally distributed values */ |
||||
|
PolarGauss(&pool1[0], &pool1[1]); |
||||
|
for (i = 1; i < n>>1; i++) { |
||||
|
PolarGauss(&pool1[i<<1], &pool1[(i<<1) + 1]); |
||||
|
} |
||||
|
/* normalize pool content */ |
||||
|
/* totsqr = totsum = 0.0; |
||||
|
for (i = 0; i < n; i++) { |
||||
|
totsqr += pool1[i] * pool1[i]; |
||||
|
totsum += pool1[i]; |
||||
|
} |
||||
|
totsum = totsum/n; |
||||
|
for (i = 0; i < n; i++) { |
||||
|
totsqr += (pool1[i] - totsum) * (pool1[i] - totsum); |
||||
|
} |
||||
|
nomsqr = sqrt(n / totsqr); |
||||
|
for (i = 0; i < n; i++) |
||||
|
pool1[i] = (pool1[i] - totsum) * nomsqr; |
||||
|
*/ |
||||
|
totsqr = 0.0; |
||||
|
for (i = 0; i < n; i++) |
||||
|
totsqr += pool1[i] * pool1[i]; |
||||
|
nomsqr = sqrt(n / totsqr); |
||||
|
for (i = 0; i < n; i++) |
||||
|
pool1[i] *= nomsqr; |
||||
|
|
||||
|
/* calculate ch^2 value */ |
||||
|
chi1 = sqrt ( sqrt (1.0 - 1.0/n)); |
||||
|
chi2 = sqrt ( 1.0 - chi1*chi1); |
||||
|
|
||||
|
/* first scaling, based on unused pool1[n-2] */ |
||||
|
ScaleGauss = chi1 + chi2 * ScaleGauss * pool1[n-2]; |
||||
|
/* access to first pool */ |
||||
|
outgauss = pool1; |
||||
|
/* set data counter, we return n-2 values here */ |
||||
|
variate_used = n - 2; |
||||
|
|
||||
|
/* generate random reading addresses using a LCG */ |
||||
|
s = 0; |
||||
|
coa = 241; |
||||
|
cob = 59; |
||||
|
for (i=0; i < (n + NOTRANS); i++) { |
||||
|
// addrif[i] = s = (s * coa + cob) % ( n ); |
||||
|
coa = CombLCGTausInt(); |
||||
|
addrif[i] = coa >> (32 - LPOOLSIZE); |
||||
|
// printf ("Random add:\t%ld\n" , s); |
||||
|
} |
||||
|
s = 0; |
||||
|
coa = 193; |
||||
|
cob = 15; |
||||
|
for (i=0; i < (n + NOTRANS); i++) { |
||||
|
// addrib[i] = s = (s * coa + cob) % ( n ); |
||||
|
coa = CombLCGTausInt(); |
||||
|
addrib[i] = coa >> (32 - LPOOLSIZE); |
||||
|
// printf ("Random add:\t%ld\n" , addrib[i]); |
||||
|
} |
||||
|
|
||||
|
// printf("norm for orig. Gauss: %e, chi^2 scale: %e\n", nomsqr, ScaleGauss); |
||||
|
// NewWa(); |
||||
|
} |
||||
|
|
||||
|
/* original FastNorm3.c code */ |
||||
|
#ifdef FASTNORM_ORIG |
||||
|
float NewWa () |
||||
|
{ |
||||
|
int i, j, k, m; |
||||
|
float p, q, r, s, t; |
||||
|
int topv[6], ord[4], *top; |
||||
|
float *ppt[4], *ptn; |
||||
|
|
||||
|
float nulval, endval; |
||||
|
float totsqr, nomsqr; |
||||
|
nulval = ScaleGauss * pool1[0]; |
||||
|
endval = pool1[n-1]; |
||||
|
|
||||
|
/* Choose 4 random start points in the wk1[] vector |
||||
|
I want them all different. */ |
||||
|
|
||||
|
top = topv + 1; |
||||
|
/* Set limiting values in top[-1], top[4] */ |
||||
|
top[-1] = VL; top[4] = 0; |
||||
|
reran1: |
||||
|
m = CombLCGTausInt(); /* positive 32-bit random */ |
||||
|
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
||||
|
m = m >> (31 - 2*VE); |
||||
|
top[0] = m & VM; m = m >> VE; top[1] = m & VM; |
||||
|
m = CombLCGTausInt(); /* positive 32-bit random */ |
||||
|
/* Extract two VE-sized randoms from m, which has 31 useable digits */ |
||||
|
m = m >> (31 - 2*VE); |
||||
|
top[2] = m & VM; m = m >> VE; top[3] = m & VM; |
||||
|
for (i = 0; i < 4; i++) ord[i] = i; |
||||
|
/* Sort in decreasing size */ |
||||
|
for (i = 2; i >= 0; i--) { |
||||
|
for (j = 0; j <= i; j++) { |
||||
|
if (top[j] < top[j+1]) { |
||||
|
k = top[j]; top[j] = top[j+1]; |
||||
|
top[j+1] = k; |
||||
|
k = ord[j]; ord[j] = ord[j+1]; |
||||
|
ord[j+1] = k; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
/* Ensure all different */ |
||||
|
for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } |
||||
|
|
||||
|
/* Set pt pointers to their start values for the first chunk. */ |
||||
|
for (i = 0; i < 4; i++) { |
||||
|
j = ord[i]; |
||||
|
ppt[j] = pool2 + j * VL + top[i]; |
||||
|
} |
||||
|
|
||||
|
/* Set ptn to point into wk1 */ |
||||
|
ptn = pool1; |
||||
|
|
||||
|
/* Now ready to do five chunks. The length of chunk i is |
||||
|
top[i-1] - top[i] (I hope) |
||||
|
At the end of chunk i, pointer ord[i] should have reached the end |
||||
|
of its part, and need to be wrapped down to the start of its part. |
||||
|
*/ |
||||
|
i = 0; |
||||
|
|
||||
|
chunk: |
||||
|
j = top[i] - top[i-1]; /* Minus the chunk length */ |
||||
|
for (; j < 0; j++) { |
||||
|
p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; |
||||
|
t = (p + q + r + s) * 0.5; |
||||
|
*ppt[0]++ = t - p; |
||||
|
*ppt[1]++ = t - q; |
||||
|
*ppt[2]++ = r - t; |
||||
|
*ppt[3]++ = s - t; |
||||
|
} |
||||
|
/* This should end the chunk. See if all done */ |
||||
|
if (i == 4) goto passdone; |
||||
|
|
||||
|
/* The pointer for part ord[i] should have passed its end */ |
||||
|
j = ord[i]; |
||||
|
#ifdef dddd |
||||
|
printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-pool2); |
||||
|
#endif |
||||
|
ppt[j] -= VL; |
||||
|
i++; |
||||
|
goto chunk; |
||||
|
|
||||
|
passdone: |
||||
|
/* wk1[] values have been transformed and placed in wk2[] |
||||
|
Transform from wk2 to wk1 with a simple shuffle */ |
||||
|
m = (CombLCGTausInt2() >> (29 - VE)) & WM; |
||||
|
j = 0; |
||||
|
for (i = 0; i < 4; i++) ppt[i] = pool1 + i * VL; |
||||
|
for (i = 0; i < VL; i++) { |
||||
|
p = pool2[j^m]; j++; |
||||
|
s = pool2[j^m]; j++; |
||||
|
q = pool2[j^m]; j++; |
||||
|
r = pool2[j^m]; j++; |
||||
|
t = (p + q + r + s) * 0.5; |
||||
|
*ppt[0]++ = t - p; |
||||
|
*ppt[1]++ = q - t; |
||||
|
*ppt[2]++ = t - r; |
||||
|
*ppt[3]++ = s - t; |
||||
|
} |
||||
|
|
||||
|
/* renormalize again if number of pools beyond limit */ |
||||
|
if (!(newpools & 0xFFFF)) { |
||||
|
totsqr = 0.0; |
||||
|
for (i = 0; i < n; i++) |
||||
|
totsqr += pool1[i] * pool1[i]; |
||||
|
nomsqr = sqrt(n / totsqr); |
||||
|
for (i = 0; i < n; i++) |
||||
|
pool1[i] *= nomsqr; |
||||
|
} |
||||
|
|
||||
|
outgauss = pool1; |
||||
|
/* reset data counter */ |
||||
|
variate_used = n - 1; |
||||
|
|
||||
|
/* set counter counting nomber of pools made */ |
||||
|
newpools++; |
||||
|
|
||||
|
/* new scale factor using ch^2 correction, |
||||
|
using pool1[n-1] from last pool */ |
||||
|
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
||||
|
|
||||
|
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
||||
|
|
||||
|
return nulval; /* use old scale */ |
||||
|
|
||||
|
} |
||||
|
|
||||
|
#else |
||||
|
|
||||
|
/* Simplified code according to an algorithm published by C. S. Wallace: |
||||
|
"Fast Pseudorandom Generators for Normal and Exponential Variates", |
||||
|
ACM Transactions on Mathmatical Software, Vol. 22, No. 1, March 1996, pp. 119-127. |
||||
|
Transform pool1 to pool2 and back to pool1 NOTRANS times |
||||
|
by orthogonal 4 x 4 Hadamard-Matrix. |
||||
|
Mixing of values is very important: Any value in the pool should contribute to |
||||
|
every value in the new pools, at least after several passes (number of passes |
||||
|
is set by NOTRANS to 2 or 3). |
||||
|
4 values are read in a continuous sequence from the total of POOLSIZE values. |
||||
|
Values are stored in steps modulo POOLSIZE/4. |
||||
|
During backward transformation the values are shuffled by a random number jj. |
||||
|
*/ |
||||
|
|
||||
|
double NewWa(void) |
||||
|
{ |
||||
|
double nulval, endval; |
||||
|
double bl1, bl2, bl3, bl4; /* the four values to be transformed */ |
||||
|
double bsum; |
||||
|
double totsqr, nomsqr; |
||||
|
unsigned int i, j, jj, m, mm, mmm; |
||||
|
|
||||
|
nulval = ScaleGauss * pool1[0]; |
||||
|
endval = pool1[n-1]; |
||||
|
m = n >> 2; |
||||
|
// printf("New pool after next value\n"); |
||||
|
|
||||
|
/* generate new pool by transformation |
||||
|
Transformation is repeated NOTRANS times */ |
||||
|
for (i=0; i < NOTRANS; i++) { |
||||
|
mm = m << 1; |
||||
|
mmm = mm + m; |
||||
|
/* forward transformation */ |
||||
|
// for (j=0; j < n; j += 4) { |
||||
|
for (j=0; j < m; j++) { |
||||
|
bl1 = pool1[j]; |
||||
|
bl2 = pool1[j+m]; |
||||
|
bl3 = pool1[j+mm]; |
||||
|
bl4 = pool1[j+mmm]; |
||||
|
/* Hadamard-Matrix */ |
||||
|
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; |
||||
|
jj = j<<2; |
||||
|
pool2[jj] = bl1 - bsum; |
||||
|
pool2[jj+1] = bl2 - bsum; |
||||
|
pool2[jj+2] = bsum - bl3; |
||||
|
pool2[jj+3] = bsum - bl4; |
||||
|
} |
||||
|
/* backward transformation */ |
||||
|
jj = (CombLCGTausInt2() >> (31 - LPOOLSIZE)) & (n - 1); |
||||
|
for (j=0; j < m; j++) { |
||||
|
bl1 = pool2[j^jj]; |
||||
|
bl2 = pool2[(j+m)^jj]; |
||||
|
bl3 = pool2[(j+mm)^jj]; |
||||
|
bl4 = pool2[(j+mmm)^jj]; |
||||
|
/* Hadamard-Matrix */ |
||||
|
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; |
||||
|
jj = j<<2; |
||||
|
pool1[jj] = bl1 - bsum; |
||||
|
pool1[jj+1] = bl2 - bsum; |
||||
|
pool1[jj+2] = bsum - bl3; |
||||
|
pool1[jj+3] = bsum - bl4; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/* renormalize again if number of pools beyond limit */ |
||||
|
if (!(newpools & 0xFFFF)) { |
||||
|
totsqr = 0.0; |
||||
|
for (i = 0; i < n; i++) |
||||
|
totsqr += pool1[i] * pool1[i]; |
||||
|
nomsqr = sqrt(n / totsqr); |
||||
|
for (i = 0; i < n; i++) |
||||
|
pool1[i] *= nomsqr; |
||||
|
} |
||||
|
|
||||
|
outgauss = pool1; |
||||
|
/* reset data counter */ |
||||
|
variate_used = n - 1; |
||||
|
|
||||
|
/* set counter counting nomber of pools made */ |
||||
|
newpools++; |
||||
|
|
||||
|
/* new scale factor using ch^2 correction, |
||||
|
using pool1[n-1] from previous pool */ |
||||
|
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
||||
|
|
||||
|
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
||||
|
|
||||
|
return nulval; /* use old scale */ |
||||
|
// return pool1[0]; /* use new scale */ |
||||
|
} |
||||
|
|
||||
|
#endif |
||||
|
|
||||
|
|
||||
|
#ifdef FASTNORMTEST |
||||
|
float NewWa_not(void) |
||||
|
{ |
||||
|
float nulval, endval; |
||||
|
float bl1, bl2, bl3, bl4; /* the four values to be transformed */ |
||||
|
float bsum; |
||||
|
float totsqr, nomsqr; |
||||
|
unsigned int i, j, jj; |
||||
|
nulval = ScaleGauss * pool1[0]; |
||||
|
endval = pool1[n-1]; |
||||
|
|
||||
|
// printf("New pool after next value\n"); |
||||
|
|
||||
|
/* generate new pool by transformation |
||||
|
Transformation is repeated NOTRANS times */ |
||||
|
for (i=0; i < NOTRANS; i++) { |
||||
|
|
||||
|
/* forward transformation */ |
||||
|
for (j=0; j < n; j += 4) { |
||||
|
jj = j + i; |
||||
|
bl1 = pool1[addrif[jj]]; |
||||
|
bl2 = pool1[addrif[jj+1]]; |
||||
|
bl3 = pool1[addrif[jj+2]]; |
||||
|
bl4 = pool1[addrif[jj+3]]; |
||||
|
/* s = (s*coa + cob) & (n - 1); |
||||
|
bl1 = pool1[s]; |
||||
|
s = (s*coa + cob) & (n - 1); |
||||
|
bl2 = pool1[s + 1]; |
||||
|
s = (s*coa + cob) & (n - 1); |
||||
|
bl3 = pool1[s + 2]; |
||||
|
s = (s*coa + cob) & (n - 1); |
||||
|
bl4 = pool1[s + 3]; */ |
||||
|
/* jj = j + i; |
||||
|
bl1 = pool1[addrif[jj]]; |
||||
|
bl2 = pool1[addrif[jj+1]]; |
||||
|
bl3 = pool1[addrif[jj+2]]; |
||||
|
bl4 = pool1[addrif[jj+3]]; */ |
||||
|
/* bl1 = pool1[j]; |
||||
|
bl2 = pool1[j+1]; |
||||
|
bl3 = pool1[j+2]; |
||||
|
bl4 = pool1[j+3]; */ |
||||
|
/* Hadamard-Matrix */ |
||||
|
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; |
||||
|
/* pool2[j] = bl1 - bsum; |
||||
|
pool2[j+1] = bl2 - bsum; |
||||
|
pool2[j+2] = bsum - bl3; |
||||
|
pool2[j+3] = bsum - bl4; */ |
||||
|
pool2[addrib[jj]] = bl1 - bsum; |
||||
|
pool2[addrib[jj+1]] = bl2 - bsum; |
||||
|
pool2[addrib[jj+2]] = bsum - bl3; |
||||
|
pool2[addrib[jj+3]] = bsum - bl4; |
||||
|
} |
||||
|
/* backward transformation */ |
||||
|
for (j=0; j < n; j += 4) { |
||||
|
bl1 = pool2[j]; |
||||
|
bl2 = pool2[j+1]; |
||||
|
bl3 = pool2[j+2]; |
||||
|
bl4 = pool2[j+3]; |
||||
|
/* bl1 = pool2[addrib[j]]; |
||||
|
bl2 = pool2[addrib[j+1]]; |
||||
|
bl3 = pool2[addrib[j+2]]; |
||||
|
bl4 = pool2[addrib[j+3]]; */ |
||||
|
/* Hadamard-Matrix */ |
||||
|
bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; |
||||
|
pool1[j] = bl1 - bsum; |
||||
|
pool1[j+1] = bl2 - bsum; |
||||
|
pool1[j+2] = bsum - bl3; |
||||
|
pool1[j+3] = bsum - bl4; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/* renormalize again if number of pools beyond limit */ |
||||
|
if (!(newpools & 0xFFFF)) { |
||||
|
totsqr = 0.0; |
||||
|
for (i = 0; i < n; i++) |
||||
|
totsqr += pool1[i] * pool1[i]; |
||||
|
nomsqr = sqrt(n / totsqr); |
||||
|
for (i = 0; i < n; i++) |
||||
|
pool1[i] *= nomsqr; |
||||
|
} |
||||
|
|
||||
|
outgauss = pool1; |
||||
|
/* reset data counter */ |
||||
|
variate_used = n - 1; |
||||
|
|
||||
|
/* set counter counting nomber of pools made */ |
||||
|
newpools++; |
||||
|
|
||||
|
/* new scale factor using ch^2 correction, |
||||
|
using pool1[n-1] from last pool */ |
||||
|
ScaleGauss = chi1 + chi2 * ScaleGauss * endval; |
||||
|
|
||||
|
// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); |
||||
|
|
||||
|
return nulval; /* use old scale */ |
||||
|
// return pool1[0]; /* use new scale */ |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
/* --------------------- (test) main ------------------------- */ |
||||
|
/* gcc -Wall -g -DHasMain -I../../include wallace.c CombTaus.o -o watest.exe */ |
||||
|
#ifdef HasMain |
||||
|
#include "wallace.h" |
||||
|
|
||||
|
struct timeb timenow; |
||||
|
struct timeb timebegin; |
||||
|
int sec, msec; |
||||
|
|
||||
|
void timediff(struct timeb *now, struct timeb *begin, int *sec, int *msec) |
||||
|
{ |
||||
|
|
||||
|
*msec = now->millitm - begin->millitm; |
||||
|
*sec = now->time - begin->time; |
||||
|
if (*msec < 0) { |
||||
|
*msec += 1000; |
||||
|
(*sec)--; |
||||
|
} |
||||
|
return; |
||||
|
|
||||
|
} |
||||
|
|
||||
|
|
||||
|
int main() |
||||
|
{ |
||||
|
float x; |
||||
|
unsigned int i; |
||||
|
long int count; |
||||
|
|
||||
|
initw(); |
||||
|
ftime(&timebegin); |
||||
|
count = 100000000; |
||||
|
for (i = 0; i < count; i++) { |
||||
|
x = GaussWa; |
||||
|
// printf("%d\t%f\n", i, x); |
||||
|
} |
||||
|
ftime(&timenow); |
||||
|
timediff(&timenow, &timebegin, &sec, &msec); |
||||
|
printf("WallaceHV: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); |
||||
|
|
||||
|
initnorm(0, 0); |
||||
|
initnorm(77, 3); |
||||
|
ftime(&timebegin); |
||||
|
count = 100000000; |
||||
|
for (i = 0; i < count; i++) { |
||||
|
x = FastNorm; |
||||
|
// printf("%d\t%f\n", i, x); |
||||
|
} |
||||
|
ftime(&timenow); |
||||
|
timediff(&timenow, &timebegin, &sec, &msec); |
||||
|
printf("FastNorm3: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); |
||||
|
|
||||
|
return (1); |
||||
|
} |
||||
|
#endif |
||||
@ -0,0 +1,7 @@ |
|||||
|
|
||||
|
|
||||
|
|
||||
|
void f_alpha(int n_pts, int n_exp, float X[], float Q_d, |
||||
|
float alpha); |
||||
|
|
||||
|
void rvfft(float X[], unsigned long int n); |
||||
@ -0,0 +1,32 @@ |
|||||
|
/* Last revised 28-1-1999 */ |
||||
|
/* This is the header file FastNorm3.h to be included in code files |
||||
|
using FastNorm3.c */ |
||||
|
/* I M P O R T A N T ! ! ! ! ! |
||||
|
|
||||
|
The definition below should be altered to ensure that integer |
||||
|
arithmetic is done on 32-bit words. It may need to be changed from int to |
||||
|
long on some platforms. The 32-bit requirement arises from the use of |
||||
|
a Uniform pseudo-random generator in part of the code, which assumes 32-bit |
||||
|
twos-complement arithmetic. In dire need, replace this generator with |
||||
|
another more suitable for the platform. The rest of the code assumes only |
||||
|
that signed integers up to a bit less than 2^31 can be handled. |
||||
|
*/ |
||||
|
|
||||
|
#define Sw int /* MUST define Sw as a 32-bit integer or longer */ |
||||
|
#define Sf double |
||||
|
|
||||
|
extern int gaussfaze; |
||||
|
extern int gaussmask; |
||||
|
extern double *gausssave; |
||||
|
extern double GScale; |
||||
|
|
||||
|
#define FastNorm ((--gaussfaze)?GScale*gausssave[gaussfaze]:fastnorm()) |
||||
|
|
||||
|
void initnorm(Sw seed, Sw quoll); |
||||
|
Sf fastnorm (void); |
||||
|
Sf c7rand(Sw*); |
||||
|
Sw irandm(Sw*); |
||||
|
unsigned Sw urandm(Sw*); |
||||
|
double adchi (double a, int *is); |
||||
|
double rgamma (double g, int *is); |
||||
|
Sf renormalize(void); |
||||
@ -0,0 +1,108 @@ |
|||||
|
/******************************************************************* |
||||
|
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
||||
|
for you (including mallocs and free's). Call the init routine for each fft size you will |
||||
|
be using. Then you can call the fft routines below which will make the fftlib library |
||||
|
call with the appropriate tables passed. When you are done with all fft's you can call |
||||
|
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
||||
|
with the same size, the extra calls will be ignored. So, you could make a macro to call |
||||
|
fftInit every time you call ffts. For example you could have someting like: |
||||
|
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
||||
|
*******************************************************************/ |
||||
|
|
||||
|
int fftInit(long M); |
||||
|
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
||||
|
/* INPUTS */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* private cosine and bit reversed tables */ |
||||
|
|
||||
|
void fftFree(); |
||||
|
// release storage for all private cosine and bit reversed tables |
||||
|
|
||||
|
void ffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void iffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place inverse complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void rffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place real fft on the rows of the input array */ |
||||
|
/* The result is the complex spectra of the positive frequencies */ |
||||
|
/* except the location for the first complex number contains the real */ |
||||
|
/* values for DC and Nyquest */ |
||||
|
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = real input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array in the following order */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
|
||||
|
void riffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place real ifft on the rows of the input array */ |
||||
|
/* data order as from rffts */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array in the following order */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = real output data array */ |
||||
|
|
||||
|
void rspectprod(float *data1, float *data2, float *outdata, long N); |
||||
|
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
||||
|
// two real values seperately from the complex ones. This routine does it correctly. |
||||
|
// the result can be stored in-place over one of the inputs |
||||
|
/* INPUTS */ |
||||
|
/* *data1 = input data array first spectra */ |
||||
|
/* *data2 = input data array second spectra */ |
||||
|
/* N = fft input size for both data1 and data2 */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array spectra */ |
||||
|
|
||||
|
|
||||
|
/* The following is FYI |
||||
|
|
||||
|
|
||||
|
Note that most of the fft routines require full matrices, ie Rsiz==Ncols |
||||
|
This is how I like to define a real matrix: |
||||
|
struct matrix { // real matrix |
||||
|
float *d; // pointer to data |
||||
|
long Nrows; // number of rows in the matrix |
||||
|
long Ncols; // number of columns in the matrix (can be less than Rsiz) |
||||
|
long Rsiz; // number of floats from one row to the next |
||||
|
}; |
||||
|
typedef struct matrix matrix; |
||||
|
|
||||
|
|
||||
|
|
||||
|
CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make |
||||
|
arrays that start exactly on a cache line start. |
||||
|
First we CACHEFILLMALLOC a void * (use this void * when free'ing), |
||||
|
then we set our array pointer equal to the properly cast CEILCACHELINE of this void * |
||||
|
example: |
||||
|
aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); |
||||
|
a = (float *) CEILCACHELINE(ainit); |
||||
|
... main body of code ... |
||||
|
free(aInit); |
||||
|
|
||||
|
To disable this alignment, set CACHELINESIZE to 1 |
||||
|
#define CACHELINESIZE 32 // Bytes per cache line |
||||
|
#define CACHELINEFILL (CACHELINESIZE-1) |
||||
|
#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) |
||||
|
#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) |
||||
|
*/ |
||||
|
|
||||
@ -0,0 +1,22 @@ |
|||||
|
/* Wallace generator for normally distributed random variates |
||||
|
Copyright Holger Vogt, 2008 |
||||
|
|
||||
|
Calling sequence: |
||||
|
initw(void); initialize using srand(seed) |
||||
|
double x = GaussWa; returns normally distributed random variate |
||||
|
|
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
|
||||
|
extern double *outgauss; /*output vector for user access */ |
||||
|
extern unsigned int variate_used; /* actual index of variate called by user */ |
||||
|
extern double ScaleGauss; /* scale factor, including chi square correction */ |
||||
|
|
||||
|
double NewWa(void); /* generate new pool, return outgauss[0] */ |
||||
|
|
||||
|
#define GaussWa ((--variate_used)?(outgauss[variate_used]*ScaleGauss):NewWa()) |
||||
|
|
||||
|
void initw(void); /* initialization of Wallace generator */ |
||||
|
|
||||
|
void PolarGauss(double* py1, double* py2); |
||||
@ -1,6 +1,6 @@ |
|||||
## Process this file with automake
|
## Process this file with automake
|
||||
|
|
||||
SUBDIRS = cmaths ni sparse poly deriv misc |
|
||||
DIST_SUBDIRS = cmaths ni sparse poly deriv misc |
|
||||
|
SUBDIRS = cmaths ni sparse poly deriv misc fft |
||||
|
DIST_SUBDIRS = cmaths ni sparse poly deriv misc fft |
||||
|
|
||||
MAINTAINERCLEANFILES = Makefile.in |
MAINTAINERCLEANFILES = Makefile.in |
||||
@ -0,0 +1,13 @@ |
|||||
|
## Process this file with automake to produce Makefile.in
|
||||
|
|
||||
|
noinst_LTLIBRARIES = libmathfft.la |
||||
|
|
||||
|
libmathfft_la_SOURCES = \
|
||||
|
fftext.c \
|
||||
|
fftlib.c \
|
||||
|
matlib.c |
||||
|
|
||||
|
|
||||
|
|
||||
|
AM_CPPFLAGS = -I$(top_srcdir)/src/include |
||||
|
MAINTAINERCLEANFILES = Makefile.in |
||||
@ -0,0 +1,37 @@ |
|||||
|
Subject: FFT for RISC 2.0 |
||||
|
To: macgifts@sumex-aim.stanford.edu |
||||
|
Enclosure: FFTs-for-RISC-2.sit |
||||
|
|
||||
|
Enclosed is a stuffit archive of version 2.0 of my 'C' source code fft library. |
||||
|
|
||||
|
Very-Fast Fourier Transform routines. Routines are provided for real and complex |
||||
|
forward and inverse 1d and 2d fourier transforms and 3d complex forward and inverse ffts. |
||||
|
I coded these to optimize execution speed on Risc processors like the PowerPC. |
||||
|
All fft sizes must still be a power of two. |
||||
|
Test programs based on the Numerical Recipes in C routines are provided. |
||||
|
Also included are some simple applications with source code which time the FFTs. |
||||
|
See the enclosed read me file for more information. |
||||
|
|
||||
|
Revision version 2.0: |
||||
|
Rewrote code to rely more on compiler optimization (and be less ugly.) |
||||
|
Removed restrictions on too small or too large ffts. |
||||
|
Provided a library extension that manages memory for cosine and bit |
||||
|
reversed counter tables. |
||||
|
Added 2d and 3d complex and 2d real ffts. |
||||
|
Speeded routines for data too large to fit in primary cache. |
||||
|
Changed most testing from Matlab to Numerical Recipes based (because its cheaper.) |
||||
|
Changed call parameters (watch out.) |
||||
|
Revision version 1.21: |
||||
|
line 126 of rfftTest.c corrected. |
||||
|
Revisions version 1.2: |
||||
|
I now store the Nyquest point of the real transform where the 0 for the DC term's |
||||
|
imaginary part used to be. !! WATCH OUT FOR THIS IF YOU USE rfft !! |
||||
|
Added the real inverse Fourier transform. |
||||
|
|
||||
|
Revisions version 1.1: |
||||
|
Re-arranged to put fft routines in a shared library and changed source file name to fftlib.c. |
||||
|
Removed some ugly optimizations that are no longer needed for CodeWarrier. |
||||
|
|
||||
|
This code is public domain, do anything you want to with it. |
||||
|
|
||||
|
[Moderators- This file should replace ffts-for-risc-121-c.hqx and can be included on any CD] |
||||
@ -0,0 +1,70 @@ |
|||||
|
This directory contains a public domain FFT library which was optimized |
||||
|
for speed on RISC processors such as the PowerPC. All ffts |
||||
|
use single precision floats, for double precision just use a |
||||
|
global search and replace to change float to double in all |
||||
|
source files. |
||||
|
Codewarrier Pro 1.0 project files are also supplied. |
||||
|
|
||||
|
** Warning ** Perform rigorous testing to |
||||
|
your own standards before using this code. |
||||
|
|
||||
|
(John Green) green_jt@vsdec.npt.nuwc.navy.mil |
||||
|
|
||||
|
files: |
||||
|
fftTiming |
||||
|
Application to time complex ffts |
||||
|
|
||||
|
rfftTiming |
||||
|
Application to time real ffts |
||||
|
|
||||
|
// Directory: fft libraries |
||||
|
|
||||
|
files: |
||||
|
|
||||
|
fftext.c |
||||
|
Library of in-place fast fourier transforms. Contains forward |
||||
|
and inverse complex and real transforms. The real fft's expect the |
||||
|
frequency domain data to have the real part of the fsamp/2 bin (which |
||||
|
has a 0 imaginary part) to be stored in the location for the imaginary |
||||
|
part of the DC bin (the DC bin of real data is also strictly real.) |
||||
|
You must first call an initialization routine fftInit before calling |
||||
|
the fft computation routines ffts, iffts, rffts and riffts. |
||||
|
The init routines malloc the memory to store the cosine and |
||||
|
bit reversed counter tables as well as initializing their values. |
||||
|
|
||||
|
fftlib.c |
||||
|
Lower level library of in-place fast fourier transforms. Same as fftext.c but you |
||||
|
need to manage the mallocs for the cosine and bit reversed tables yourself. |
||||
|
|
||||
|
|
||||
|
fft2d.c |
||||
|
Library of 2d and 3d complex and 2d real in-place fast fourier transforms. |
||||
|
The init routine fft2dInit must be called before using the 2d routines and |
||||
|
fft3dInit must be called before using the 3d routines. These init routines |
||||
|
will also call the appropriate 1d init routines in fftext.c |
||||
|
|
||||
|
matlib.c |
||||
|
Matrix transpose routines used by fft2d.c and complex vector multiply |
||||
|
for forming the product of two spectra. |
||||
|
|
||||
|
dxpose.c |
||||
|
Double precision matrix transpose for quick single precision complex transposing |
||||
|
|
||||
|
// Directory: timing code |
||||
|
This directory contains the source to fftTiming and rfftTiming |
||||
|
|
||||
|
// Directory: Numerical Recipes testing |
||||
|
This directory contains files used to test the various fft routines using |
||||
|
the Numerical Recipes in C routines as a baseline. These routines can be purchased |
||||
|
in PeeCee (after expanding you can move them to a Mac) format from: |
||||
|
http://cfata2.harvard.edu/numerical-recipes/ |
||||
|
Unfortunately Numerical Recipes defines its forward and inverse fft's backwards. |
||||
|
For complex fft's I just use their inverse fft as a forward one, but for real ffts |
||||
|
their forward fft followed by my inverse fft reverses the data. They also have ugly matrix |
||||
|
and tensor data types and start their indices with one, Fortran style, but these are |
||||
|
minor annoyances. |
||||
|
|
||||
|
// Directory: Matlab testing |
||||
|
This directory contains files to test fast 1d and 2d convolution with Matlab used to |
||||
|
verify the results. An example of using Matlab to test the fft library routines is |
||||
|
also given for the 2d real fft. |
||||
@ -0,0 +1,156 @@ |
|||||
|
/******************************************************************* |
||||
|
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
||||
|
for you (including mallocs and free's). Call the init routine for each fft size you will |
||||
|
be using. Then you can call the fft routines below which will make the fftlib library |
||||
|
call with the appropriate tables passed. When you are done with all fft's you can call |
||||
|
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
||||
|
with the same size, the extra calls will be ignored. So, you could make a macro to call |
||||
|
fftInit every time you call ffts. For example you could have someting like: |
||||
|
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
||||
|
*******************************************************************/ |
||||
|
#include <stdlib.h> |
||||
|
#include "fftlib.h" |
||||
|
#include "matlib.h" |
||||
|
#include "fftext.h" |
||||
|
|
||||
|
// pointers to storage of Utbl's and BRLow's |
||||
|
static float *UtblArray[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
||||
|
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
||||
|
static short *BRLowArray[8*sizeof(long)/2] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
||||
|
|
||||
|
int fftInit(long M){ |
||||
|
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
||||
|
/* INPUTS */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* private cosine and bit reversed tables */ |
||||
|
|
||||
|
int theError = 1; |
||||
|
/*** I did NOT test cases with M>27 ***/ |
||||
|
if ((M >= 0) && (M < 8*sizeof(long))){ |
||||
|
theError = 0; |
||||
|
if (UtblArray[M] == 0){ // have we not inited this size fft yet? |
||||
|
// init cos table |
||||
|
UtblArray[M] = (float *) malloc( (POW2(M)/4+1)*sizeof(float) ); |
||||
|
if (UtblArray[M] == 0) |
||||
|
theError = 2; |
||||
|
else{ |
||||
|
fftCosInit(M, UtblArray[M]); |
||||
|
} |
||||
|
if (M > 1){ |
||||
|
if (BRLowArray[M/2] == 0){ // init bit reversed table for cmplx fft |
||||
|
BRLowArray[M/2] = (short *) malloc( POW2(M/2-1)*sizeof(short) ); |
||||
|
if (BRLowArray[M/2] == 0) |
||||
|
theError = 2; |
||||
|
else{ |
||||
|
fftBRInit(M, BRLowArray[M/2]); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
if (M > 2){ |
||||
|
if (BRLowArray[(M-1)/2] == 0){ // init bit reversed table for real fft |
||||
|
BRLowArray[(M-1)/2] = (short *) malloc( POW2((M-1)/2-1)*sizeof(short) ); |
||||
|
if (BRLowArray[(M-1)/2] == 0) |
||||
|
theError = 2; |
||||
|
else{ |
||||
|
fftBRInit(M-1, BRLowArray[(M-1)/2]); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
}; |
||||
|
return theError; |
||||
|
} |
||||
|
|
||||
|
void fftFree(void){ |
||||
|
// release storage for all private cosine and bit reversed tables |
||||
|
long i1; |
||||
|
for (i1=8*sizeof(long)/2-1; i1>=0; i1--){ |
||||
|
if (BRLowArray[i1] != 0){ |
||||
|
free(BRLowArray[i1]); |
||||
|
BRLowArray[i1] = 0; |
||||
|
}; |
||||
|
}; |
||||
|
for (i1=8*sizeof(long)-1; i1>=0; i1--){ |
||||
|
if (UtblArray[i1] != 0){ |
||||
|
free(UtblArray[i1]); |
||||
|
UtblArray[i1] = 0; |
||||
|
}; |
||||
|
}; |
||||
|
} |
||||
|
|
||||
|
/************************************************* |
||||
|
The following calls are easier than calling to fftlib directly. |
||||
|
Just make sure fftlib has been called for each M first. |
||||
|
**************************************************/ |
||||
|
|
||||
|
void ffts(float *data, long M, long Rows){ |
||||
|
/* Compute in-place complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
ffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); |
||||
|
} |
||||
|
|
||||
|
void iffts(float *data, long M, long Rows){ |
||||
|
/* Compute in-place inverse complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
iffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); |
||||
|
} |
||||
|
|
||||
|
void rffts(float *data, long M, long Rows){ |
||||
|
/* Compute in-place real fft on the rows of the input array */ |
||||
|
/* The result is the complex spectra of the positive frequencies */ |
||||
|
/* except the location for the first complex number contains the real */ |
||||
|
/* values for DC and Nyquest */ |
||||
|
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = real input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array in the following order */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
rffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); |
||||
|
} |
||||
|
|
||||
|
void riffts(float *data, long M, long Rows){ |
||||
|
/* Compute in-place real ifft on the rows of the input array */ |
||||
|
/* data order as from rffts */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array in the following order */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = real output data array */ |
||||
|
riffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); |
||||
|
} |
||||
|
|
||||
|
void rspectprod(float *data1, float *data2, float *outdata, long N){ |
||||
|
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
||||
|
// two real values seperately from the complex ones. This routine does it correctly. |
||||
|
// the result can be stored in-place over one of the inputs |
||||
|
/* INPUTS */ |
||||
|
/* *data1 = input data array first spectra */ |
||||
|
/* *data2 = input data array second spectra */ |
||||
|
/* N = fft input size for both data1 and data2 */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array spectra */ |
||||
|
if(N>1){ |
||||
|
outdata[0] = data1[0] * data2[0]; // multiply the zero freq values |
||||
|
outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values |
||||
|
cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values |
||||
|
} |
||||
|
else{ |
||||
|
outdata[0] = data1[0] * data2[0]; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,106 @@ |
|||||
|
/******************************************************************* |
||||
|
This file extends the fftlib with calls to maintain the cosine and bit reversed tables |
||||
|
for you (including mallocs and free's). Call the init routine for each fft size you will |
||||
|
be using. Then you can call the fft routines below which will make the fftlib library |
||||
|
call with the appropriate tables passed. When you are done with all fft's you can call |
||||
|
fftfree to release the storage for the tables. Note that you can call fftinit repeatedly |
||||
|
with the same size, the extra calls will be ignored. So, you could make a macro to call |
||||
|
fftInit every time you call ffts. For example you could have someting like: |
||||
|
#define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); |
||||
|
*******************************************************************/ |
||||
|
|
||||
|
int fftInit(long M); |
||||
|
// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft |
||||
|
/* INPUTS */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* private cosine and bit reversed tables */ |
||||
|
|
||||
|
void fftFree(void); |
||||
|
// release storage for all private cosine and bit reversed tables |
||||
|
|
||||
|
void ffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void iffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place inverse complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void rffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place real fft on the rows of the input array */ |
||||
|
/* The result is the complex spectra of the positive frequencies */ |
||||
|
/* except the location for the first complex number contains the real */ |
||||
|
/* values for DC and Nyquest */ |
||||
|
/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = real input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array in the following order */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
|
||||
|
void riffts(float *data, long M, long Rows); |
||||
|
/* Compute in-place real ifft on the rows of the input array */ |
||||
|
/* data order as from rffts */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array in the following order */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = real output data array */ |
||||
|
|
||||
|
void rspectprod(float *data1, float *data2, float *outdata, long N); |
||||
|
// When multiplying a pair of spectra from rfft care must be taken to multiply the |
||||
|
// two real values seperately from the complex ones. This routine does it correctly. |
||||
|
// the result can be stored in-place over one of the inputs |
||||
|
/* INPUTS */ |
||||
|
/* *data1 = input data array first spectra */ |
||||
|
/* *data2 = input data array second spectra */ |
||||
|
/* N = fft input size for both data1 and data2 */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array spectra */ |
||||
|
|
||||
|
|
||||
|
// The following is FYI |
||||
|
|
||||
|
|
||||
|
//Note that most of the fft routines require full matrices, ie Rsiz==Ncols |
||||
|
//This is how I like to define a real matrix: |
||||
|
//struct matrix { // real matrix |
||||
|
// float *d; // pointer to data |
||||
|
// long Nrows; // number of rows in the matrix |
||||
|
// long Ncols; // number of columns in the matrix (can be less than Rsiz) |
||||
|
// long Rsiz; // number of floats from one row to the next |
||||
|
//}; |
||||
|
//typedef struct matrix matrix; |
||||
|
|
||||
|
|
||||
|
|
||||
|
// CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make |
||||
|
// arrays that start exactly on a cache line start. |
||||
|
// First we CACHEFILLMALLOC a void * (use this void * when free'ing), |
||||
|
// then we set our array pointer equal to the properly cast CEILCACHELINE of this void * |
||||
|
// example: |
||||
|
// aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); |
||||
|
// a = (float *) CEILCACHELINE(ainit); |
||||
|
// ... main body of code ... |
||||
|
// free(aInit); |
||||
|
// |
||||
|
// To disable this alignment, set CACHELINESIZE to 1 |
||||
|
//#define CACHELINESIZE 32 // Bytes per cache line |
||||
|
//#define CACHELINEFILL (CACHELINESIZE-1) |
||||
|
//#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) |
||||
|
//#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) |
||||
3175
src/maths/fft/fftlib.c
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,76 @@ |
|||||
|
#define MYRECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2) |
||||
|
|
||||
|
/* some useful conversions between a number and its power of 2 */ |
||||
|
#define LOG2(a) (MYRECIPLN2*log(a)) // floating point logarithm base 2 |
||||
|
#define POW2(m) ((unsigned long) 1 << (m)) // integer power of 2 for m<32 |
||||
|
|
||||
|
/******************************************************************* |
||||
|
lower level fft stuff called by routines in fftext.c and fft2d.c |
||||
|
*******************************************************************/ |
||||
|
|
||||
|
void fftCosInit(long M, float *Utbl); |
||||
|
/* Compute Utbl, the cosine table for ffts */ |
||||
|
/* of size (pow(2,M)/4 +1) */ |
||||
|
/* INPUTS */ |
||||
|
/* M = log2 of fft size */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *Utbl = cosine table */ |
||||
|
|
||||
|
void fftBRInit(long M, short *BRLow); |
||||
|
/* Compute BRLow, the bit reversed table for ffts */ |
||||
|
/* of size pow(2,M/2 -1) */ |
||||
|
/* INPUTS */ |
||||
|
/* M = log2 of fft size */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *BRLow = bit reversed counter table */ |
||||
|
|
||||
|
void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
||||
|
/* Compute in-place complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size (ex M=10 for 1024 point fft) */ |
||||
|
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
||||
|
/* *Utbl = cosine table */ |
||||
|
/* *BRLow = bit reversed counter table */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
||||
|
/* Compute in-place inverse complex fft on the rows of the input array */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array */ |
||||
|
/* M = log2 of fft size */ |
||||
|
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
||||
|
/* *Utbl = cosine table */ |
||||
|
/* *BRLow = bit reversed counter table */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array */ |
||||
|
|
||||
|
void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
||||
|
/* Compute in-place real fft on the rows of the input array */ |
||||
|
/* The result is the complex spectra of the positive frequencies */ |
||||
|
/* except the location for the first complex number contains the real */ |
||||
|
/* values for DC and Nyquest */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = real input data array */ |
||||
|
/* M = log2 of fft size */ |
||||
|
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
||||
|
/* *Utbl = cosine table */ |
||||
|
/* *BRLow = bit reversed counter table */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = output data array in the following order */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
|
||||
|
|
||||
|
void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); |
||||
|
/* Compute in-place real ifft on the rows of the input array */ |
||||
|
/* data order as from rffts1 */ |
||||
|
/* INPUTS */ |
||||
|
/* *ioptr = input data array in the following order */ |
||||
|
/* M = log2 of fft size */ |
||||
|
/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ |
||||
|
/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ |
||||
|
/* *Utbl = cosine table */ |
||||
|
/* *BRLow = bit reversed counter table */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *ioptr = real output data array */ |
||||
@ -0,0 +1,297 @@ |
|||||
|
/* a few routines from a vector/matrix library */ |
||||
|
#include "matlib.h" |
||||
|
|
||||
|
void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ |
||||
|
/* not in-place matrix transpose */ |
||||
|
/* INPUTS */ |
||||
|
/* *indata = input data array */ |
||||
|
/* iRsiz = offset to between rows of input data array */ |
||||
|
/* oRsiz = offset to between rows of output data array */ |
||||
|
/* Nrows = number of rows in input data array */ |
||||
|
/* Ncols = number of columns in input data array */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array */ |
||||
|
|
||||
|
float *irow; /* pointer to input row start */ |
||||
|
float *ocol; /* pointer to output col start */ |
||||
|
float *idata; /* pointer to input data */ |
||||
|
float *odata; /* pointer to output data */ |
||||
|
long RowCnt; /* row counter */ |
||||
|
long ColCnt; /* col counter */ |
||||
|
float T0; /* data storage */ |
||||
|
float T1; /* data storage */ |
||||
|
float T2; /* data storage */ |
||||
|
float T3; /* data storage */ |
||||
|
float T4; /* data storage */ |
||||
|
float T5; /* data storage */ |
||||
|
float T6; /* data storage */ |
||||
|
float T7; /* data storage */ |
||||
|
const long inRsizd1 = iRsiz; |
||||
|
const long inRsizd2 = 2*iRsiz; |
||||
|
const long inRsizd3 = inRsizd2+iRsiz; |
||||
|
const long inRsizd4 = 4*iRsiz; |
||||
|
const long inRsizd5 = inRsizd3+inRsizd2; |
||||
|
const long inRsizd6 = inRsizd4+inRsizd2; |
||||
|
const long inRsizd7 = inRsizd4+inRsizd3; |
||||
|
const long inRsizd8 = 8*iRsiz; |
||||
|
|
||||
|
ocol = outdata; |
||||
|
irow = indata; |
||||
|
for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){ |
||||
|
idata = irow; |
||||
|
odata = ocol; |
||||
|
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
||||
|
T0 = *idata; |
||||
|
T1 = *(idata+inRsizd1); |
||||
|
T2 = *(idata+inRsizd2); |
||||
|
T3 = *(idata+inRsizd3); |
||||
|
T4 = *(idata+inRsizd4); |
||||
|
T5 = *(idata+inRsizd5); |
||||
|
T6 = *(idata+inRsizd6); |
||||
|
T7 = *(idata+inRsizd7); |
||||
|
*odata = T0; |
||||
|
*(odata+1) = T1; |
||||
|
*(odata+2) = T2; |
||||
|
*(odata+3) = T3; |
||||
|
*(odata+4) = T4; |
||||
|
*(odata+5) = T5; |
||||
|
*(odata+6) = T6; |
||||
|
*(odata+7) = T7; |
||||
|
idata++; |
||||
|
odata += oRsiz; |
||||
|
} |
||||
|
irow += inRsizd8; |
||||
|
ocol += 8; |
||||
|
} |
||||
|
if (Nrows%8 != 0){ |
||||
|
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
||||
|
idata = irow++; |
||||
|
odata = ocol; |
||||
|
ocol += oRsiz; |
||||
|
for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){ |
||||
|
T0 = *idata; |
||||
|
*odata++ = T0; |
||||
|
idata += iRsiz; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ |
||||
|
/* not in-place complex float matrix transpose */ |
||||
|
/* INPUTS */ |
||||
|
/* *indata = input data array */ |
||||
|
/* iRsiz = offset to between rows of input data array */ |
||||
|
/* oRsiz = offset to between rows of output data array */ |
||||
|
/* Nrows = number of rows in input data array */ |
||||
|
/* Ncols = number of columns in input data array */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array */ |
||||
|
|
||||
|
float *irow; /* pointer to input row start */ |
||||
|
float *ocol; /* pointer to output col start */ |
||||
|
float *idata; /* pointer to input data */ |
||||
|
float *odata; /* pointer to output data */ |
||||
|
long RowCnt; /* row counter */ |
||||
|
long ColCnt; /* col counter */ |
||||
|
float T0r; /* data storage */ |
||||
|
float T0i; /* data storage */ |
||||
|
float T1r; /* data storage */ |
||||
|
float T1i; /* data storage */ |
||||
|
float T2r; /* data storage */ |
||||
|
float T2i; /* data storage */ |
||||
|
float T3r; /* data storage */ |
||||
|
float T3i; /* data storage */ |
||||
|
const long inRsizd1 = 2*iRsiz; |
||||
|
const long inRsizd1i = 2*iRsiz + 1; |
||||
|
const long inRsizd2 = 4*iRsiz; |
||||
|
const long inRsizd2i = 4*iRsiz + 1; |
||||
|
const long inRsizd3 = inRsizd2+inRsizd1; |
||||
|
const long inRsizd3i = inRsizd2+inRsizd1 + 1; |
||||
|
const long inRsizd4 = 8*iRsiz; |
||||
|
|
||||
|
ocol = outdata; |
||||
|
irow = indata; |
||||
|
for (RowCnt=Nrows/4; RowCnt>0; RowCnt--){ |
||||
|
idata = irow; |
||||
|
odata = ocol; |
||||
|
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
||||
|
T0r = *idata; |
||||
|
T0i = *(idata +1); |
||||
|
T1r = *(idata+inRsizd1); |
||||
|
T1i = *(idata+inRsizd1i); |
||||
|
T2r = *(idata+inRsizd2); |
||||
|
T2i = *(idata+inRsizd2i); |
||||
|
T3r = *(idata+inRsizd3); |
||||
|
T3i = *(idata+inRsizd3i); |
||||
|
*odata = T0r; |
||||
|
*(odata+1) = T0i; |
||||
|
*(odata+2) = T1r; |
||||
|
*(odata+3) = T1i; |
||||
|
*(odata+4) = T2r; |
||||
|
*(odata+5) = T2i; |
||||
|
*(odata+6) = T3r; |
||||
|
*(odata+7) = T3i; |
||||
|
idata+=2; |
||||
|
odata += 2*oRsiz; |
||||
|
} |
||||
|
irow += inRsizd4; |
||||
|
ocol += 8; |
||||
|
} |
||||
|
if (Nrows%4 != 0){ |
||||
|
for (ColCnt=Ncols; ColCnt>0; ColCnt--){ |
||||
|
idata = irow; |
||||
|
odata = ocol; |
||||
|
for (RowCnt=Nrows%4; RowCnt>0; RowCnt--){ |
||||
|
T0r = *idata; |
||||
|
T0i = *(idata+1); |
||||
|
*odata = T0r; |
||||
|
*(odata+1) = T0i; |
||||
|
odata+=2; |
||||
|
idata += 2*iRsiz; |
||||
|
} |
||||
|
irow+=2; |
||||
|
ocol += 2*oRsiz; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void cvprod(float *a, float *b, float *out, long N){ |
||||
|
/* complex vector product, can be in-place */ |
||||
|
/* product of complex vector *a times complex vector *b */ |
||||
|
/* INPUTS */ |
||||
|
/* N vector length */ |
||||
|
/* *a complex vector length N complex numbers */ |
||||
|
/* *b complex vector length N complex numbers */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *out complex vector length N */ |
||||
|
|
||||
|
long OutCnt; /* output counter */ |
||||
|
float A0R; /* A storage */ |
||||
|
float A0I; /* A storage */ |
||||
|
float A1R; /* A storage */ |
||||
|
float A1I; /* A storage */ |
||||
|
float A2R; /* A storage */ |
||||
|
float A2I; /* A storage */ |
||||
|
float A3R; /* A storage */ |
||||
|
float A3I; /* A storage */ |
||||
|
float B0R; /* B storage */ |
||||
|
float B0I; /* B storage */ |
||||
|
float B1R; /* B storage */ |
||||
|
float B1I; /* B storage */ |
||||
|
float B2R; /* B storage */ |
||||
|
float B2I; /* B storage */ |
||||
|
float B3R; /* B storage */ |
||||
|
float B3I; /* B storage */ |
||||
|
float T0R; /* TMP storage */ |
||||
|
float T0I; /* TMP storage */ |
||||
|
float T1R; /* TMP storage */ |
||||
|
float T1I; /* TMP storage */ |
||||
|
float T2R; /* TMP storage */ |
||||
|
float T2I; /* TMP storage */ |
||||
|
float T3R; /* TMP storage */ |
||||
|
float T3I; /* TMP storage */ |
||||
|
|
||||
|
if (N>=4){ |
||||
|
A0R = *a; |
||||
|
B0R = *b; |
||||
|
A0I = *(a +1); |
||||
|
B0I = *(b +1); |
||||
|
A1R = *(a +2); |
||||
|
B1R = *(b +2); |
||||
|
A1I = *(a +3); |
||||
|
B1I = *(b +3); |
||||
|
A2R = *(a +4); |
||||
|
B2R = *(b +4); |
||||
|
A2I = *(a +5); |
||||
|
B2I = *(b +5); |
||||
|
A3R = *(a +6); |
||||
|
B3R = *(b +6); |
||||
|
A3I = *(a +7); |
||||
|
B3I = *(b +7); |
||||
|
T0R = A0R * B0R; |
||||
|
T0I = (A0R * B0I); |
||||
|
T1R = A1R * B1R; |
||||
|
T1I = (A1R * B1I); |
||||
|
T2R = A2R * B2R; |
||||
|
T2I = (A2R * B2I); |
||||
|
T3R = A3R * B3R; |
||||
|
T3I = (A3R * B3I); |
||||
|
T0R -= (A0I * B0I); |
||||
|
T0I = A0I * B0R + T0I; |
||||
|
T1R -= (A1I * B1I); |
||||
|
T1I = A1I * B1R + T1I; |
||||
|
T2R -= (A2I * B2I); |
||||
|
T2I = A2I * B2R + T2I; |
||||
|
T3R -= (A3I * B3I); |
||||
|
T3I = A3I * B3R + T3I; |
||||
|
for (OutCnt=N/4-1; OutCnt > 0; OutCnt--){ |
||||
|
a += 8; |
||||
|
b += 8; |
||||
|
A0R = *a; |
||||
|
B0R = *b; |
||||
|
A0I = *(a +1); |
||||
|
B0I = *(b +1); |
||||
|
A1R = *(a +2); |
||||
|
B1R = *(b +2); |
||||
|
A1I = *(a +3); |
||||
|
B1I = *(b +3); |
||||
|
A2R = *(a +4); |
||||
|
B2R = *(b +4); |
||||
|
A2I = *(a +5); |
||||
|
B2I = *(b +5); |
||||
|
A3R = *(a +6); |
||||
|
B3R = *(b +6); |
||||
|
A3I = *(a +7); |
||||
|
B3I = *(b +7); |
||||
|
*out = T0R; |
||||
|
*(out +1) = T0I; |
||||
|
*(out +2) = T1R; |
||||
|
*(out +3) = T1I; |
||||
|
*(out +4) = T2R; |
||||
|
*(out +5) = T2I; |
||||
|
*(out +6) = T3R; |
||||
|
*(out +7) = T3I; |
||||
|
T0R = A0R * B0R; |
||||
|
T0I = (A0R * B0I); |
||||
|
T1R = A1R * B1R; |
||||
|
T1I = (A1R * B1I); |
||||
|
T2R = A2R * B2R; |
||||
|
T2I = (A2R * B2I); |
||||
|
T3R = A3R * B3R; |
||||
|
T3I = (A3R * B3I); |
||||
|
T0R -= (A0I * B0I); |
||||
|
T0I = A0I * B0R + T0I; |
||||
|
T1R -= (A1I * B1I); |
||||
|
T1I = A1I * B1R + T1I; |
||||
|
T2R -= (A2I * B2I); |
||||
|
T2I = A2I * B2R + T2I; |
||||
|
T3R -= (A3I * B3I); |
||||
|
T3I = A3I * B3R + T3I; |
||||
|
out += 8; |
||||
|
} |
||||
|
a += 8; |
||||
|
b += 8; |
||||
|
*out = T0R; |
||||
|
*(out +1) = T0I; |
||||
|
*(out +2) = T1R; |
||||
|
*(out +3) = T1I; |
||||
|
*(out +4) = T2R; |
||||
|
*(out +5) = T2I; |
||||
|
*(out +6) = T3R; |
||||
|
*(out +7) = T3I; |
||||
|
out += 8; |
||||
|
} |
||||
|
for (OutCnt=N%4; OutCnt > 0; OutCnt--){ |
||||
|
A0R = *a++; |
||||
|
B0R = *b++; |
||||
|
A0I = *a++; |
||||
|
B0I = *b++; |
||||
|
T0R = A0R * B0R; |
||||
|
T0I = (A0R * B0I); |
||||
|
T0R -= (A0I * B0I); |
||||
|
T0I = A0I * B0R + T0I; |
||||
|
*out++ = T0R; |
||||
|
*out++ = T0I; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
/* a few routines from a vector/matrix library */ |
||||
|
|
||||
|
void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); |
||||
|
/* not in-place matrix transpose */ |
||||
|
/* INPUTS */ |
||||
|
/* *indata = input data array */ |
||||
|
/* iRsiz = offset to between rows of input data array */ |
||||
|
/* oRsiz = offset to between rows of output data array */ |
||||
|
/* Nrows = number of rows in input data array */ |
||||
|
/* Ncols = number of columns in input data array */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array */ |
||||
|
|
||||
|
void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); |
||||
|
/* not in-place complex matrix transpose */ |
||||
|
/* INPUTS */ |
||||
|
/* *indata = input data array */ |
||||
|
/* iRsiz = offset to between rows of input data array */ |
||||
|
/* oRsiz = offset to between rows of output data array */ |
||||
|
/* Nrows = number of rows in input data array */ |
||||
|
/* Ncols = number of columns in input data array */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *outdata = output data array */ |
||||
|
|
||||
|
void cvprod(float *a, float *b, float *out, long N); |
||||
|
/* complex vector product, can be in-place */ |
||||
|
/* product of complex vector *a times complex vector *b */ |
||||
|
/* INPUTS */ |
||||
|
/* N vector length */ |
||||
|
/* *a complex vector length N complex numbers */ |
||||
|
/* *b complex vector length N complex numbers */ |
||||
|
/* OUTPUTS */ |
||||
|
/* *out complex vector length N */ |
||||
17882
visualc/vngspice.vcproj
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue