DEFUN_DLD ( sinewave_indicator, args, nargout )
{
octave_value_list retval_list ;
int nargin = args.length () ;
// check the input argument
if ( nargin != 1 )
{
error ("Invalid argument. Input is a single price vector.") ;
return retval_list ;
}
int vec_length = args(0).length () ;
if ( vec_length < 50 )
{
error ("Invalid argument. The input price vector must contain at least 50 elements.") ;
return retval_list ;
}
if ( error_state )
{
error ("Invalid argument. Input is a single price vector.") ;
return retval_list ;
}
// end of input checking
// inputs
ColumnVector price = args(0).column_vector_value () ;
// outputs
ColumnVector sinewave( vec_length ) ;
ColumnVector sinewave_lead_1( vec_length ) ;
ColumnVector smoothperiod_out( vec_length ) ;
ColumnVector dcphase_vec( vec_length ) ;
ColumnVector sumperiod( vec_length ) ;
ColumnVector sum_period( vec_length ) ;
ColumnVector deltaphase( vec_length ) ;
// Declarations for calculations of period, phase & sine wave measurements
ColumnVector smooth( vec_length ) ;
ColumnVector period( vec_length ) ;
ColumnVector smoothperiod( vec_length ) ;
ColumnVector detrender( vec_length ) ;
ColumnVector Q1( vec_length ) ;
ColumnVector I1( vec_length ) ;
ColumnVector jI( vec_length ) ;
ColumnVector jQ( vec_length ) ;
ColumnVector I2( vec_length ) ;
ColumnVector Q2( vec_length ) ;
ColumnVector sI2( vec_length ) ;
ColumnVector sQ2( vec_length ) ;
ColumnVector Re( vec_length ) ;
ColumnVector Im( vec_length ) ;
ColumnVector sRe( vec_length ) ;
ColumnVector sIm( vec_length ) ;
int dcperiod ;
double realpart ;
double imagpart ;
double dcphase ;
double sum_deltaphase ;
int count ;
// unrolled loop to fill the first 5 elements of above calculation vectors ( unrolled for speed optimisation )
sinewave(0) = 0.0 ; sinewave(1) = 0.0 ; sinewave(2) = 0.0 ; sinewave(3) = 0.0 ; sinewave(4) = 0.0 ;
sinewave_lead_1(0) = 0.0 ; sinewave_lead_1(1) = 0.0 ; sinewave_lead_1(2) = 0.0 ; sinewave_lead_1(3) = 0.0 ; sinewave_lead_1(4) = 0.0 ;
smoothperiod_out(0) = 0.0 ; smoothperiod_out(1) = 0.0 ; smoothperiod_out(2) = 0.0 ; smoothperiod_out(3) = 0.0 ; smoothperiod_out(4) = 0.0 ;
dcphase_vec(0) = 0.0 ; dcphase_vec(1) = 0.0 ; dcphase_vec(2) = 0.0 ; dcphase_vec(3) = 0.0 ; dcphase_vec(4) = 0.0 ;
smooth(0) = 0.0 ; smooth(1) = 0.0 ; smooth(2) = 0.0 ; smooth(3) = 0.0 ; smooth(4) = 0.0 ;
period(0) = 0.0 ; period(1) = 0.0 ; period(2) = 0.0 ; period(3) = 0.0 ; period(4) = 0.0 ;
smoothperiod(0) = 0.0 ; smoothperiod(1) = 0.0 ; smoothperiod(2) = 0.0 ; smoothperiod(3) = 0.0 ; smoothperiod(4) = 0.0 ;
detrender(0) = 0.0 ; detrender(1) = 0.0 ; detrender(2) = 0.0 ; detrender(3) = 0.0 ; detrender(4) = 0.0 ;
Q1(0) = 0.0 ; Q1(1) = 0.0 ; Q1(2) = 0.0 ; Q1(3) = 0.0 ; Q1(4) = 0.0 ;
I1(0) = 0.0 ; I1(1) = 0.0 ; I1(2) = 0.0 ; I1(3) = 0.0 ; I1(4) = 0.0 ;
jI(0) = 0.0 ; jI(1) = 0.0 ; jI(2) = 0.0 ; jI(3) = 0.0 ; jI(4) = 0.0 ;
jQ(0) = 0.0 ; jQ(1) = 0.0 ; jQ(2) = 0.0 ; jQ(3) = 0.0 ; jQ(4) = 0.0 ;
I2(0) = 0.0 ; I2(1) = 0.0 ; I2(2) = 0.0 ; I2(3) = 0.0 ; I2(4) = 0.0 ;
Q2(0) = 0.0 ; Q2(1) = 0.0 ; Q2(2) = 0.0 ; Q2(3) = 0.0 ; Q2(4) = 0.0 ;
sI2(0) = 0.0 ; sI2(1) = 0.0 ; sI2(2) = 0.0 ; sI2(3) = 0.0 ; sI2(4) = 0.0 ;
sQ2(0) = 0.0 ; sQ2(1) = 0.0 ; sQ2(2) = 0.0 ; sQ2(3) = 0.0 ; sQ2(4) = 0.0 ;
Re(0) = 0.0 ; Re(1) = 0.0 ; Re(2) = 0.0 ; Re(3) = 0.0 ; Re(4) = 0.0 ;
Im(0) = 0.0 ; Im(1) = 0.0 ; Im(2) = 0.0 ; Im(3) = 0.0 ; Im(4) = 0.0 ;
sRe(0) = 0.0 ; sRe(1) = 0.0 ; sRe(2) = 0.0 ; sRe(3) = 0.0 ; sRe(4) = 0.0 ;
sIm(0) = 0.0 ; sIm(1) = 0.0 ; sIm(2) = 0.0 ; sIm(3) = 0.0 ; sIm(4) = 0.0 ;
for ( octave_idx_type ii (5) ; ii < vec_length ; ii++ ) // Start the main loop
{
// smooth the price for hilbert calculations
smooth(ii) = (4.0 * price(ii) + 3.0 * price(ii-1) + 2.0 * price(ii-2) + price(ii-3) ) / 10.0 ;
// Detrend the input
detrender(ii) = (0.0962 * smooth(ii) + 0.5769 * smooth(ii-2) - 0.5769 * smooth(ii-4) - 0.0962 * smooth(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
// Compute InPhase and Quadrature components
Q1(ii) = (0.0962 * detrender(ii) + 0.5769 * detrender(ii-2) - 0.5769 * detrender(ii-4) - 0.0962 * detrender(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
I1(ii) = detrender(ii-3) ;
// Advance the phase of I1 and Q1 by 90 degrees
jI(ii) = (0.0962 * I1(ii) + 0.5769 * I1(ii-2) - 0.5769 * I1(ii-4) - 0.0962 * I1(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
jQ(ii) = (0.0962 * Q1(ii) + 0.5769 * Q1(ii-2) - 0.5769 * Q1(ii-4) - 0.0962 * Q1(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
// Phasor addition for 3 bar averaging
I2(ii) = I1(ii) - jQ(ii) ;
Q2(ii) = Q1(ii) + jI(ii) ;
// Smooth the I and Q components before applying the discriminator
sI2(ii) = 0.2 * I2(ii) + 0.8 * sI2(ii-1) ;
sQ2(ii) = 0.2 * Q2(ii) + 0.8 * sQ2(ii-1) ;
// Homodyne Discriminator
Re(ii) = sI2(ii) * sI2(ii-1) + sQ2(ii) * sQ2(ii-1) ;
Im(ii) = sI2(ii) * sQ2(ii-1) - sQ2(ii) * sI2(ii-1) ;
sRe(ii) = 0.2 * Re(ii) + 0.8 * sRe(ii-1) ;
sIm(ii) = 0.2 * Im(ii) + 0.8 * sIm(ii-1) ;
if ( (sIm(ii) > 0.0 || sIm(ii) < 0.0) && (sRe(ii) > 0.0 || sRe(ii) < 0.0) )
{
period(ii) = 360.0 / ( ((atan(sIm(ii) / sRe(ii))) * 180.0) / PI ) ;
}
else
{
period(ii) = period(ii-1) ;
}
if ( period(ii) > 1.5 * period(ii-1) )
{
period(ii) = 1.5 * period(ii-1) ;
}
if ( period(ii) < 0.67 * period(ii-1) )
{
period(ii) = 0.67 * period(ii-1) ;
}
if ( period(ii) < 6.0 )
{
period(ii) = 6.0 ;
}
if ( period(ii) > 50.0 )
{
period(ii) = 50.0 ;
}
period(ii) = 0.2 * period(ii) + 0.8 * period(ii-1) ;
smoothperiod(ii) = 0.33 * period(ii) + 0.67 * smoothperiod(ii-1) ;
// Compute Dominant Cycle
dcperiod = int ( smoothperiod(ii) + 0.5 ) ;
realpart = 0.0 ;
imagpart = 0.0 ;
dcphase = 0.0 ;
for ( octave_idx_type jj (0) ; jj <= ( dcperiod - 1 ) ; jj++ )
{
realpart += sin( PI/180.0 * 360.0 * jj / dcperiod ) * ( smooth(ii-jj) ) ;
imagpart += cos( PI/180.0 * 360.0 * jj / dcperiod ) * ( smooth(ii-jj) ) ;
}
if ( fabs( imagpart ) > 0.0 )
{
dcphase = atan( realpart / imagpart ) * 180.0 / PI ;
}
else if ( fabs( imagpart ) < 0.001 )
{
if ( realpart < 0.0 )
{
dcphase -= 90.0 ;
}
else if ( realpart > 0.0 )
{
dcphase += 90.0 ;
}
}
dcphase += 90.0 ;
// Compensate for one bar lag of the 4 bar weighted moving average
dcphase += 360.0 / smoothperiod(ii) ;
if ( imagpart < 0.0 )
dcphase += 180.0 ;
if ( dcphase > 315.0 )
dcphase -= 360.0 ;
// phase output
dcphase_vec(ii) = dcphase ;
//Now compute a differential phase, resolve phase wraparound, and limit delta phase errors
deltaphase(ii) = dcphase_vec(ii) - dcphase_vec(ii-1) ;
if ( dcphase_vec(ii-1) > 270.0 && dcphase_vec(ii) < 90.0 )
{
deltaphase(ii) = 360.0 - dcphase_vec(ii-1) + dcphase_vec(ii) ;
}
if ( deltaphase(ii) < 1.0 )
{
deltaphase(ii) = 1.0 ;
}
if ( deltaphase(ii) > 60.0 )
{
deltaphase(ii) = 60.0 ;
}
// Sum Deltaphases to reach 360 degrees. The sum is the instantaneous period.
sum_period(ii) = 0.0 ;
sum_deltaphase = 0.0 ;
count = 0 ;
while ( sum_deltaphase < 360.0 )
{
sum_deltaphase += deltaphase(ii-count) ;
count ++ ;
sum_period(ii) = count ;
}
// Resolve Instantaneous Period errors and smooth
if ( sum_period(ii) == 0.0 )
{
sum_period(ii) = sum_period(ii-1) ;
}
sumperiod(ii) = 0.25 * sum_period(ii) + 0.75 * sum_period(ii-1) ;
// sinewave output
sinewave(ii) = sin( dcphase * PI / 180.0 ) ;
// one bar leading function
sinewave_lead_1(ii) = sin( ( dcphase + 360.0 / smoothperiod(ii) ) * PI / 180.0 ) ;
// period output
smoothperiod_out(ii) = floor ( smoothperiod(ii) + 0.5 ) ;
} // end of main ii loop
retval_list(3) = dcphase_vec ;
retval_list(2) = smoothperiod_out ;
retval_list(1) = sinewave_lead_1 ;
retval_list(0) = sinewave ;
return retval_list ;
} // end of function
This is a straightforward conversion of the code available from here. A nice intro to how it can be used is here, and Ehlers' own website can be found here.
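By way of illustration, a minimal usage sketch, assuming the code above is saved as sinewave_indicator.cc and compiled from the Octave prompt with mkoctfile, might be:
% mkoctfile sinewave_indicator.cc   % compile once
price = cumsum( randn( 200 , 1 ) ) + 100 ; % dummy price column vector, more than 50 elements long
[ sinewave , sinewave_lead_1 , smoothperiod , dcphase ] = sinewave_indicator( price ) ;
% purely illustrative: flag bars where the sinewave is above its one bar leading function
flag = sinewave > sinewave_lead_1 ;
plot( sinewave , 'c' , sinewave_lead_1 , 'r' ) ;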
"Trading is statistics and time series analysis." This blog details my progress in developing a systematic trading system for use on the futures and forex markets, with discussion of the various indicators and other inputs used in the creation of the system. Also discussed are some of the issues/problems encountered during this development process. Within the blog posts there are links to other web pages that are/have been useful to me.
Pages
▼
Monday, 21 December 2015
John Ehlers' Sinewave Indicator Code
A reader recently inquired about my use of this indicator and so below I provide my Octave C++ .oct version that I have been using for the past few years.
Sunday, 6 December 2015
Denoising Autoencoder MATLAB/Octave Code
Following on from my last post I have been looking for Octave code for the denoising autoencoder to avoid reinventing the wheel and writing it myself from scratch, and luckily I have found two options. The first is a tutorial on autoencoders, by a Piotr Mirowski, which has a link to a Github page with code. The second is a toolbox for dimensionality reduction, by Laurens van der Maaten, which has autoencoders as one of its options.
As of now I'm not sure which one I'll use, and perhaps I might yet write my own code heavily reusing elements from both of the above. More in due course.
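For readers unfamiliar with the idea, here is a minimal, illustrative sketch of what a denoising autoencoder training loop does (corrupt the input, then learn to reconstruct the clean input); it is not taken from either of the above packages and uses placeholder data throughout.
X = randn( 500 , 10 ) ; % placeholder data: 500 examples, 10 features
[ n_obs , n_vis ] = size( X ) ;
n_hid = 5 ; lr = 0.1 ; corruption = 0.3 ;
W = 0.01 * randn( n_vis , n_hid ) ; b_h = zeros( 1 , n_hid ) ; b_v = zeros( 1 , n_vis ) ;
for epoch = 1 : 100
X_tilde = X .* ( rand( size( X ) ) > corruption ) ; % masking noise corruption
H = 1 ./ ( 1 + exp( -( X_tilde * W + repmat( b_h , n_obs , 1 ) ) ) ) ; % encode
X_hat = H * W' + repmat( b_v , n_obs , 1 ) ; % linear decode with tied weights
err = X_hat - X ; % reconstruction error measured against the *clean* input
dH = ( err * W ) .* H .* ( 1 - H ) ; % backprop through the sigmoid encoder
W = W - lr * ( err' * H + X_tilde' * dH ) / n_obs ;
b_v = b_v - lr * mean( err ) ;
b_h = b_h - lr * mean( dH ) ;
end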
Sunday, 22 November 2015
Recent Readings and New Directions.
Since my last post I have been doing a fair bit of online research and fortunately I have discovered the following papers, which mesh nicely with what I am trying to do with Conditional Restricted Boltzmann Machines to model time series:-
Deep Learning Architecture for Univariate Time Series Forecasting
Temporal Autoencoding Restricted Boltzmann Machine
Temporal Autoencoding Improves Generative Models of Time Series
Deep Modelling Complex Couplings Within Financial Markets
Predicting Time Series of Railway Speed Restrictions with Time Dependent Machine Learning Techniques
The big takeaway from these readings is, firstly, to explicitly model the autoregressive components via a Denoising Autoencoder and, secondly, not to model a univariate time series in isolation, but to model a multivariate time series where the "other" time series are either informative measures taken from the univariate series itself (informative indicators?) and/or related time series, e.g. in forex one could use concepts similar to fractional product inefficiency, or on all markets the concept of Intermarket analysis.
For the nearest future I have therefore set myself the task of adapting my CRBM code to include the denoising autoencoder and to investigate the multivariate time series approach.
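As a concrete, if entirely illustrative, sketch of the multivariate idea, the matrix below augments a univariate price series with a couple of derived "informative" series; the particular features chosen here are placeholders only.
price = cumsum( randn( 1000 , 1 ) ) + 100 ; % placeholder price series
rets = [ 0 ; diff( log( price ) ) ] ; % log returns
vol = zeros( size( rets ) ) ;
for ii = 21 : length( rets )
vol( ii ) = std( rets( ii-20 : ii ) ) ; % rolling 21 bar volatility
end
mom = [ zeros( 10 , 1 ) ; price( 11 : end ) - price( 1 : end-10 ) ] ; % 10 bar momentum
multivariate_data = [ rets , vol , mom ] ; % one row per bar, one column per series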
Tuesday, 13 October 2015
Giving up on Runge-Kutta Methods (for now?)
Over the last few weeks I have been looking at using Runge-Kutta methods for the creation of features, but I have decided to give up on this for now, simply because I think I have found a better way to accomplish what I want. I was alerted to this possible approach by this post over at http://dsp.stackexchange.com/ and, following up on this, I remembered that a few years ago I coded up John Ehlers' Linear Prediction Filter (the white paper for which might still be available here); my crudely transcribed code is given below:
DEFUN_DLD (linearpredict, args, , "Help String")
{
octave_value retval;
ColumnVector a = args(0).column_vector_value (); // The input price
ColumnVector b(a); // This will be the output column vector returned to Octave by "retval"
const int n = args(1).int_value();
int lngth = 10;
double g[30];
int zz;
for (zz = 0; zz < 30; zz++)
g[zz] = 0.0;
double sigPredict[30];
for (zz = 0; zz < 30; zz++)
sigPredict[zz] = 0.0;
double sigPower = 0.0;
double mu = 0.0;
double xBar = 0.0;
int jj = 0;
for (octave_idx_type ii (10); ii < a.length (); ii++) // Start the main loop
{
//Compute the 10 bar average of signal power for the normalization factor
sigPower = 0.0;
for (jj = 1; jj <= lngth; jj++)
sigPower = sigPower + a(ii - jj) * a(ii - jj) / lngth;
//Compute the convergence factor
if (sigPower > 0)
mu = 0.25 / (sigPower * 10);
//Compute signal estimate
xBar = 0;
for (jj = 1; jj <= lngth; jj++)
xBar = xBar + a(ii - jj) * g[jj];
//Compute gain coefficients
for (jj = 1; jj <= lngth; jj++)
g[jj] = g[jj] + (mu * (a(ii) - xBar) * a(ii - jj));
//Compute signal prediction waveform
for (jj = 0; jj <= lngth; jj++)
sigPredict[jj] = a(ii - (10 - jj));
//Extend signal prediction into the future
int kk = 0;
for (jj = lngth + 1; jj <= lngth + 5; jj++)
{
sigPredict[jj] = 0;
for (kk = 1; kk <= lngth; kk++)
sigPredict[jj] = sigPredict[jj] + sigPredict[jj - kk] * g[kk];
}
b(ii) = sigPredict[lngth + n];
}
retval = b; // Assign the output column vector to the return value
return retval; // Return the output to Octave
}
which is very similar in concept to Burg's method. I think some application of these methods shows more promise than concentrating on my naive implementation of Runge-Kutta.
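A usage sketch, assuming the above is saved as linearpredict.cc and compiled with mkoctfile; the second argument (an integer from 1 to 5, judging by the extension loop) selects how many bars ahead the returned prediction is taken from.
price = cumsum( randn( 500 , 1 ) ) + 100 ; % dummy price column vector
pred = linearpredict( price , 1 ) ; % 1 bar ahead prediction
plot( price , 'c' , shift( pred , 1 ) , 'r' ) ; % shift by one bar to line the prediction up with price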
Monday, 28 September 2015
Runge-Kutta Example and Code
Following on from my last post I thought I would, as a first step, code up a "straightforward" Runge-Kutta function and show how to deal with the fact that there is no "magic mathematical formula" to calculate the slopes that are an integral part of Runge-Kutta.
My approach is to fit a quadratic function to the last n_bars of price and take the slope of this via my Savitzky-Golay filter convolution code, and in doing so the Runge-Kutta value k1 can easily be obtained. The extrapolation beyond the k1 point to the points k2 and k3 is, in effect, trying to fit to points that have a half bar lead over the last available price. To accommodate this I use the last n_bar - 1 points of a 2 bar simple moving average plus the "position" of points k2 and k3 to calculate the slopes at k2 and k3. A 2 bar simple moving average is used because this has a half bar lag and is effectively an interpolation of the known prices at the "half bar intervals," and therefore points k2 and k3 are one h step ahead of the last half bar interval. The k4 point is again simply calculated directly from prices plus the half bar interval projection from point k3. If all this seems confusing, hopefully the Octave code below will clear things up.
clear all
% create the raw price series
period = input( 'Period? ' ) ;
sideways = zeros( 1 , 2*period ) ;
uwr = ( 1 : 1 : 2*period ) .* 1.333 / period ; uwr_end = uwr(end) ;
unr = ( 1 : 1 : 2*period ) .* 4 / period ; unr_end = unr(end) ;
dwr = ( 1 : 1 : 2*period ) .* -1.333 / period ; dwr_end = dwr(end) ;
dnr = ( 1 : 1 : 2*period ) .* -4 / period ; dnr_end = dnr(end) ;
trends = [ sideways , uwr , unr.+uwr_end , sideways.+uwr_end.+unr_end , dnr.+uwr_end.+unr_end , dwr.+uwr_end.+unr_end.+dnr_end , sideways ] .+ 2 ;
noise = randn( 1 , length(trends) ) .* 0.0 ;
price = sinewave( length( trends ) , period ) .+ trends .+ noise ;
ma_2 = sma( price , 2 ) ;
% regress over 'n_bar' bars
n_bar = 9 ;
% and a 'p' order fit
p = 2 ;
% get the relevant coefficients
slope_coeffs = generalised_sgolay_filter_coeffs( n_bar , p , 1 ) ;
% container for 1 bar ahead projection
projection_1_bar = zeros( 1 , length( price ) ) ;
for ii = n_bar : length( price )
% calculate k1 value i.e. values at price(ii), the most recent price
k1 = price( ii-(n_bar-1) : ii ) * slope_coeffs( : , end ) ;
projection_of_point_k2 = price(ii) + k1 / 2 ;
% calculate k2 value
k2 = [ ma_2( ii-(n_bar-2) : ii ) ; projection_of_point_k2 ]' * slope_coeffs( : , end ) ;
projection_of_point_k3 = price(ii) + k2 / 2 ;
% calculate k3 value
k3 = [ ma_2( ii-(n_bar-2) : ii ) ; projection_of_point_k3 ]' * slope_coeffs( : , end ) ;
projection_of_point_k4 = price(ii) + k3 / 2 ;
% calculate k4 value
k4 = [ price( ii-(n_bar-2) : ii ) , projection_of_point_k4 ] * slope_coeffs( : , end ) ;
% the runge-kutta weighted moving average
projection_1_bar(ii) = price(ii) + ( k1 + 2 * ( k2 + k3 ) + k4 ) / 6 ;
end
% shift for plotting
projection_1_bar = shift( projection_1_bar , 1 ) ;
projection_1_bar( : , 1:n_bar ) = price( : , 1:n_bar ) ;
plot( price , 'c' , projection_1_bar , 'r' ) ;
This code produces a plot like this, without noise, and this with noise ( line 12 of the code ).
The cyan line is the underlying price and the red line is the Runge-Kutta 1 bar ahead projection. As can be seen, when the price is moving in rather straight lines the projection is quite accurate, however, at turnings there is some overshoot, which is to be expected. I'm not unduly concerned about this overshoot as my intention is simply to get the various k values as features, but this overshoot does have some implications which I will discuss in a future post.
Friday, 25 September 2015
Runge-Kutta Methods
As stated in my previous post I have been focusing on getting some meaningful features as possible inputs to my machine learning based trading system, and one of the possible ideas that has caught my attention is using Runge-Kutta methods to project ( otherwise known as "guessing" ) future price evolution. I have used this sort of approach before in the construction of my perfect oscillator ( links here, here, here and here ) but I deem this approach unsuitable for the type of "guessing" I want to do for my mfe-mae indicator as I need to "guess" rolling maximum highs and minimum lows, which quite often are actually flat for consecutive price bars. In fact, what I want to do is predict/guess upper and lower bounds for future price evolution, which might actually turn out to be a more tractable problem than predicting price itself.
The above wiki link to Runge-Kutta methods is a pretty dense mathematical read and readers may be wondering how approximation of solutions to ordinary differential equations can possibly relate to my stated aim, however the following links visualise Runge-Kutta in an accessible way:
- https://jmckennonmth212s09.wordpress.com/2009/02/03/runge-kutta-method-vs-euler-method/
- https://wigglewave.wordpress.com/2012/11/09/intuitive-description-of-runge-kutta-integration/
- https://www.youtube.com/watch?v=0iRZqmzs9dk
- http://buttersblog.com/runge-kutta/
which is just linear regression on the k slope values with the most recent price set as the intercept term! Hopefully this will be a useful way to generate features for my conditional restricted boltzmann machine, and if I use regularized linear regression I might finally be able to use my particle swarm optimisation code.
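A sketch of that regression idea follows, assuming a matrix K whose columns hold the k1 to k4 values computed as in the previous post's code (K is my own hypothetical name, not something in that code), with price the same row vector as there; the ridge penalty is just one possible form of regularisation.
y = ( price( 2 : end ) - price( 1 : end-1 ) )' ; % 1 bar ahead price change as the regression target
X = K( 1 : end-1 , : ) ; % the k slope values known at each bar
lambda = 0.1 ; % illustrative ridge penalty
w = ( X' * X + lambda * eye( 4 ) ) \ ( X' * y ) ; % regularised least squares on the k values
projection_1_bar = price( 1 : end-1 )' + X * w ; % the most recent price acts as the intercept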
More in due course.
Thursday, 10 September 2015
Recent reading
In my last post I mentioned that I was going away for the summer, but now I'm back. During the summer I didn't get to do as much reading etc. as I had hoped, but I did manage to play around with the Rssa package for singular spectrum analysis and this is still an ongoing investigation. I also briefly looked at independent component analysis and the FastICA package.
The common theme of the above is the extraction of meaningful time series features, and this general area is what I will be looking into for my next set of posts.
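As a bare-bones illustration of the singular spectrum analysis idea (not the Rssa package itself), the sketch below embeds a placeholder series into a trajectory matrix, takes its SVD and diagonally averages the leading rank-one component back into a "trend" feature.
x = cumsum( randn( 300 , 1 ) ) ; % placeholder series
L = 30 ; K = length( x ) - L + 1 ;
X = zeros( L , K ) ;
for ii = 1 : K
X( : , ii ) = x( ii : ii + L - 1 ) ; % trajectory ( Hankel ) matrix
end
[ U , S , V ] = svd( X ) ;
X1 = S( 1 , 1 ) * U( : , 1 ) * V( : , 1 )' ; % leading rank one component
trend = zeros( size( x ) ) ; counts = zeros( size( x ) ) ;
for ii = 1 : K
trend( ii : ii + L - 1 ) = trend( ii : ii + L - 1 ) + X1( : , ii ) ; % diagonal averaging
counts( ii : ii + L - 1 ) = counts( ii : ii + L - 1 ) + 1 ;
end
trend = trend ./ counts ; % the extracted feature series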
Wednesday, 24 June 2015
Results of Permutation tests on Cauchy-Schwarz
Following on from my previous post, I have to say that the results have been quite disappointing - in all the tests I have conducted so far I have been unable to reject the null hypothesis. These tests are still on-going and I may yet find some gold, but it is looking increasingly unlikely and so I won't bore readers with the results of these negative tests. Unless something drastically changes I am planning on abandoning the Cauchy-Schwarz matching algorithm, at least in its current form.
For those who are interested, the test I am conducting is the data mining bias adjusted random permutation test, which is the position_vector_permutation_test.cc file on my Data Snooping Tests page on Github.
Soon I shall be going away for the summer and although I will continue working on a borrowed laptop, I am not sure what internet access I will have and so this post may be my last substantive post until some time in September. I am taking a lot of reading with me, all my Octave code and data and I have loaded up my R installation with lots of interesting packages to play around with some new ideas, so hopefully there will be some interesting new things to post about in the autumn.
Tuesday, 23 June 2015
Cauchy-Schwarz Matching Algo Revisited: More Tests
Some time ago I blogged about my Cauchy-Schwarz inequality inspired matching algorithm and some tests of it here and here. More recently I have come across a nice paper about developing and back testing systematic trading strategies here, courtesy of the quantnews.com aggregating site, and being motivated by the exhortation in said paper to conduct hypothesis driven development and separate evaluation of each component of a strategy, I have decided to perform some more tests of the matching algorithm.
The above mentioned tests were of the Effect size of differences in means between random matches of price and algorithm matched prices for 5 bars following a test point, with the test statistic being the Cauchy-Schwarz value itself. This was intended to be a test of the similarity of the evolution of price after any particular test point. However, a more pertinent test is whether this similarity can be exploited for profit, and doubly so since I intend the matching algorithm to select training examples for my machine learning development of a trading system. If there is no potential to extract profit from the basic selection of matched training examples, it would be naive to expect any machine learning algorithm to somehow magic such profit from these same examples.
The first (set) of test(s) I have in mind is a simple Monte Carlo Permutation test, which will be the subject of my next post.
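As a placeholder for what is to come, a minimal sketch of such a permutation test might look like the following, where matched_returns holds the bar returns following the algorithm's matched points and signs the implied positions; both names are purely hypothetical.
n_reps = 5000 ;
actual_stat = mean( signs .* matched_returns ) ; % the statistic on the real pairing
count = 0 ;
for ii = 1 : n_reps
shuffled = signs( randperm( length( signs ) ) ) ; % destroy any real association
count = count + ( mean( shuffled .* matched_returns ) >= actual_stat ) ;
end
p_value = ( count + 1 ) / ( n_reps + 1 ) % proportion of permutations at least as good as the real pairing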
Monday, 25 May 2015
Accounting for Data Mining Bias
I've recently subscribed to this forexfactory thread, which is about using machine learning to develop trading systems, and the subject of data mining/data dredging has come up. This post is a short description of how mining/dredging can be accounted for, but readers should be aware that the following is not a precise description of any particular test with accompanying code, but rather a hand wavy description of a general but important principle.
Suppose one has conducted a whole series of tests on a particular set of data with a view to developing a trading system. The precise nature of this is not really important - it could be some machine learning approach, a grid search of moving average crossover parameter values, a series of elimination contests to find the "best" indicator, or whatever. While doing this we keep a record of all our results and when the search is complete we plot a histogram thus:-
which is the result of 160,000 distinct tests plotted in 200 bins. Naturally, having done this, we select the best system found, represented by the vertical cursor line at x-axis value 5.2. This 5.2 is our test metric of choice, be it Sharpe ratio, win to loss ratio, whatever. But then we ask ourselves whether we have truly found a world beating system or is this discovery the result of data mining?
To test this, we create a random set of data which has the same attributes as the real data used above. The random data can be obtained by Bootstrapping, random permutation, application of a Markov chain with state spaces derived from the original data etc. The actual choice of which to use will depend on the null hypothesis one wants to test. Having obtained our random data set, we then perform the exact same search as we did above and record the test metric of best performing system found on this random data set. We repeat this 160,000 times and then plot a histogram ( in red ) of the best test results over all these random data sets:-
We find that this random set has a mean value of 0.5 and a standard deviation of 0.2. What this red test set represents is the ability/power of our machine learning algo, grid search criteria etc. to uncover "good" systems in even meaningless data, where all relationships are, in effect, spurious and contain no predictive ability.
We must now suppose that this power to uncover spurious relationships also exists in our original set of tests on the real data, and it must be accounted for. For purposes of illustration I'm going to take a naive approach and take 4 times the standard deviation plus the mean of the red distribution and shift our original green distribution to the right by an amount equal to this sum, a value of 1.3 thus:-
We now see that our original test metric value of 5.2, which was well out in the tail of the non-shifted green distribution, is comfortably within the tail of the shifted distribution, and depending on our choice of p-value etc. we may not be able to reject our null hypothesis, whatever it may have been.
As I warned readers above, this is not supposed to be a mathematically rigorous exposition of how to account for data mining bias, but rather an illustrative explanation of the principle(s) behind accounting for it. The main take away is that the red distribution, whatever it is for the test(s) you are running, needs to be generated and then the tests on real data need to be appropriately discounted by the relevant measures taken from the red distribution before any inferences are drawn about the efficacy of the results on the real data.
For more information about data mining tests readers might care to visit a Github repository I have created, which contains code and some academic papers on the subject.
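To make the principle concrete, a toy sketch of the procedure might look like this; run_search, real_data and best_real_metric are hypothetical stand-ins for whatever search and metric are actually being used, and random permutation is just one of the null data choices mentioned above.
n_searches = 160000 ;
best_random = zeros( n_searches , 1 ) ;
for ii = 1 : n_searches
random_data = real_data( randperm( length( real_data ) ) ) ; % one possible null data set
best_random( ii ) = run_search( random_data ) ; % best metric the search finds in meaningless data
end
adjustment = mean( best_random ) + 4 * std( best_random ) ; % the naive 4 sigma discount used above
adjusted_metric = best_real_metric - adjustment ; % equivalent to shifting the green distribution right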
Thursday, 23 April 2015
A Simple Visual Test of CRBM Performance
Following on from the successful C++ .oct coding of the Gaussian and Binary units, I thought I would conduct a simple visual test of the conditional restricted boltzmann machine, both as a test of the algorithm itself and of my coding of the .oct functions. For this I selected a "difficult" set of price bars from the EURUSD forex pair, difficult in the sense that it is a ranging market with opportunities for the algorithm to fail at market turns, and this is shown below:
For this test there was no optimisation whatsoever; I simply chose a model order of 3, 50 hidden units for both the Gaussian and binary units, the top 500 training examples from my Cauchy-Schwarz matching algorithm, 100 training epochs and 3 sets of features each to model the next day's open, the 3 day maximum high and the 3 day minimum low. These training targets are different from the targets I presented earlier, where I modelled the next 3 OHLC bars individually, because of the results of a very simple analysis of what I will be trying to predict with the CRBM.
The video below presents the results of this visual test. It shows a sliding window of 9 bars moved from left to right over the bars shown in the chart above. In this window the first 6 bars are used to initialise the CRBM, with the 6th bar being the most recent, and the 7th, 8th and 9th bars are the "future" bars for which the CRBM models the open price of the 7th bar and the 3 bar max high and min low of the 7th, 8th and 9th bars. The open price level is shown by the horizontal black line, the max high by the blue line and the min low by the red line.
In viewing this, readers should bear in mind that these levels will be the inputs to my MFEMAE indicator and so the accuracy of the absolute levels is not as important as the ratios between them. However, that said, I am quite impressed with this unoptimised performance and I am certain that this can be improved with cross validated optimisation of the various parameters. This will be my focus for the nearest future.
Monday, 20 April 2015
Optimised CRBM Code for Binary Units
Following on from my previous post, in the code box below there is the C++ .oct code for training the binary units of the CRBM and, as with the code for training the Gaussian units, the code is liberally commented. There is only a slight difference between the two code files.
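For orientation, a call to the function below might look like the sketch that follows; the return values are an assumption on my part (the trained weights and biases initialised within the code), and batchdata and minibatch are as described in the comments within the code.
nt = 3 ; num_epochs = 100 ; num_hid = 50 ;
% batchdata: one row per example; minibatch: columns of row indices into batchdata
% assumed output order: the trained w , bi , bj , A and B
[ w , bi , bj , A , B ] = cc_binary_crbm_mersenne( batchdata , minibatch , nt , num_epochs , num_hid ) ;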
DEFUN_DLD ( cc_binary_crbm_mersenne , args , nargout ,
"-*- texinfo -*-\n\
@deftypefn {Function File} {} cc_binary_crbm_mersenne (@var{ batchdata , minibatch , nt , num_epochs , num_hid })\n\n\
This function trains a binary crbm. The value nt is the order of the model, i.e. how many previous values should be included,\n\
num_epochs is the number of training epochs, and num_hid is the number of nodes in the hidden layer.\n\
@end deftypefn" )
{
octave_value_list retval_list ;
int nargin = args.length () ;
// check the input arguments
if ( nargin != 5 )
{
error ( "Input error: there are insufficient input arguments. Type help for more details." ) ;
return retval_list ;
}
if ( !args(0).is_real_matrix () ) // check batchdata
{
error ( "Input error: the 1st argument, batchdata, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( !args(1).is_real_matrix () ) // check minibatch
{
error ( "Input error: the 2nd argument, minibatch, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( args(2).length () != 1 ) // check nt
{
error ( "Input error: nt should be an interger value for the 'order' of the model. Type help for more details." ) ;
return retval_list ;
}
if ( args(3).length () != 1 ) // num_epochs
{
error ( "Input error: num_epochs should be an integer value for the number of training epochs. Type help for more details." ) ;
return retval_list ;
}
if ( args(4).length () != 1 ) // check num_hid
{
error ( "Input error: num_hid should be an integer for the number of nodes in hidden layer. Type help for more details." ) ;
return retval_list ;
}
if ( error_state )
{
error ( "Input error: type help for details." ) ;
return retval_list ;
}
// end of input checking
// inputs
Matrix batchdata = args(0).matrix_value () ;
Matrix minibatch = args(1).matrix_value () ;
int nt = args(2).int_value () ; // the "order" of the model
int num_epochs = args(3).int_value () ;
int num_hid = args(4).int_value () ;
// variables
// batchdata is a big matrix of all the data and we index it with "minibatch", a matrix of mini-batch indices in the columns
int num_cases = minibatch.rows () ; // Octave code ---> num_cases = length( minibatch{ batch } ) ;
int num_dims = batchdata.cols () ; // visible dimension
Matrix bi_star ( num_dims , num_cases ) ; bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ;
Matrix bj_star ( num_hid , num_cases ) ; bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ;
Matrix repmat_bj ( num_hid , num_cases ) ; repmat_bj.fill( 0.0 ) ; // for Octave code ---> repmat( bj , 1 , num_cases )
Matrix repmat_bi ( num_dims , num_cases ) ; repmat_bi.fill( 0.0 ) ; // for Octave code ---> repmat( bi , 1 , num_cases )
Matrix eta ( num_hid , num_cases ) ; eta.fill( 0.0 ) ;
Matrix h_posteriors ( num_hid , num_cases ) ; h_posteriors.fill( 0.0 ) ; // for the logistic function
Matrix ones ( num_hid , num_cases ) ; ones.fill( 1.0 ) ; // for the logistic function
Matrix ones_2 ( num_cases , num_hid ) ; ones_2.fill( 1.0 ) ; // for the logistic function of negdata
Matrix hid_states ( num_cases , num_hid ) ; hid_states.fill( 0.0 ) ; // for hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
Matrix w_grad ( num_hid , num_dims ) ; w_grad.fill( 0.0 ) ; // for w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
Matrix bi_grad ( num_dims , 1 ) ; bi_grad.fill( 0.0 ) ; // for bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix bj_grad ( num_hid , 1 ) ; bj_grad.fill( 0.0 ) ; // for bj_grad = sum( hid_states , 1 )' ;
Matrix topdown ( num_cases , num_dims ) ; topdown.fill( 0.0 ) ; // for topdown = gsd .* ( hid_states * w ) ;
Matrix negdata ( num_cases , num_dims ) ; negdata.fill( 0.0 ) ;
Matrix negdata_transpose ( num_dims , num_cases ) ; negdata_transpose.fill( 0.0 ) ;
Matrix bi_transpose ( 1 , num_dims ) ; bi_transpose.fill( 0.0 ) ;
Matrix repmat_bi_transpose ( num_cases , num_dims ) ; repmat_bi_transpose.fill( 0.0 ) ;
Matrix neg_w_grad ( num_hid , num_dims ) ; neg_w_grad.fill( 0.0 ) ;
Matrix neg_bi_grad ( num_dims , 1 ) ; neg_bi_grad.fill( 0.0 ) ; // for neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix neg_bj_grad ( num_hid , 1 ) ; neg_bj_grad.fill( 0.0 ) ; // for neg_bj_grad = sum( h_posteriors , 2 ) ;
// Setting learning rates and create some utility matrices
Matrix epsilon_w ( num_hid , num_dims ) ; epsilon_w.fill( 0.001 ) ; // undirected
Matrix epsilon_bi ( num_dims , 1 ) ; epsilon_bi.fill( 0.001 ) ; // visibles
Matrix epsilon_bj ( num_hid , 1 ) ; epsilon_bj.fill( 0.001 ) ; // hidden units
Matrix epsilon_A ( num_dims , num_dims ) ; epsilon_A.fill( 0.001 ) ; // autoregressive
Matrix epsilon_B ( num_hid , num_dims ) ; epsilon_B.fill( 0.001 ) ; // prev visibles to hidden
Matrix w_decay ( num_hid , num_dims ) ; w_decay.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_A ( num_dims , num_dims ) ; w_decay_A.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_B ( num_hid , num_dims ) ; w_decay_B.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix momentum_w ( num_hid , num_dims ) ; momentum_w.fill( 0.0 ) ; // momentum used only after 5 epochs of training, when it will be set to 0.9
Matrix num_cases_matrices_w_and_B ( num_hid , num_dims ) ; num_cases_matrices_w_and_B.fill( num_cases ) ;
Matrix momentum_bi ( num_dims , 1 ) ; momentum_bi.fill( 0.0 ) ;
Matrix num_cases_matrix_bi ( num_dims , 1 ) ; num_cases_matrix_bi.fill( num_cases ) ;
Matrix momentum_bj ( num_hid , 1 ) ; momentum_bj.fill( 0.0 ) ;
Matrix num_cases_matrix_bj ( num_hid , 1 ) ; num_cases_matrix_bj.fill( num_cases ) ;
Matrix momentum_A ( num_dims , num_dims ) ; momentum_A.fill( 0.0 ) ;
Matrix num_cases_matrix_A ( num_dims , num_dims ) ; num_cases_matrix_A.fill( num_cases ) ;
Matrix momentum_B ( num_hid , num_dims ) ; momentum_B.fill( 0.0 ) ;
// initialization of output matrices
Matrix w ( num_hid , num_dims ) ; w.fill( 0.0 ) ; // Octave code ---> w = 0.01 * randn( num_hid , num_dims ) ;
Matrix bi ( num_dims , 1 ) ; bi.fill( 0.0 ) ; // Octave code ---> bi = 0.01 * randn( num_dims , 1 ) ;
Matrix bj( num_hid , 1 ) ; bj.fill( 0.0 ) ; // Octave code ---> bj = -1 + 0.01 * randn( num_hid , 1 ) ; // set to favour units being "off"
// The autoregressive weights; A( : , : , j ) is the weight from t-j to the visible
NDArray A ( dim_vector( num_dims , num_dims , nt ) ) ; A.fill( 0.0 ) ; // Octave code ---> A = 0.01 * randn( num_dims ,num_dims , nt ) ;
// The weights from previous time-steps to the hiddens; B( : , : , j ) is the weight from t-j to the hidden layer
NDArray B ( dim_vector( num_hid , num_dims , nt ) ) ; B.fill( 0.0 ) ; // Octave code ---> B = 0.01 * randn( num_hid , num_dims , nt ) ;
// Declare MersenneTwister random values
MTRand mtrand1 ;
double rand_norm_value ;
double rand_uniform_value ;
// nested loops to fill w, bi, bj, A and B with initial random values
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bj ( ii_nest_loop , 0 ) = -1.0 + rand_norm_value ; // set to favour units being "off"
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
w ( ii_nest_loop , jj_nest_loop ) = rand_norm_value ;
}
}
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bi ( ii_nest_loop , 0 ) = rand_norm_value ;
}
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
A ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
B ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// keep previous updates around for momentum
Matrix w_update ( num_hid , num_dims ) ; w_update.fill( 0.0 ) ; // Octave code ---> w_update = zeros( size( w ) ) ;
Matrix bi_update ( num_dims , 1 ) ; bi_update.fill( 0.0 ) ; // Octave code ---> bi_update = zeros( size( bi ) ) ;
Matrix bj_update ( num_hid , 1 ) ; bj_update.fill( 0.0 ) ; // Octave code ---> bj_update = zeros( size( bj ) ) ;
NDArray A_update ( dim_vector( num_dims , num_dims , nt ) ) ; A_update.fill( 0.0 ) ; // Octave code ---> A_update = zeros( size( A ) ) ;
Matrix A_extract ( num_dims , num_dims ) ; A_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix A_update_extract ( num_dims , num_dims ) ; A_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_update ( dim_vector( num_hid , num_dims , nt ) ) ; B_update.fill( 0.0 ) ; // Octave code ---> B_update = zeros( size( B ) ) ;
Matrix B_extract ( num_hid , num_dims ) ; B_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix B_update_extract ( num_hid , num_dims ) ; B_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
// data is a nt+1 dimensional array with current and delayed data corresponding to mini-batches
// num_cases = minibatch( batch ).length () ;
num_cases = minibatch.rows () ;
NDArray data ( dim_vector( num_cases , num_dims , nt + 1 ) ) ; data.fill( 0.0 ) ; // Octave code ---> data = zeros( num_cases , num_dims , nt + 1 ) ;
Matrix data_extract ( num_cases , num_dims ) ; data_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_transpose ( num_dims , num_cases ) ; data_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0 ( num_cases , num_dims ) ; data_0.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0_transpose ( num_dims , num_cases ) ; data_0_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; A_grad.fill( 0.0 ) ; // for A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
Matrix A_grad_extract ( num_dims , num_dims ) ; A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; neg_A_grad.fill( 0.0 ) ; // for neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
Matrix neg_A_grad_extract ( num_dims , num_dims ) ; neg_A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; B_grad.fill( 0.0 ) ; // for B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
Matrix B_grad_extract ( num_hid , num_dims ) ; B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; neg_B_grad.fill( 0.0 ) ; // for neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
Matrix neg_B_grad_extract ( num_hid , num_dims ) ; neg_B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
Array<octave_idx_type> p ( dim_vector ( nt , 1 ) , 0 ) ; // vector for writing to A_grad and B_grad
// end of initialization of matrices
// %%%%%%%%% THE MAIN CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for ( octave_idx_type epoch ( 0 ) ; epoch < num_epochs ; epoch++ ) // main epoch loop
{
// // errsum = 0 ; % keep a running total of the difference between data and recon
//
for ( octave_idx_type batch ( 0 ) ; batch < minibatch.cols () ; batch++ ) // Octave code ---> int num_batches = minibatch.length () ;
{
// %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// // These next nested loops fill the data NDArray with the values from batchdata indexed
// // by the column values of minibatch. Octave code equivalent given below:-
// // Octave code ---> mb = minibatch{ batch } ; % caches the indices
// // Octave code ---> data( : , : , 1 ) = batchdata( mb , : ) ;
// // Octave code ---> for hh = 1 : nt
// // Octave code ---> data( : , : , hh + 1 ) = batchdata( mb - hh , : ) ;
// // Octave code ---> end
for ( octave_idx_type hh ( 0 ) ; hh < nt + 1 ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
data( ii_nest_loop , jj_nest_loop , hh ) = batchdata( minibatch( ii_nest_loop , batch ) - 1 - hh , jj_nest_loop ) ; // -1 for .oct zero based indexing vs Octave's 1 based
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// The above data filling loop could perhaps be implemented more quickly using the fortran_vec method as below
// http://stackoverflow.com.80bola.com/questions/28900153/create-a-ndarray-in-an-oct-file-from-a-double-pointer
// NDArray a (dim_vector(dim[0], dim[1], dim[2]));
//
// Then loop over (i, j, k) indices to copy the cube to the octave array
//
// double* a_vec = a.fortran_vec ();
// for (int i = 0; i < dim[0]; i++) {
// for (int j = 0; j < dim[1]; j++) {
// for (int k = 0; k < dim[2]; k++) {
// *a_vec++ = armadillo_cube(i, j, k);
// }
// }
// }
// calculate contributions from directed autoregressive connections and contributions from directed visible-to-hidden connections
bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; ( matrix declared earlier in code above )
bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; ( matrix declared earlier in code above )
// The code below calculates two separate Octave code loops in one nested C++ loop structure, namely
// Octave code ---> for hh = 1 : nt
// bi_star = bi_star + A( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
// and
// Octave code ---> for hh = 1:nt
// bj_star = bj_star + B( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
// fill the intermediate calculation matrices
A_extract = A.page ( hh ) ;
B_extract = B.page ( hh ) ;
data_transpose = ( data.page ( hh + 1 ) ).transpose () ;
// add up the hh different matrix multiplications
bi_star += A_extract * data_transpose ;
bj_star += B_extract * data_transpose ;
} // end of hh loop
// extract and pre-calculate to save time in later computations
data_0 = data.page ( 0 ) ;
data_0_transpose = data_0.transpose () ;
// Calculate "posterior" probability -- hidden state being on ( Note that it isn't a true posterior )
// Octave code ---> eta = w * ( data( : , : , 1 ) ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit
// bj_star ; % dynamic biases
// get repmat( bj , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bj.insert( bj , 0 , jj_nest_loop ) ;
}
// bottom_up = w * data( : , : , 1 )' ;
eta = w * data_0_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Activate the hidden units
// Octave code ---> hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < hid_states.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < hid_states.cols () ; jj_nest_loop++ )
{
rand_uniform_value = mtrand1.randDblExc() ; // a real number in the range 0 to 1, excluding both 0 and 1
hid_states( ii_nest_loop , jj_nest_loop ) = h_posteriors( jj_nest_loop , ii_nest_loop ) > rand_uniform_value ? 1.0 : 0.0 ;
}
} // end of hid_states loop
// Calculate positive gradients ( note w.r.t. neg energy )
// Octave code ---> w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
// bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
// bj_grad = sum( hid_states , 1 )' ;
w_grad = hid_states.transpose () * data_0 ;
// get repmat( bi , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bi.insert( bi , 0 , jj_nest_loop ) ;
}
bi_grad = ( data_0_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
bj_grad = ( hid_states.sum ( 0 ) ).transpose () ;
// Octave code ---> for hh = 1 : nt
// A_grad( : , : , hh ) = ( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// B_grad( : , : , hh ) = hid_states' * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
A_grad.insert( ( data_0_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
B_grad.insert( hid_states.transpose () * data_extract , p ) ;
}
// the above code comes from http://stackoverflow.com/questions/29572075/how-do-you-create-an-arrayoctave-idx-type-in-an-oct-file
// e.g.
//
// Array<octave_idx_type> p (dim_vector (3, 1));
// int n = 2;
// dim_vector dim(n, n, 3);
// NDArray a_matrix(dim);
//
// for (octave_idx_type i = 0; i < n; i++)
// for (octave_idx_type j = 0; j < n; j++)
// a_matrix(i,j, 1) = (i + 1) * 10 + (j + 1);
//
// std::cout << a_matrix;
//
// Matrix b_matrix = Matrix (n, n);
// b_matrix(0, 0) = 1;
// b_matrix(0, 1) = 2;
// b_matrix(1, 0) = 3;
// b_matrix(1, 1) = 4;
// std::cout << b_matrix;
//
// Array<octave_idx_type> p (dim_vector (3, 1), 0);
// p(2) = 2;
// a_matrix.insert (b_matrix, p);
//
// std::cout << a_matrix;
// %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Activate the visible units
// Octave code ---> topdown = hid_states * w ;
topdown = hid_states * w ;
// get repmat( bi' , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
bi_transpose = bi.transpose () ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ ) // loop over the rows
{
repmat_bi_transpose.insert( bi_transpose , ii_nest_loop , 0 ) ;
}
eta = topdown + repmat_bi_transpose + bi_star.transpose () ;
// Octave code ---> negdata = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// exponentiate -eta element by element
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
negdata = quotient( ones_2 , ( ones_2 + eta ) ) ;
// Now conditional on negdata, calculate "posterior" probability for hiddens
// Octave code ---> eta = w * ( negdata ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit (no change)
// bj_star ; % dynamic biases (no change)
negdata_transpose = negdata.transpose () ; // to save repetition of transpose
eta = w * negdata_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// exponentiate -eta element by element
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Calculate negative gradients
// Octave code ---> neg_w_grad = h_posteriors * ( negdata ./ gsd ) ; % not using activations
neg_w_grad = h_posteriors * negdata ; // not using activations
// Octave code ---> neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
neg_bi_grad = ( negdata_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
// Octave code ---> neg_bj_grad = sum( h_posteriors , 2 ) ;
neg_bj_grad = h_posteriors.sum ( 1 ) ;
// Octave code ---> for hh = 1 : nt
// neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
neg_A_grad.insert( ( negdata_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
neg_B_grad.insert( h_posteriors * data_extract , p ) ;
} // end of hh loop
// %%%%%%%%% END NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> err = sum( sum( ( data( : , : , 1 ) - negdata ) .^2 ) ) ;
// Not used errsum = err + errsum ;
// Octave code ---> if ( epoch > 5 ) % use momentum
// momentum = mom ;
// else % no momentum
// momentum = 0 ;
// end
// momentum was initialised to 0.0, but on the 6th iteration of epoch, set momentum to 0.9
if ( epoch == 5 ) // will only be true once, after which momentum will == 0.9
{
momentum_w.fill( 0.9 ) ;
momentum_bi.fill( 0.9 ) ;
momentum_bj.fill( 0.9 ) ;
momentum_A.fill( 0.9 ) ;
momentum_B.fill( 0.9 ) ;
}
// %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> w_update = momentum * w_update + epsilon_w * ( ( w_grad - neg_w_grad ) / num_cases - w_decay * w ) ;
w_update = product( momentum_w , w_update ) + product( epsilon_w , quotient( ( w_grad - neg_w_grad ) , num_cases_matrices_w_and_B ) - product( w_decay , w ) ) ;
// Octave code ---> bi_update = momentum * bi_update + ( epsilon_bi / num_cases ) * ( bi_grad - neg_bi_grad ) ;
bi_update = product( momentum_bi , bi_update ) + product( quotient( epsilon_bi , num_cases_matrix_bi ) , ( bi_grad - neg_bi_grad ) ) ;
// Octave code ---> bj_update = momentum * bj_update + ( epsilon_bj / num_cases ) * ( bj_grad - neg_bj_grad ) ;
bj_update = product( momentum_bj , bj_update ) + product( quotient( epsilon_bj , num_cases_matrix_bj ) , ( bj_grad - neg_bj_grad ) ) ;
// The following two Octave code loops are combined into the single .oct loop that follows them
//
// Octave code ---> for hh = 1 : nt
// A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
// B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
// end
// Octave code ---> for hh = 1 : nt
// A( : , : , hh ) = A( : , : , hh ) + A_update( : , : , hh ) ;
// B( : , : , hh ) = B( : , : , hh ) + B_update( : , : , hh ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;
A_update_extract = A_update.page ( hh ) ;
A_grad_extract = A_grad.page ( hh ) ;
neg_A_grad_extract = neg_A_grad.page ( hh ) ;
A_extract = A.page ( hh ) ;
A_update.insert( product( momentum_A , A_update_extract ) + product( epsilon_A , ( quotient( ( A_grad_extract - neg_A_grad_extract ) , num_cases_matrix_A ) - product( w_decay_A , A_extract ) ) ) , p ) ;
A_update_extract = A_update.page ( hh ) ;
A.insert( A_extract + A_update_extract , p ) ;
B_update_extract = B_update.page ( hh ) ;
B_grad_extract = B_grad.page ( hh ) ;
neg_B_grad_extract = neg_B_grad.page ( hh ) ;
B_extract = B.page ( hh ) ;
B_update.insert( product( momentum_B , B_update_extract ) + product( epsilon_B , ( quotient( ( B_grad_extract - neg_B_grad_extract ) , num_cases_matrices_w_and_B ) - product( w_decay_B , B_extract ) ) ) , p ) ;
B_update_extract = B_update.page ( hh ) ;
B.insert( B_extract + B_update_extract , p ) ;
} // end of hh loop
// Octave code ---> w = w + w_update ;
// bi = bi + bi_update ;
// bj = bj + bj_update ;
w += w_update ;
bi += bi_update ;
bj += bj_update ;
// %%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
} // end of batch loop
} // end of main epoch loop
// return the values w , bj , bi , A , B
retval_list(4) = B ;
retval_list(3) = A ;
retval_list(2) = bi ;
retval_list(1) = bj ;
retval_list(0) = w ;
return retval_list ;
} // end of function
Thursday, 16 April 2015
Optimised CRBM Code for Gaussian Units
Over the last few weeks I have been working on optimising the conditional restricted Boltzmann machine code, with a view to speeding it up via a C++ .oct file, and the code box below contains this .oct version of the gaussian_crbm.m code from my previous post. This gaussian_crbm.m function, along with binary_crbm.m, is the speed bottleneck when training the CRBM. In the code below I have made some adjustments for simplification purposes, the most important of which are:
- the minibatch index is a matrix rather than a cell array, with the individual batch indices in the columns, which means that all the batches are the same size.
- as a result, all data held in cell arrays in the .m function is held in NDArrays in the .oct function.
- there is no input for the variable "gsd" because I have assumed a value of 1 applies, which means the input data must be z-score normalised prior to the function call (a sketch of this preparation is given after this list).
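As a minimal sketch of that preparation, assuming raw_data is a matrix with observations in rows and features in columns (raw_data, batch_size and permindex are purely illustrative names), the normalisation and the minibatch index matrix might be built along these lines:
raw_data = cumsum( randn( 1000 , 5 ) ) ; % illustrative data only
nt = 3 ;                                 % order of the model
batch_size = 100 ;                       % rows per mini-batch, all batches equal in size
% z-score normalise because gsd is assumed to be 1 inside the .oct function
batchdata = ( raw_data - mean( raw_data ) ) ./ std( raw_data ) ;
% candidate row indices, all greater than nt so the nt delayed rows always exist
permindex = randperm( rows( batchdata ) - nt ) + nt ;
% trim to a whole number of batches and put one mini-batch of indices per column
n_batches = floor( length( permindex ) / batch_size ) ;
permindex = permindex( 1 : n_batches * batch_size ) ;
minibatch = reshape( permindex , batch_size , n_batches ) ;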
DEFUN_DLD ( cc_gaussian_crbm_mersenne , args , nargout ,
"-*- texinfo -*-\n\
@deftypefn {Function File} {} cc_gaussian_crbm_mersenne (@var{ batchdata , minibatch , nt , num_epochs , num_hid })\n\n\
This function trains a real valued, gaussian crbm where the gsd is assumed to be 1, so the batchdata input must be z-score normalised.\n\
The value nt is the order of the model, i.e. how many previous values should be included, num_epochs is the number of training epochs,\n\
and num_hid is the number of nodes in the hidden layer.\n\
@end deftypefn" )
{
octave_value_list retval_list ;
int nargin = args.length () ;
// check the input arguments
if ( nargin != 5 )
{
error ( "Input error: there are insufficient input arguments. Type help for more details." ) ;
return retval_list ;
}
if ( !args(0).is_real_matrix () ) // check batchdata
{
error ( "Input error: the 1st argument, batchdata, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( !args(1).is_real_matrix () ) // check minibatch
{
error ( "Input error: the 2nd argument, minibatch, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( args(2).length () != 1 ) // check nt
{
error ( "Input error: nt should be an interger value for the 'order' of the model. Type help for more details." ) ;
return retval_list ;
}
if ( args(3).length () != 1 ) // num_epochs
{
error ( "Input error: num_epochs should be an integer value for the number of training epochs. Type help for more details." ) ;
return retval_list ;
}
if ( args(4).length () != 1 ) // check num_hid
{
error ( "Input error: num_hid should be an integer for the number of nodes in hidden layer. Type help for more details." ) ;
return retval_list ;
}
if ( error_state )
{
error ( "Input error: type help for details." ) ;
return retval_list ;
}
// end of input checking
// inputs
Matrix batchdata = args(0).matrix_value () ;
Matrix minibatch = args(1).matrix_value () ;
int nt = args(2).int_value () ; // the "order" of the model
int num_epochs = args(3).int_value () ;
int num_hid = args(4).int_value () ;
// variables
// batchdata is a big matrix of all the data and we index it with "minibatch", a matrix of mini-batch indices in the columns
int num_cases = minibatch.rows () ; // Octave code ---> num_cases = length( minibatch{ batch } ) ;
int num_dims = batchdata.cols () ; // visible dimension
Matrix bi_star ( num_dims , num_cases ) ; bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ;
Matrix bj_star ( num_hid , num_cases ) ; bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ;
Matrix repmat_bj ( num_hid , num_cases ) ; repmat_bj.fill( 0.0 ) ; // for Octave code ---> repmat( bj , 1 , num_cases )
Matrix repmat_bi ( num_dims , num_cases ) ; repmat_bi.fill( 0.0 ) ; // for Octave code ---> repmat( bi , 1 , num_cases )
Matrix eta ( num_hid , num_cases ) ; eta.fill( 0.0 ) ;
Matrix h_posteriors ( num_hid , num_cases ) ; h_posteriors.fill( 0.0 ) ; // for the logistic function
Matrix ones ( num_hid , num_cases ) ; ones.fill( 1.0 ) ; // for the logistic function
Matrix hid_states ( num_cases , num_hid ) ; hid_states.fill( 0.0 ) ; // for hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
Matrix w_grad ( num_hid , num_dims ) ; w_grad.fill( 0.0 ) ; // for w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
Matrix bi_grad ( num_dims , 1 ) ; bi_grad.fill( 0.0 ) ; // for bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix bj_grad ( num_hid , 1 ) ; bj_grad.fill( 0.0 ) ; // for bj_grad = sum( hid_states , 1 )' ;
Matrix topdown ( num_cases , num_dims ) ; topdown.fill( 0.0 ) ; // for topdown = gsd .* ( hid_states * w ) ;
Matrix negdata ( num_cases , num_dims ) ; negdata.fill( 0.0 ) ;
Matrix negdata_transpose ( num_dims , num_cases ) ; negdata_transpose.fill( 0.0 ) ;
Matrix bi_transpose ( 1 , num_dims ) ; bi_transpose.fill( 0.0 ) ;
Matrix repmat_bi_transpose ( num_cases , num_dims ) ; repmat_bi_transpose.fill( 0.0 ) ;
Matrix neg_w_grad ( num_hid , num_dims ) ; neg_w_grad.fill( 0.0 ) ;
Matrix neg_bi_grad ( num_dims , 1 ) ; neg_bi_grad.fill( 0.0 ) ; // for neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix neg_bj_grad ( num_hid , 1 ) ; neg_bj_grad.fill( 0.0 ) ; // for neg_bj_grad = sum( h_posteriors , 2 ) ;
// Set the learning rates and create some utility matrices
Matrix epsilon_w ( num_hid , num_dims ) ; epsilon_w.fill( 0.001 ) ; // undirected
Matrix epsilon_bi ( num_dims , 1 ) ; epsilon_bi.fill( 0.001 ) ; // visibles
Matrix epsilon_bj ( num_hid , 1 ) ; epsilon_bj.fill( 0.001 ) ; // hidden units
Matrix epsilon_A ( num_dims , num_dims ) ; epsilon_A.fill( 0.001 ) ; // autoregressive
Matrix epsilon_B ( num_hid , num_dims ) ; epsilon_B.fill( 0.001 ) ; // prev visibles to hidden
Matrix w_decay ( num_hid , num_dims ) ; w_decay.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_A ( num_dims , num_dims ) ; w_decay_A.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_B ( num_hid , num_dims ) ; w_decay_B.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix momentum_w ( num_hid , num_dims ) ; momentum_w.fill( 0.0 ) ; // momentum used only after 5 epochs of training, when it will be set to 0.9
Matrix num_cases_matrices_w_and_B ( num_hid , num_dims ) ; num_cases_matrices_w_and_B.fill( num_cases ) ;
Matrix momentum_bi ( num_dims , 1 ) ; momentum_bi.fill( 0.0 ) ;
Matrix num_cases_matrix_bi ( num_dims , 1 ) ; num_cases_matrix_bi.fill( num_cases ) ;
Matrix momentum_bj ( num_hid , 1 ) ; momentum_bj.fill( 0.0 ) ;
Matrix num_cases_matrix_bj ( num_hid , 1 ) ; num_cases_matrix_bj.fill( num_cases ) ;
Matrix momentum_A ( num_dims , num_dims ) ; momentum_A.fill( 0.0 ) ;
Matrix num_cases_matrix_A ( num_dims , num_dims ) ; num_cases_matrix_A.fill( num_cases ) ;
Matrix momentum_B ( num_hid , num_dims ) ; momentum_B.fill( 0.0 ) ;
// initialization of output matrices
Matrix w ( num_hid , num_dims ) ; w.fill( 0.0 ) ; // Octave code ---> w = 0.01 * randn( num_hid , num_dims ) ;
Matrix bi ( num_dims , 1 ) ; bi.fill( 0.0 ) ; // Octave code ---> bi = 0.01 * randn( num_dims , 1 ) ;
Matrix bj( num_hid , 1 ) ; bj.fill( 0.0 ) ; // Octave code ---> bj = -1 + 0.01 * randn( num_hid , 1 ) ; // set to favour units being "off"
// The autoregressive weights; A( : , : , j ) is the weight from t-j to the visible
NDArray A ( dim_vector( num_dims , num_dims , nt ) ) ; A.fill( 0.0 ) ; // Octave code ---> A = 0.01 * randn( num_dims ,num_dims , nt ) ;
// The weights from previous time-steps to the hiddens; B( : , : , j ) is the weight from t-j to the hidden layer
NDArray B ( dim_vector( num_hid , num_dims , nt ) ) ; B.fill( 0.0 ) ; // Octave code ---> B = 0.01 * randn( num_hid , num_dims , nt ) ;
// Declare MersenneTwister random values
MTRand mtrand1 ;
double rand_norm_value ;
double rand_uniform_value ;
// nested loops to fill w, bi, bj, A and B with initial random values
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bj ( ii_nest_loop , 0 ) = -1.0 + rand_norm_value ; // set to favour units being "off"
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
w ( ii_nest_loop , jj_nest_loop ) = rand_norm_value ;
}
}
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bi ( ii_nest_loop , 0 ) = rand_norm_value ;
}
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
A ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
B ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// keep previous updates around for momentum
Matrix w_update ( num_hid , num_dims ) ; w_update.fill( 0.0 ) ; // Octave code ---> w_update = zeros( size( w ) ) ;
Matrix bi_update ( num_dims , 1 ) ; bi_update.fill( 0.0 ) ; // Octave code ---> bi_update = zeros( size( bi ) ) ;
Matrix bj_update ( num_hid , 1 ) ; bj_update.fill( 0.0 ) ; // Octave code ---> bj_update = zeros( size( bj ) ) ;
NDArray A_update ( dim_vector( num_dims , num_dims , nt ) ) ; A_update.fill( 0.0 ) ; // Octave code ---> A_update = zeros( size( A ) ) ;
Matrix A_extract ( num_dims , num_dims ) ; A_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix A_update_extract ( num_dims , num_dims ) ; A_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_update ( dim_vector( num_hid , num_dims , nt ) ) ; B_update.fill( 0.0 ) ; // Octave code ---> B_update = zeros( size( B ) ) ;
Matrix B_extract ( num_hid , num_dims ) ; B_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix B_update_extract ( num_hid , num_dims ) ; B_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
// data is a nt+1 dimensional array with current and delayed data corresponding to mini-batches
// num_cases = minibatch( batch ).length () ;
num_cases = minibatch.rows () ;
NDArray data ( dim_vector( num_cases , num_dims , nt + 1 ) ) ; data.fill( 0.0 ) ; // Octave code ---> data = zeros( num_cases , num_dims , nt + 1 ) ;
Matrix data_extract ( num_cases , num_dims ) ; data_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_transpose ( num_dims , num_cases ) ; data_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0 ( num_cases , num_dims ) ; data_0.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0_transpose ( num_dims , num_cases ) ; data_0_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; A_grad.fill( 0.0 ) ; // for A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
Matrix A_grad_extract ( num_dims , num_dims ) ; A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; neg_A_grad.fill( 0.0 ) ; // for neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
Matrix neg_A_grad_extract ( num_dims , num_dims ) ; neg_A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; B_grad.fill( 0.0 ) ; // for B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
Matrix B_grad_extract ( num_hid , num_dims ) ; B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; neg_B_grad.fill( 0.0 ) ; // for neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
Matrix neg_B_grad_extract ( num_hid , num_dims ) ; neg_B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
Array<octave_idx_type> p ( dim_vector ( 3 , 1 ) , 0 ) ; // index vector with one entry per dimension of the 3-D NDArrays, used for writing to pages of A_grad and B_grad
// end of initialization of matrices
// %%%%%%%%% THE MAIN CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
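// Outline of what follows: for each mini-batch a single contrastive divergence (CD-1)
// step is performed -- the positive phase collects statistics from the data, the
// negative phase reconstructs the visible units once (negdata is just the Gaussian
// mean) and collects statistics from that reconstruction, and w, bi, bj, A and B are
// then updated with the difference of the two sets of statistics, with weight decay
// on w, A and B and momentum from the 6th epoch onwards.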
for ( octave_idx_type epoch ( 0 ) ; epoch < num_epochs ; epoch++ ) // main epoch loop
{
// // errsum = 0 ; % keep a running total of the difference between data and recon
//
for ( octave_idx_type batch ( 0 ) ; batch < minibatch.cols () ; batch++ ) // Octave code ---> int num_batches = minibatch.length () ;
{
// %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// // These next nested loops fill the data NDArray with the values from batchdata indexed
// // by the column values of minibatch. Octave code equivalent given below:-
// // Octave code ---> mb = minibatch{ batch } ; % caches the indices
// // Octave code ---> data( : , : , 1 ) = batchdata( mb , : ) ;
// // Octave code ---> for hh = 1 : nt
// // Octave code ---> data( : , : , hh + 1 ) = batchdata( mb - hh , : ) ;
// // Octave code ---> end
for ( octave_idx_type hh ( 0 ) ; hh < nt + 1 ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
data( ii_nest_loop , jj_nest_loop , hh ) = batchdata( minibatch( ii_nest_loop , batch ) - 1 - hh , jj_nest_loop ) ; // -1 for .oct zero based indexing vs Octave's 1 based
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// The above data filling loop could perhaps be implemented more quickly using the fortran_vec method as below
// http://stackoverflow.com/questions/28900153/create-a-ndarray-in-an-oct-file-from-a-double-pointer
// NDArray a (dim_vector(dim[0], dim[1], dim[2]));
//
// Then loop over (i, j, k) indices to copy the cube to the octave array
//
// double* a_vec = a.fortran_vec ();
// for (int i = 0; i < dim[0]; i++) {
// for (int j = 0; j < dim[1]; j++) {
// for (int k = 0; k < dim[2]; k++) {
// *a_vec++ = armadillo_cube(i, j, k);
// }
// }
// }
// calculate contributions from directed autoregressive connections and contributions from directed visible-to-hidden connections
bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; ( matrix declared earlier in code above )
bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; ( matrix declared earlier in code above )
// The code below combines two separate Octave code loops into one nested C++ loop structure, namely
// Octave code ---> for hh = 1 : nt
// bi_star = bi_star + A( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
// and
// Octave code ---> for hh = 1:nt
// bj_star = bj_star + B( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
// fill the intermediate calculation matrices
A_extract = A.page ( hh ) ;
B_extract = B.page ( hh ) ;
data_transpose = ( data.page ( hh + 1 ) ).transpose () ;
// add up the hh different matrix multiplications
bi_star += A_extract * data_transpose ;
bj_star += B_extract * data_transpose ;
} // end of hh loop
// extract and pre-calculate to save time in later computations
data_0 = data.page ( 0 ) ;
data_0_transpose = data_0.transpose () ;
// Calculate "posterior" probability -- hidden state being on ( Note that it isn't a true posterior )
// Octave code ---> eta = w * ( data( : , : , 1 ) ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit
// bj_star ; % dynamic biases
// get repmat( bj , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bj.insert( bj , 0 , jj_nest_loop ) ;
}
eta = w * data_0_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// exponentiate -eta element by element
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Activate the hidden units
// Octave code ---> hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < hid_states.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < hid_states.cols () ; jj_nest_loop++ )
{
rand_uniform_value = mtrand1.randDblExc() ; // a real number in the range 0 to 1, excluding both 0 and 1
hid_states( ii_nest_loop , jj_nest_loop ) = h_posteriors( jj_nest_loop , ii_nest_loop ) > rand_uniform_value ? 1.0 : 0.0 ;
}
} // end of hid_states loop
// Calculate positive gradients ( note w.r.t. neg energy )
// Octave code ---> w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
// bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
// bj_grad = sum( hid_states , 1 )' ;
w_grad = hid_states.transpose () * data_0 ;
// get repmat( bi , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bi.insert( bi , 0 , jj_nest_loop ) ;
}
bi_grad = ( data_0_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
bj_grad = ( hid_states.sum ( 0 ) ).transpose () ;
// Octave code ---> for hh = 1 : nt
// A_grad( : , : , hh ) = ( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// B_grad( : , : , hh ) = hid_states' * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
A_grad.insert( ( data_0_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
B_grad.insert( hid_states.transpose () * data_extract , p ) ;
}
// the above code comes from http://stackoverflow.com/questions/29572075/how-do-you-create-an-arrayoctave-idx-type-in-an-oct-file
// e.g.
//
// Array<octave_idx_type> p (dim_vector (3, 1));
// int n = 2;
// dim_vector dim(n, n, 3);
// NDArray a_matrix(dim);
//
// for (octave_idx_type i = 0; i < n; i++)
// for (octave_idx_type j = 0; j < n; j++)
// a_matrix(i,j, 1) = (i + 1) * 10 + (j + 1);
//
// std::cout << a_matrix;
//
// Matrix b_matrix = Matrix (n, n);
// b_matrix(0, 0) = 1;
// b_matrix(0, 1) = 2;
// b_matrix(1, 0) = 3;
// b_matrix(1, 1) = 4;
// std::cout << b_matrix;
//
// Array<octave_idx_type> p (dim_vector (3, 1), 0);
// p(2) = 2;
// a_matrix.insert (b_matrix, p);
//
// std::cout << a_matrix;
// %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Activate the visible units
// Find the mean of the Gaussian
// Octave code ---> topdown = gsd .* ( hid_states * w ) ;
topdown = hid_states * w ;
// This is the mean of the Gaussian. Instead of properly sampling, negdata is just the mean
// If we want to sample from the Gaussian, we would add in gsd .* randn( num_cases , num_dims ) ;
// Octave code ---> negdata = topdown + ... % top down connections
// repmat( bi' , num_cases , 1 ) + ... % static biases
// bi_star' ; % dynamic biases
// get repmat( bi' , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
bi_transpose = bi.transpose () ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ ) // loop over the rows
{
repmat_bi_transpose.insert( bi_transpose , ii_nest_loop , 0 ) ;
}
negdata = topdown + repmat_bi_transpose + bi_star.transpose () ;
// Now conditional on negdata, calculate "posterior" probability for hiddens
// Octave code ---> eta = w * ( negdata ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit (no change)
// bj_star ; % dynamic biases (no change)
negdata_transpose = negdata.transpose () ; // to save repetition of transpose
eta = w * negdata_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// exponentiate -eta element by element
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Calculate negative gradients
// Octave code ---> neg_w_grad = h_posteriors * ( negdata ./ gsd ) ; % not using activations
neg_w_grad = h_posteriors * negdata ; // not using activations
// Octave code ---> neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
neg_bi_grad = ( negdata_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
// Octave code ---> neg_bj_grad = sum( h_posteriors , 2 ) ;
neg_bj_grad = h_posteriors.sum ( 1 ) ;
// Octave code ---> for hh = 1 : nt
// neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
neg_A_grad.insert( ( negdata_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
neg_B_grad.insert( h_posteriors * data_extract , p ) ;
} // end of hh loop
// %%%%%%%%% END NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> err = sum( sum( ( data( : , : , 1 ) - negdata ) .^2 ) ) ;
// Not used errsum = err + errsum ;
// Octave code ---> if ( epoch > 5 ) % use momentum
// momentum = mom ;
// else % no momentum
// momentum = 0 ;
// end
// momentum was initialised to 0.0, but on the 6th iteration of epoch, set momentum to 0.9
if ( epoch == 5 ) // will only be true once, after which momentum will == 0.9
{
momentum_w.fill( 0.9 ) ;
momentum_bi.fill( 0.9 ) ;
momentum_bj.fill( 0.9 ) ;
momentum_A.fill( 0.9 ) ;
momentum_B.fill( 0.9 ) ;
}
// %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> w_update = momentum * w_update + epsilon_w * ( ( w_grad - neg_w_grad ) / num_cases - w_decay * w ) ;
w_update = product( momentum_w , w_update ) + product( epsilon_w , quotient( ( w_grad - neg_w_grad ) , num_cases_matrices_w_and_B ) - product( w_decay , w ) ) ;
// Octave code ---> bi_update = momentum * bi_update + ( epsilon_bi / num_cases ) * ( bi_grad - neg_bi_grad ) ;
bi_update = product( momentum_bi , bi_update ) + product( quotient( epsilon_bi , num_cases_matrix_bi ) , ( bi_grad - neg_bi_grad ) ) ;
// Octave code ---> bj_update = momentum * bj_update + ( epsilon_bj / num_cases ) * ( bj_grad - neg_bj_grad ) ;
bj_update = product( momentum_bj , bj_update ) + product( quotient( epsilon_bj , num_cases_matrix_bj ) , ( bj_grad - neg_bj_grad ) ) ;
// The following two Octave code loops are combined into the single .oct loop that follows them
//
// Octave code ---> for hh = 1 : nt
// A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
// B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
// end
// Octave code ---> for hh = 1 : nt
// A( : , : , hh ) = A( : , : , hh ) + A_update( : , : , hh ) ;
// B( : , : , hh ) = B( : , : , hh ) + B_update( : , : , hh ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;
A_update_extract = A_update.page ( hh ) ;
A_grad_extract = A_grad.page ( hh ) ;
neg_A_grad_extract = neg_A_grad.page ( hh ) ;
A_extract = A.page ( hh ) ;
A_update.insert( product( momentum_A , A_update_extract ) + product( epsilon_A , ( quotient( ( A_grad_extract - neg_A_grad_extract ) , num_cases_matrix_A ) - product( w_decay_A , A_extract ) ) ) , p ) ;
A_update_extract = A_update.page ( hh ) ;
A.insert( A_extract + A_update_extract , p ) ;
B_update_extract = B_update.page ( hh ) ;
B_grad_extract = B_grad.page ( hh ) ;
neg_B_grad_extract = neg_B_grad.page ( hh ) ;
B_extract = B.page ( hh ) ;
B_update.insert( product( momentum_B , B_update_extract ) + product( epsilon_B , ( quotient( ( B_grad_extract - neg_B_grad_extract ) , num_cases_matrices_w_and_B ) - product( w_decay_B , B_extract ) ) ) , p ) ;
B_update_extract = B_update.page ( hh ) ;
B.insert( B_extract + B_update_extract , p ) ;
} // end of hh loop
// Octave code ---> w = w + w_update ;
// bi = bi + bi_update ;
// bj = bj + bj_update ;
w += w_update ;
bi += bi_update ;
bj += bj_update ;
// %%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
} // end of batch loop
} // end of main epoch loop
// return the values w , bj , bi , A , B
retval_list(4) = B ;
retval_list(3) = A ;
retval_list(2) = bi ;
retval_list(1) = bj ;
retval_list(0) = w ;
return retval_list ;
} // end of function
The code is heavily commented throughout. The .oct code for the binary_crbm.m function will follow in due course.
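For completeness, compiling and calling the function from the Octave prompt looks roughly like this; the file name cc_gaussian_crbm_mersenne.cc is just my assumption, and the MTRand class used above is assumed to come from the usual MersenneTwister.h header, which must be on the include path when compiling. The values for num_epochs and num_hid are illustrative only.
% compile the oct-file (assumes MersenneTwister.h is in the current directory or include path)
mkoctfile cc_gaussian_crbm_mersenne.cc
% train; outputs are returned in the order w , bj , bi , A , B
num_epochs = 100 ;
num_hid = 50 ;
[ w , bj , bi , A , B ] = cc_gaussian_crbm_mersenne( batchdata , minibatch , nt , num_epochs , num_hid ) ;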