DEFUN_DLD ( cc_binary_crbm_mersenne , args , nargout ,
"-*- texinfo -*-\n\
@deftypefn {Function File} {} cc_binary_crbm_mersenne (@var{ batchdata , minibatch , nt , num_epochs , num_hid })\n\n\
This function trains a binary crbm. The value nt is the order of the model, i.e. how many previous values should be included,\n\
num_epochs is the number of training epochs, and num_hid is the number of nodes in the hidden layer.\n\
@end deftypefn" )
{
octave_value_list retval_list ;
int nargin = args.length () ;
// check the input arguments
if ( nargin != 5 )
{
error ( "Input error: there are insufficient input arguments. Type help for more details." ) ;
return retval_list ;
}
if ( !args(0).is_real_matrix () ) // check batchdata
{
error ( "Input error: the 1st argument, batchdata, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( !args(1).is_real_matrix () ) // check minibatch
{
error ( "Input error: the 2nd argument, minibatch, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}
if ( args(2).length () != 1 ) // check nt
{
error ( "Input error: nt should be an interger value for the 'order' of the model. Type help for more details." ) ;
return retval_list ;
}
if ( args(3).length () != 1 ) // num_epochs
{
error ( "Input error: num_epochs should be an integer value for the number of training epochs. Type help for more details." ) ;
return retval_list ;
}
if ( args(4).length () != 1 ) // check num_hid
{
error ( "Input error: num_hid should be an integer for the number of nodes in hidden layer. Type help for more details." ) ;
return retval_list ;
}
if ( error_state )
{
error ( "Input error: type help for details." ) ;
return retval_list ;
}
// end of input checking
// inputs
Matrix batchdata = args(0).matrix_value () ;
Matrix minibatch = args(1).matrix_value () ;
int nt = args(2).int_value () ; // the "order" of the model
int num_epochs = args(3).int_value () ;
int num_hid = args(4).int_value () ;
// variables
// batchdata is a big matrix of all the data and we index it with "minibatch", a matrix of mini-batch indices in the columns
int num_cases = minibatch.rows () ; // Octave code ---> num_cases = length( minibatch{ batch } ) ;
int num_dims = batchdata.cols () ; // visible dimension
Matrix bi_star ( num_dims , num_cases ) ; bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ;
Matrix bj_star ( num_hid , num_cases ) ; bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ;
Matrix repmat_bj ( num_hid , num_cases ) ; repmat_bj.fill( 0.0 ) ; // for Octave code ---> repmat( bj , 1 , num_cases )
Matrix repmat_bi ( num_dims , num_cases ) ; repmat_bi.fill( 0.0 ) ; // for Octave code ---> repmat( bi , 1 , num_cases )
Matrix eta ( num_hid , num_cases ) ; eta.fill( 0.0 ) ;
Matrix h_posteriors ( num_hid , num_cases ) ; h_posteriors.fill( 0.0 ) ; // for the logistic function
Matrix ones ( num_hid , num_cases ) ; ones.fill( 1.0 ) ; // for the logistic function
Matrix ones_2 ( num_cases , num_hid ) ; ones_2.fill( 1.0 ) ; // for the logistic function of negdata
Matrix hid_states ( num_cases , num_hid ) ; hid_states.fill( 0.0 ) ; // for hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
Matrix w_grad ( num_hid , num_dims ) ; w_grad.fill( 0.0 ) ; // for w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
Matrix bi_grad ( num_dims , 1 ) ; bi_grad.fill( 0.0 ) ; // for bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix bj_grad ( num_hid , 1 ) ; bj_grad.fill( 0.0 ) ; // for bj_grad = sum( hid_states , 1 )' ;
Matrix topdown ( num_cases , num_dims ) ; topdown.fill( 0.0 ) ; // for topdown = gsd .* ( hid_states * w ) ;
Matrix negdata ( num_cases , num_dims ) ; negdata.fill( 0.0 ) ;
Matrix negdata_transpose ( num_dims , num_cases ) ; negdata_transpose.fill( 0.0 ) ;
Matrix bi_transpose ( 1 , num_dims ) ; bi_transpose.fill( 0.0 ) ;
Matrix repmat_bi_transpose ( num_cases , num_dims ) ; repmat_bi_transpose.fill( 0.0 ) ;
Matrix neg_w_grad ( num_hid , num_dims ) ; neg_w_grad.fill( 0.0 ) ;
Matrix neg_bi_grad ( num_dims , 1 ) ; neg_bi_grad.fill( 0.0 ) ; // for neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix neg_bj_grad ( num_hid , 1 ) ; neg_bj_grad.fill( 0.0 ) ; // for neg_bj_grad = sum( h_posteriors , 2 ) ;
// Setting learning rates and create some utility matrices
Matrix epsilon_w ( num_hid , num_dims ) ; epsilon_w.fill( 0.001 ) ; // undirected
Matrix epsilon_bi ( num_dims , 1 ) ; epsilon_bi.fill( 0.001 ) ; // visibles
Matrix epsilon_bj ( num_hid , 1 ) ; epsilon_bj.fill( 0.001 ) ; // hidden units
Matrix epsilon_A ( num_dims , num_dims ) ; epsilon_A.fill( 0.001 ) ; // autoregressive
Matrix epsilon_B ( num_hid , num_dims ) ; epsilon_B.fill( 0.001 ) ; // prev visibles to hidden
Matrix w_decay ( num_hid , num_dims ) ; w_decay.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_A ( num_dims , num_dims ) ; w_decay_A.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_B ( num_hid , num_dims ) ; w_decay_B.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix momentum_w ( num_hid , num_dims ) ; momentum_w.fill( 0.0 ) ; // momentum used only after 5 epochs of training, when it will be set to 0.9
Matrix num_cases_matrices_w_and_B ( num_hid , num_dims ) ; num_cases_matrices_w_and_B.fill( num_cases ) ;
Matrix momentum_bi ( num_dims , 1 ) ; momentum_bi.fill( 0.0 ) ;
Matrix num_cases_matrix_bi ( num_dims , 1 ) ; num_cases_matrix_bi.fill( num_cases ) ;
Matrix momentum_bj ( num_hid , 1 ) ; momentum_bj.fill( 0.0 ) ;
Matrix num_cases_matrix_bj ( num_hid , 1 ) ; num_cases_matrix_bj.fill( num_cases ) ;
Matrix momentum_A ( num_dims , num_dims ) ; momentum_A.fill( 0.0 ) ;
Matrix num_cases_matrix_A ( num_dims , num_dims ) ; num_cases_matrix_A.fill( num_cases ) ;
Matrix momentum_B ( num_hid , num_dims ) ; momentum_B.fill( 0.0 ) ;
// initialization of output matrices
Matrix w ( num_hid , num_dims ) ; w.fill( 0.0 ) ; // Octave code ---> w = 0.01 * randn( num_hid , num_dims ) ;
Matrix bi ( num_dims , 1 ) ; bi.fill( 0.0 ) ; // Octave code ---> bi = 0.01 * randn( num_dims , 1 ) ;
Matrix bj( num_hid , 1 ) ; bj.fill( 0.0 ) ; // Octave code ---> bj = -1 + 0.01 * randn( num_hid , 1 ) ; // set to favour units being "off"
// The autoregressive weights; A( : , : , j ) is the weight from t-j to the visible
NDArray A ( dim_vector( num_dims , num_dims , nt ) ) ; A.fill( 0.0 ) ; // Octave code ---> A = 0.01 * randn( num_dims ,num_dims , nt ) ;
// The weights from previous time-steps to the hiddens; B( : , : , j ) is the weight from t-j to the hidden layer
NDArray B ( dim_vector( num_hid , num_dims , nt ) ) ; B.fill( 0.0 ) ; // Octave code ---> B = 0.01 * randn( num_hid , num_dims , nt ) ;
// Declare MersenneTwister random values
MTRand mtrand1 ;
double rand_norm_value ;
double rand_uniform_value ;
// nested loops to fill w, bi, bj, A and B with initial random values
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bj ( ii_nest_loop , 0 ) = -1.0 + rand_norm_value ; // set to favour units being "off"
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
w ( ii_nest_loop , jj_nest_loop ) = rand_norm_value ;
}
}
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bi ( ii_nest_loop , 0 ) = rand_norm_value ;
}
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
A ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
B ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// keep previous updates around for momentum
Matrix w_update ( num_hid , num_dims ) ; w_update.fill( 0.0 ) ; // Octave code ---> w_update = zeros( size( w ) ) ;
Matrix bi_update ( num_dims , 1 ) ; bi_update.fill( 0.0 ) ; // Octave code ---> bi_update = zeros( size( bi ) ) ;
Matrix bj_update ( num_hid , 1 ) ; bj_update.fill( 0.0 ) ; // Octave code ---> bj_update = zeros( size( bj ) ) ;
NDArray A_update ( dim_vector( num_dims , num_dims , nt ) ) ; A_update.fill( 0.0 ) ; // Octave code ---> A_update = zeros( size( A ) ) ;
Matrix A_extract ( num_dims , num_dims ) ; A_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix A_update_extract ( num_dims , num_dims ) ; A_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_update ( dim_vector( num_hid , num_dims , nt ) ) ; B_update.fill( 0.0 ) ; // Octave code ---> B_update = zeros( size( B ) ) ;
Matrix B_extract ( num_hid , num_dims ) ; B_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix B_update_extract ( num_hid , num_dims ) ; B_update_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
// data is a nt+1 dimensional array with current and delayed data corresponding to mini-batches
// num_cases = minibatch( batch ).length () ;
num_cases = minibatch.rows () ;
NDArray data ( dim_vector( num_cases , num_dims , nt + 1 ) ) ; data.fill( 0.0 ) ; // Octave code ---> data = zeros( num_cases , num_dims , nt + 1 ) ;
Matrix data_extract ( num_cases , num_dims ) ; data_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_transpose ( num_dims , num_cases ) ; data_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0 ( num_cases , num_dims ) ; data_0.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0_transpose ( num_dims , num_cases ) ; data_0_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; A_grad.fill( 0.0 ) ; // for A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
Matrix A_grad_extract ( num_dims , num_dims ) ; A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; neg_A_grad.fill( 0.0 ) ; // for neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
Matrix neg_A_grad_extract ( num_dims , num_dims ) ; neg_A_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; B_grad.fill( 0.0 ) ; // for B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
Matrix B_grad_extract ( num_hid , num_dims ) ; B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
NDArray neg_B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; neg_B_grad.fill( 0.0 ) ; // for neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
Matrix neg_B_grad_extract ( num_hid , num_dims ) ; neg_B_grad_extract.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays
Array p ( dim_vector ( nt , 1 ) , 0 ) ; // vector for writing to A_grad and B_grad
// end of initialization of matrices
// %%%%%%%%% THE MAIN CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for ( octave_idx_type epoch ( 0 ) ; epoch < num_epochs ; epoch++ ) // main epoch loop
{
// // errsum = 0 ; % keep a running total of the difference between data and recon
//
for ( octave_idx_type batch ( 0 ) ; batch < minibatch.cols () ; batch++ ) // Octave code ---> int num_batches = minibatch.length () ;
{
// %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// // These next nested loops fill the data NDArray with the values from batchdata indexed
// // by the column values of minibatch. Octave code equivalent given below:-
// // Octave code ---> mb = minibatch{ batch } ; % caches the indices
// // Octave code ---> data( : , : , 1 ) = batchdata( mb , : ) ;
// // Octave code ---> for hh = 1 : nt
// // Octave code ---> data( : , : , hh + 1 ) = batchdata( mb - hh , : ) ;
// // Octave code ---> end
for ( octave_idx_type hh ( 0 ) ; hh < nt + 1 ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
data( ii_nest_loop , jj_nest_loop , hh ) = batchdata( minibatch( ii_nest_loop , batch ) - 1 - hh , jj_nest_loop ) ; // -1 for .oct zero based indexing vs Octave's 1 based
} // end of jj_nest_loop loop
} // end of ii_nest_loop loop
} // end of hh loop
// The above data filling loop could perhaps be implemented more quickly using the fortrans vec method as below
// http://stackoverflow.com.80bola.com/questions/28900153/create-a-ndarray-in-an-oct-file-from-a-double-pointer
// NDArray a (dim_vector(dim[0], dim[1], dim[2]));
//
// Then loop over (i, j, k) indices to copy the cube to the octave array
//
// double* a_vec = a.fortran_vec ();
// for (int i = 0; i < dim[0]; i++) {
// for (int j = 0; j < dim[1]; j++) {
// for (int k = 0; k < dim[2]; k++) {
// *a_vec++ = armadillo_cube(i, j, k);
// }
// }
// }
// calculate contributions from directed autoregressive connections and contributions from directed visible-to-hidden connections
bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; ( matrix declared earlier in code above )
bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; ( matrix declared earlier in code above )
// The code below calculates two separate Octave code loops in one nested C++ loop structure, namely
// Octave code ---> for hh = 1 : nt
// bi_star = bi_star + A( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
// and
// Octave code ---> for hh = 1:nt
// bj_star = bj_star + B( : , : , hh ) * data( : , : , hh + 1 )' ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
// fill the intermediate calculation matrices
A_extract = A.page ( hh ) ;
B_extract = B.page ( hh ) ;
data_transpose = ( data.page ( hh + 1 ) ).transpose () ;
// add up the hh different matrix multiplications
bi_star += A_extract * data_transpose ;
bj_star += B_extract * data_transpose ;
} // end of hh loop
// extract and pre-calculate to save time in later computations
data_0 = data.page ( 0 ) ;
data_0_transpose = data_0.transpose () ;
// Calculate "posterior" probability -- hidden state being on ( Note that it isn't a true posterior )
// Octave code ---> eta = w * ( data( : , : , 1 ) ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit
// bj_star ; % dynamic biases
// get repmat( bj , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bj.insert( bj , 0 , jj_nest_loop ) ;
}
// bottom_up = w * data( : , : , 1 )' ;
eta = w * data_0_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Activate the hidden units
// Octave code ---> hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < hid_states.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < hid_states.cols () ; jj_nest_loop++ )
{
rand_uniform_value = mtrand1.randDblExc() ; // a real number in the range 0 to 1, excluding both 0 and 1
hid_states( ii_nest_loop , jj_nest_loop ) = h_posteriors( jj_nest_loop , ii_nest_loop ) > rand_uniform_value ? 1.0 : 0.0 ;
}
} // end of hid_states loop
// Calculate positive gradients ( note w.r.t. neg energy )
// Octave code ---> w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
// bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
// bj_grad = sum( hid_states , 1 )' ;
w_grad = hid_states.transpose () * data_0 ;
// get repmat( bi , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bi.insert( bi , 0 , jj_nest_loop ) ;
}
bi_grad = ( data_0_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
bj_grad = ( hid_states.sum ( 0 ) ).transpose () ;
// Octave code ---> for hh = 1 : nt
// A_grad( : , : , hh ) = ( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// B_grad( : , : , hh ) = hid_states' * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
A_grad.insert( ( data_0_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
B_grad.insert( hid_states.transpose () * data_extract , p ) ;
}
// the above code comes from http://stackoverflow.com/questions/29572075/how-do-you-create-an-arrayoctave-idx-type-in-an-oct-file
// e.g.
//
// Array p (dim_vector (3, 1));
// int n = 2;
// dim_vector dim(n, n, 3);
// NDArray a_matrix(dim);
//
// for (octave_idx_type i = 0; i < n; i++)
// for (octave_idx_type j = 0; j < n; j++)
// a_matrix(i,j, 1) = (i + 1) * 10 + (j + 1);
//
// std::cout << a_matrix;
//
// Matrix b_matrix = Matrix (n, n);
// b_matrix(0, 0) = 1;
// b_matrix(0, 1) = 2;
// b_matrix(1, 0) = 3;
// b_matrix(1, 1) = 4;
// std::cout << b_matrix;
//
// Array p (dim_vector (3, 1), 0);
// p(2) = 2;
// a_matrix.insert (b_matrix, p);
//
// std::cout << a_matrix;
// %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Activate the visible units
// Octave code ---> topdown = hid_states * w ;
topdown = hid_states * w ;
// get repmat( bi' , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
bi_transpose = bi.transpose () ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ ) // loop over the rows
{
repmat_bi_transpose.insert( bi_transpose , ii_nest_loop , 0 ) ;
}
eta = topdown + repmat_bi_transpose + bi_star.transpose () ;
// Octave code ---> negdata = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
negdata = quotient( ones_2 , ( ones_2 + eta ) ) ;
// Now conditional on negdata, calculate "posterior" probability for hiddens
// Octave code ---> eta = w * ( negdata ./ gsd )' + ... % bottom-up connections
// repmat( bj , 1 , num_cases ) + ... % static biases on unit (no change)
// bj_star ; % dynamic biases (no change)
negdata_transpose = negdata.transpose () ; // to save repetition of transpose
eta = w * negdata_transpose + repmat_bj + bj_star ;
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ; % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}
// element division A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;
// Calculate negative gradients
// Octave code ---> neg_w_grad = h_posteriors * ( negdata ./ gsd ) ; % not using activations
neg_w_grad = h_posteriors * negdata ; // not using activations
// Octave code ---> neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
neg_bi_grad = ( negdata_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
// Octave code ---> neg_bj_grad = sum( h_posteriors , 2 ) ;
neg_bj_grad = h_posteriors.sum ( 1 ) ;
// Octave code ---> for hh = 1 : nt
// neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
// neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ; // set Array p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ; // get the equivalent of data( : , : , hh + 1 )
neg_A_grad.insert( ( negdata_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
neg_B_grad.insert( h_posteriors * data_extract , p ) ;
} // end of hh loop
// %%%%%%%%% END NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> err = sum( sum( ( data( : , : , 1 ) - negdata ) .^2 ) ) ;
// Not used errsum = err + errsum ;
// Octave code ---> if ( epoch > 5 ) % use momentum
// momentum = mom ;
// else % no momentum
// momentum = 0 ;
// end
// momentum was initialised to 0.0, but on the 6th iteration of epoch, set momentum to 0.9
if ( epoch == 5 ) // will only be true once, after which momentum will == 0.9
{
momentum_w.fill( 0.9 ) ;
momentum_bi.fill( 0.9 ) ;
momentum_bj.fill( 0.9 ) ;
momentum_A.fill( 0.9 ) ;
momentum_B.fill( 0.9 ) ;
}
// %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Octave code ---> w_update = momentum * w_update + epsilon_w * ( ( w_grad - neg_w_grad ) / num_cases - w_decay * w ) ;
w_update = product( momentum_w , w_update ) + product( epsilon_w , quotient( ( w_grad - neg_w_grad ) , num_cases_matrices_w_and_B ) - product( w_decay , w ) ) ;
// Octave code ---> bi_update = momentum * bi_update + ( epsilon_bi / num_cases ) * ( bi_grad - neg_bi_grad ) ;
bi_update = product( momentum_bi , bi_update ) + product( quotient( epsilon_bi , num_cases_matrix_bi ) , ( bi_grad - neg_bi_grad ) ) ;
// Octave code ---> bj_update = momentum * bj_update + ( epsilon_bj / num_cases ) * ( bj_grad - neg_bj_grad ) ;
bj_update = product( momentum_bj , bj_update ) + product( quotient( epsilon_bj , num_cases_matrix_bj ) , ( bj_grad - neg_bj_grad ) ) ;
// The following two Octave code loops are combined into the single .oct loop that follows them
//
// Octave code ---> for hh = 1 : nt
// A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
// B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
// end
// Octave code ---> for hh = 1 : nt
// A( : , : , hh ) = A( : , : , hh ) + A_update( : , : , hh ) ;
// B( : , : , hh ) = B( : , : , hh ) + B_update( : , : , hh ) ;
// end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;
A_update_extract = A_update.page ( hh ) ;
A_grad_extract = A_grad.page ( hh ) ;
neg_A_grad_extract = neg_A_grad.page ( hh ) ;
A_extract = A.page ( hh ) ;
A_update.insert( product( momentum_A , A_update_extract ) + product( epsilon_A , ( quotient( ( A_grad_extract - neg_A_grad_extract ) , num_cases_matrix_A ) - product( w_decay_A , A_extract ) ) ) , p ) ;
A_update_extract = A_update.page ( hh ) ;
A.insert( A_extract + A_update_extract , p ) ;
B_update_extract = B_update.page ( hh ) ;
B_grad_extract = B_grad.page ( hh ) ;
neg_B_grad_extract = neg_B_grad.page ( hh ) ;
B_extract = B.page ( hh ) ;
B_update.insert( product( momentum_B , B_update_extract ) + product( epsilon_B , ( quotient( ( B_grad_extract - neg_B_grad_extract ) , num_cases_matrices_w_and_B ) - product( w_decay_B , B_extract ) ) ) , p ) ;
B_update_extract = B_update.page ( hh ) ;
B.insert( B_extract + B_update_extract , p ) ;
} // end of hh loop
// Octave code ---> w = w + w_update ;
// bi = bi + bi_update ;
// bj = bj + bj_update ;
w += w_update ;
bi += bi_update ;
bj += bj_update ;
// %%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
} // end of batch loop
} // end of main epoch loop
// return the values w , bj , bi , A , B
retval_list(4) = B ;
retval_list(3) = A ;
retval_list(2) = bi ;
retval_list(1) = bj ;
retval_list(0) = w ;
return retval_list ;
} // end of function
"Trading is statistics and time series analysis." This blog details my progress in developing a systematic trading system for use on the futures and forex markets, with discussion of the various indicators and other inputs used in the creation of the system. Also discussed are some of the issues/problems encountered during this development process. Within the blog posts there are links to other web pages that are/have been useful to me.
Monday, 20 April 2015
Optimised CRBM Code for Binary Units
Following on from my previous post, in the code box below there is the C++ .oct code for training the binary units of the CRBM and, as with the code for training the Gaussian units, the code is liberally commented. There is only a slight difference between the two code files.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment