## Thursday 16 April 2015

### Optimised CRBM Code for Gaussian Units

Over the last few weeks I have been working on optimising the conditional restricted boltzmann machine code, with a view to speeding it up via a C++ .oct file, and in the code box below is this .oct code for the gaussian_crbm.m code in my previous post. This gaussian_crbm.m function, plus the binary_crbm.m one, are the speed bottlenecks whilst training the crbm. In the code below I have made some adjustments for code simplification purposes, the most important of which are:
• the minibatch index is a matrix rather than a cell type index, with the individual batch indexes in the columns, which means that the batch sizes are all equal in length.
• as a result all data in cell arrays in the .m function are in NDArrays in the .oct function
• there is no input for the variable "gsd" because I have assumed a value of 1 applies. This means the input data must be normalised prior to the function call.
``````DEFUN_DLD ( cc_gaussian_crbm_mersenne , args , nargout ,
"-*- texinfo -*-\n\
@deftypefn {Function File} {} cc_gaussian_crbm_mersenne (@var{ batchdata , minibatch , nt , num_epochs , num_hid })\n\n\
This function trains a real valued, gaussian crbm where the gsd is assumed to be 1, so the batchdata input must be z-score normalised.\n\
The value nt is the order of the model, i.e. how many previous values should be included, num_epochs is the number of training epochs,\n\
and num_hid is the number of nodes in the hidden layer.\n\
@end deftypefn" )

{
octave_value_list retval_list ;
int nargin = args.length () ;

// check the input arguments
if ( nargin != 5 )
{
error ( "Input error: there are insufficient input arguments. Type help for more details." ) ;
return retval_list ;
}

if ( !args(0).is_real_matrix () ) // check batchdata
{
error ( "Input error: the 1st argument, batchdata, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}

if ( !args(1).is_real_matrix () ) // check minibatch
{
error ( "Input error: the 2nd argument, minibatch, is not a matrix type. Type help for more details." ) ;
return retval_list ;
}

if ( args(2).length () != 1 ) // check nt
{
error ( "Input error: nt should be an interger value for the 'order' of the model. Type help for more details." ) ;
return retval_list ;
}

if ( args(3).length () != 1 ) // num_epochs
{
error ( "Input error: num_epochs should be an integer value for the number of training epochs. Type help for more details." ) ;
return retval_list ;
}

if ( args(4).length () != 1 ) // check num_hid
{
error ( "Input error: num_hid should be an integer for the number of nodes in hidden layer. Type help for more details." ) ;
return retval_list ;
}

if ( error_state )
{
error ( "Input error: type help for details." ) ;
return retval_list ;
}
// end of input checking

// inputs
Matrix batchdata = args(0).matrix_value () ;
Matrix minibatch = args(1).matrix_value () ;
int nt = args(2).int_value () ; // the "order" of the model
int num_epochs = args(3).int_value () ;
int num_hid = args(4).int_value () ;

// variables
// batchdata is a big matrix of all the data and we index it with "minibatch", a matrix of mini-batch indices in the columns
int num_cases = minibatch.rows () ;                                                     // Octave code ---> num_cases = length( minibatch{ batch } ) ;
int num_dims = batchdata.cols () ;                                                      // visible dimension
Matrix bi_star ( num_dims , num_cases ) ; bi_star.fill( 0.0 ) ;                         // Octave code ---> bi_star = zeros( num_dims , num_cases ) ;
Matrix bj_star ( num_hid , num_cases ) ; bj_star.fill( 0.0 ) ;                          // Octave code ---> bj_star = zeros( num_hid , num_cases ) ;
Matrix repmat_bj ( num_hid , num_cases ) ; repmat_bj.fill( 0.0 ) ;                      // for Octave code ---> repmat( bj , 1 , num_cases )
Matrix repmat_bi ( num_dims , num_cases ) ; repmat_bi.fill( 0.0 ) ;                     // for Octave code ---> repmat( bi , 1 , num_cases )
Matrix eta ( num_hid , num_cases ) ; eta.fill( 0.0 ) ;
Matrix h_posteriors ( num_hid , num_cases ) ; h_posteriors.fill( 0.0 ) ;                // for the logistic function
Matrix ones ( num_hid , num_cases ) ; ones.fill( 1.0 ) ;                                // for the logistic function
Matrix hid_states ( num_cases , num_hid ) ; hid_states.fill( 0.0 ) ;                    // for hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
Matrix w_grad ( num_hid , num_dims ) ; w_grad.fill( 0.0 ) ;                             // for w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
Matrix bi_grad ( num_dims , 1 ) ; bi_grad.fill( 0.0 ) ;                                 // for bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix bj_grad ( num_hid , 1 ) ; bj_grad.fill( 0.0 ) ;                                  // for bj_grad = sum( hid_states , 1 )' ;
Matrix topdown ( num_cases , num_dims ) ; topdown.fill( 0.0 ) ;                         // for topdown = gsd .* ( hid_states * w ) ;
Matrix negdata ( num_cases , num_dims ) ; negdata.fill( 0.0 ) ;
Matrix negdata_transpose ( num_dims , num_cases ) ; negdata_transpose.fill( 0.0 ) ;
Matrix bi_transpose ( 1 , num_dims ) ; bi_transpose.fill( 0.0 ) ;
Matrix repmat_bi_transpose ( num_cases , num_dims ) ; repmat_bi_transpose.fill( 0.0 ) ;
Matrix neg_w_grad ( num_hid , num_dims ) ; neg_w_grad.fill( 0.0 ) ;
Matrix neg_bi_grad ( num_dims , 1 ) ; neg_bi_grad.fill( 0.0 ) ;                         // for neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix neg_bj_grad ( num_hid , 1 ) ; neg_bj_grad.fill( 0.0 ) ;                          // for neg_bj_grad = sum( h_posteriors , 2 ) ;

// Setting learning rates and create some utility matrices
Matrix epsilon_w ( num_hid , num_dims ) ; epsilon_w.fill( 0.001 ) ;   // undirected
Matrix epsilon_bi ( num_dims , 1 ) ; epsilon_bi.fill( 0.001 ) ;       // visibles
Matrix epsilon_bj ( num_hid , 1 ) ; epsilon_bj.fill( 0.001 ) ;        // hidden units
Matrix epsilon_A ( num_dims , num_dims ) ; epsilon_A.fill( 0.001 ) ;  // autoregressive
Matrix epsilon_B ( num_hid , num_dims ) ; epsilon_B.fill( 0.001 ) ;   // prev visibles to hidden
Matrix w_decay ( num_hid , num_dims ) ; w_decay.fill( 0.0002 ) ;      // currently we use the same weight decay for w, A, B
Matrix w_decay_A ( num_dims , num_dims ) ; w_decay_A.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_B ( num_hid , num_dims ) ; w_decay_B.fill( 0.0002 ) ;  // currently we use the same weight decay for w, A, B

Matrix momentum_w ( num_hid , num_dims ) ; momentum_w.fill( 0.0 ) ;   // momentum used only after 5 epochs of training, when it will be set to 0.9
Matrix num_cases_matrices_w_and_B ( num_hid , num_dims ) ; num_cases_matrices_w_and_B.fill( num_cases ) ;

Matrix momentum_bi ( num_dims , 1 ) ; momentum_bi.fill( 0.0 ) ;
Matrix num_cases_matrix_bi ( num_dims , 1 ) ; num_cases_matrix_bi.fill( num_cases ) ;

Matrix momentum_bj ( num_hid , 1 ) ; momentum_bj.fill( 0.0 ) ;
Matrix num_cases_matrix_bj ( num_hid , 1 ) ; num_cases_matrix_bj.fill( num_cases ) ;

Matrix momentum_A ( num_dims , num_dims ) ; momentum_A.fill( 0.0 ) ;
Matrix num_cases_matrix_A ( num_dims , num_dims ) ; num_cases_matrix_A.fill( num_cases ) ;

Matrix momentum_B ( num_hid , num_dims ) ; momentum_B.fill( 0.0 ) ;

// initialization of output matrices
Matrix w ( num_hid , num_dims ) ; w.fill( 0.0 ) ; // Octave code ---> w = 0.01 * randn( num_hid , num_dims ) ;
Matrix bi ( num_dims , 1 ) ; bi.fill( 0.0 ) ;     // Octave code ---> bi = 0.01 * randn( num_dims , 1 ) ;
Matrix bj( num_hid , 1 ) ; bj.fill( 0.0 ) ;       // Octave code ---> bj = -1 + 0.01 * randn( num_hid , 1 ) ; // set to favour units being "off"

// The autoregressive weights; A( : , : , j ) is the weight from t-j to the visible
NDArray A ( dim_vector( num_dims , num_dims , nt ) ) ; A.fill( 0.0 ) ; // Octave code ---> A = 0.01 * randn( num_dims ,num_dims , nt ) ;

// The weights from previous time-steps to the hiddens; B( : , : , j ) is the weight from t-j to the hidden layer
NDArray B ( dim_vector( num_hid , num_dims , nt ) ) ; B.fill( 0.0 ) ;  // Octave code ---> B = 0.01 * randn( num_hid , num_dims , nt ) ;

// Declare MersenneTwister random values
MTRand mtrand1 ;
double rand_norm_value ;
double rand_uniform_value ;

// nested loops to fill w, bi, bj, A and B with initial random values
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bj ( ii_nest_loop , 0 ) = -1.0 + rand_norm_value ; // set to favour units being "off"

for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
w ( ii_nest_loop , jj_nest_loop ) = rand_norm_value ;
}
}

for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
bi ( ii_nest_loop , 0 ) = rand_norm_value ;
}

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
A ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
}  // end of jj_nest_loop loop
}       // end of ii_nest_loop loop
}             // end of hh loop

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
{
rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
B ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
}  // end of jj_nest_loop loop
}       // end of ii_nest_loop loop
}             // end of hh loop

// keep previous updates around for momentum
Matrix w_update ( num_hid , num_dims ) ; w_update.fill( 0.0 ) ; // Octave code ---> w_update = zeros( size( w ) ) ;
Matrix bi_update ( num_dims , 1 ) ; bi_update.fill( 0.0 ) ;     // Octave code ---> bi_update = zeros( size( bi ) ) ;
Matrix bj_update ( num_hid , 1 ) ; bj_update.fill( 0.0 ) ;      // Octave code ---> bj_update = zeros( size( bj ) ) ;

NDArray A_update ( dim_vector( num_dims , num_dims , nt ) ) ; A_update.fill( 0.0 ) ; // Octave code ---> A_update = zeros( size( A ) ) ;
Matrix A_extract ( num_dims , num_dims ) ; A_extract.fill( 0.0 ) ;                   // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix A_update_extract ( num_dims , num_dims ) ; A_update_extract.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray B_update ( dim_vector( num_hid , num_dims , nt ) ) ; B_update.fill( 0.0 ) ;  // Octave code ---> B_update = zeros( size( B ) ) ;
Matrix B_extract ( num_hid , num_dims ) ; B_extract.fill( 0.0 ) ;                    // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix B_update_extract ( num_hid , num_dims ) ; B_update_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArray

// data is a nt+1 dimensional array with current and delayed data corresponding to mini-batches
// num_cases = minibatch( batch ).length () ;
num_cases = minibatch.rows () ;
NDArray data ( dim_vector( num_cases , num_dims , nt + 1 ) ) ; data.fill( 0.0 ) ; // Octave code ---> data = zeros( num_cases , num_dims , nt + 1 ) ;
Matrix data_extract ( num_cases , num_dims ) ; data_extract.fill( 0.0 ) ;         // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_transpose ( num_dims , num_cases ) ; data_transpose.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0 ( num_cases , num_dims ) ; data_0.fill( 0.0 ) ;                     // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0_transpose ( num_dims , num_cases ) ; data_0_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; A_grad.fill( 0.0 ) ;  // for A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
Matrix A_grad_extract ( num_dims , num_dims ) ; A_grad_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray neg_A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; neg_A_grad.fill( 0.0 ) ; // for neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
Matrix neg_A_grad_extract ( num_dims , num_dims ) ; neg_A_grad_extract.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; B_grad.fill( 0.0 ) ;          // for B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
Matrix B_grad_extract ( num_hid , num_dims ) ; B_grad_extract.fill( 0.0 ) ;              // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray neg_B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; neg_B_grad.fill( 0.0 ) ;  // for neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
Matrix neg_B_grad_extract ( num_hid , num_dims ) ; neg_B_grad_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

Array  p ( dim_vector ( nt , 1 ) , 0 ) ;                                // vector for writing to A_grad and B_grad

// end of initialization of matrices

// %%%%%%%%% THE MAIN CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

for ( octave_idx_type epoch ( 0 ) ; epoch < num_epochs ; epoch++ ) // main epoch loop
{
//     // errsum = 0 ; % keep a running total of the difference between data and recon
//
for ( octave_idx_type batch ( 0 ) ; batch < minibatch.cols () ; batch++ ) // Octave code ---> int num_batches = minibatch.length () ;
{
// %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

// // These next nested loops fill the data NDArray with the values from batchdata indexed
// // by the column values of minibatch. Octave code equivalent given below:-
// // Octave code ---> mb = minibatch{ batch } ; % caches the indices
// // Octave code ---> data( : , : , 1 ) = batchdata( mb , : ) ;
// // Octave code ---> for hh = 1 : nt
// // Octave code ---> data( : , : , hh + 1 ) = batchdata( mb - hh , : ) ;
// // Octave code ---> end
for ( octave_idx_type hh ( 0 ) ; hh < nt + 1 ; hh++ )
{
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
{
data( ii_nest_loop , jj_nest_loop , hh ) = batchdata( minibatch( ii_nest_loop , batch ) - 1 - hh , jj_nest_loop ) ; // -1 for .oct zero based indexing vs Octave's 1 based
} // end of jj_nest_loop loop
}       // end of ii_nest_loop loop
}             // end of hh loop

// The above data filling loop could perhaps be implemented more quickly using the fortrans vec method as below
// http://stackoverflow.com.80bola.com/questions/28900153/create-a-ndarray-in-an-oct-file-from-a-double-pointer
// NDArray a (dim_vector(dim[0], dim[1], dim[2]));
//
// Then loop over (i, j, k) indices to copy the cube to the octave array
//
// double* a_vec = a.fortran_vec ();
// for (int i = 0; i < dim[0]; i++) {
//     for (int j = 0; j < dim[1]; j++) {
//         for (int k = 0; k < dim[2]; k++) {
//             *a_vec++ = armadillo_cube(i, j, k);
//         }
//     }
// }

// calculate contributions from directed autoregressive connections and contributions from directed visible-to-hidden connections
bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; ( matrix declared earlier in code above )
bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; ( matrix declared earlier in code above )

// The code below calculates two separate Octave code loops in one nested C++ loop structure, namely
// Octave code ---> for hh = 1 : nt
//                      bi_star = bi_star +  A( : , : , hh ) * data( : , : , hh + 1 )' ;
//                  end
// and
// Octave code ---> for hh = 1:nt
//                      bj_star = bj_star + B( : , : , hh ) * data( : , : , hh + 1 )' ;
//                  end

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
// fill the intermediate calculation matrices
A_extract = A.page ( hh ) ;
B_extract = B.page ( hh ) ;
data_transpose = ( data.page ( hh + 1 ) ).transpose () ;

// add up the hh different matrix multiplications
bi_star += A_extract * data_transpose ;
bj_star += B_extract * data_transpose ;

} // end of hh loop

// extract and pre-calculate to save time in later computations
data_0 = data.page ( 0 ) ;
data_0_transpose = data_0.transpose () ;

// Calculate "posterior" probability -- hidden state being on ( Note that it isn't a true posterior )
//          Octave code ---> eta = w * ( data( : , : , 1 ) ./ gsd )' + ... % bottom-up connections
//                                 repmat( bj , 1 , num_cases ) + ...      % static biases on unit
//                                 bj_star ;                               % dynamic biases

//          get repmat( bj , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bj.insert( bj , 0 , jj_nest_loop ) ;
}

eta = w * data_0_transpose + repmat_bj + bj_star ;

// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ;    % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}

// element division  A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;

// Activate the hidden units
//          Octave code ---> hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < hid_states.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < hid_states.cols () ; jj_nest_loop++ )
{
rand_uniform_value = mtrand1.randDblExc() ; // a real number in the range 0 to 1, excluding both 0 and 1
hid_states( ii_nest_loop , jj_nest_loop ) = h_posteriors( jj_nest_loop , ii_nest_loop ) > rand_uniform_value ? 1.0 : 0.0 ;
}
} // end of hid_states loop

// Calculate positive gradients ( note w.r.t. neg energy )
// Octave code ---> w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
//                  bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
//                  bj_grad = sum( hid_states , 1 )' ;
w_grad = hid_states.transpose () * data_0 ;

//          get repmat( bi , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
{
repmat_bi.insert( bi , 0 , jj_nest_loop ) ;
}
bi_grad = ( data_0_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
bj_grad = ( hid_states.sum ( 0 ) ).transpose () ;

// Octave code ---> for hh = 1 : nt
//                  A_grad( : , : , hh ) = ( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
//                  B_grad( : , : , hh ) = hid_states' * data( : , : , hh + 1 ) ;
//                  end
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;                                                                    // set Array  p to write to page hh of A_grad and B_grad NDArrays
data_extract = data.page ( hh + 1 ) ;                                            // get the equivalent of data( : , : , hh + 1 )
A_grad.insert( ( data_0_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
B_grad.insert( hid_states.transpose () * data_extract , p ) ;
}
// the above code comes from http://stackoverflow.com/questions/29572075/how-do-you-create-an-arrayoctave-idx-type-in-an-oct-file
// e.g.
//
// Array p (dim_vector (3, 1));
// int n = 2;
// dim_vector dim(n, n, 3);
// NDArray a_matrix(dim);
//
// for (octave_idx_type i = 0; i < n; i++)
//   for (octave_idx_type j = 0; j < n; j++)
//     a_matrix(i,j, 1) = (i + 1) * 10 + (j + 1);
//
// std::cout << a_matrix;
//
// Matrix b_matrix = Matrix (n, n);
// b_matrix(0, 0) = 1;
// b_matrix(0, 1) = 2;
// b_matrix(1, 0) = 3;
// b_matrix(1, 1) = 4;
// std::cout << b_matrix;
//
// Array p (dim_vector (3, 1), 0);
// p(2) = 2;
// a_matrix.insert (b_matrix, p);
//
// std::cout << a_matrix;

// %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

// Activate the visible units
// Find the mean of the Gaussian
// Octave code ---> topdown = gsd .* ( hid_states * w ) ;
topdown = hid_states * w ;

// This is the mean of the Gaussian. Instead of properly sampling, negdata is just the mean
// If we want to sample from the Gaussian, we would add in gsd .* randn( num_cases , num_dims ) ;
// Octave code ---> negdata = topdown + ...                       % top down connections
//                            repmat( bi' , num_cases , 1 ) + ... % static biases
//                            bi_star' ;                          % dynamic biases

// get repmat( bi' , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
bi_transpose = bi.transpose () ;
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ ) // loop over the rows
{
repmat_bi_transpose.insert( bi_transpose , ii_nest_loop , 0 ) ;
}
negdata = topdown + repmat_bi_transpose + bi_star.transpose () ;

// Now conditional on negdata, calculate "posterior" probability for hiddens
// Octave code ---> eta = w * ( negdata ./ gsd )' + ...      % bottom-up connections
//                        repmat( bj , 1 , num_cases ) + ... % static biases on unit (no change)
//                        bj_star ;                          % dynamic biases (no change)

negdata_transpose = negdata.transpose () ;             // to save repetition of transpose
eta = w * negdata_transpose + repmat_bj + bj_star ;

// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ;    % logistic
// -exponate the eta matrix
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
{
for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
{
eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
}
}

// element division  A./B == quotient(A,B)
h_posteriors = quotient( ones , ( ones + eta ) ) ;

// Octave code ---> neg_w_grad = h_posteriors * ( negdata ./ gsd ) ; % not using activations
neg_w_grad = h_posteriors * negdata ; // not using activations

// Octave code ---> neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
neg_bi_grad = ( negdata_transpose - repmat_bi - bi_star ).sum ( 1 ) ;

// Octave code ---> neg_bj_grad = sum( h_posteriors , 2 ) ;
neg_bj_grad = h_posteriors.sum ( 1 ) ;

// Octave code ---> for hh = 1 : nt
//                      neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
//                      neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
//                  end

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;                                                                            // set Array  p to write to page hh of A_grad and B_grad NDArrays

data_extract = data.page ( hh + 1 ) ;                                                    // get the equivalent of data( : , : , hh + 1 )
neg_A_grad.insert( ( negdata_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
neg_B_grad.insert( h_posteriors * data_extract , p ) ;

} // end of hh loop

// %%%%%%%%% END NEGATIVE PHASE  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

// Octave code ---> err = sum( sum( ( data( : , : , 1 ) - negdata ) .^2 ) ) ;
// Not used         errsum = err + errsum ;

// Octave code ---> if ( epoch > 5 ) % use momentum
//                  momentum = mom ;
//                  else % no momentum
//                  momentum = 0 ;
//                  end

// momentum was initialised to 0.0, but on the 6th iteration of epoch, set momentum to 0.9
if ( epoch == 5 ) // will only be true once, after which momentum will == 0.9
{
momentum_w.fill( 0.9 ) ;
momentum_bi.fill( 0.9 ) ;
momentum_bj.fill( 0.9 ) ;
momentum_A.fill( 0.9 ) ;
momentum_B.fill( 0.9 ) ;
}

// %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

// Octave code ---> w_update = momentum * w_update + epsilon_w * ( ( w_grad - neg_w_grad ) / num_cases - w_decay * w ) ;
w_update = product( momentum_w , w_update ) + product( epsilon_w , quotient( ( w_grad - neg_w_grad ) , num_cases_matrices_w_and_B ) - product( w_decay , w ) ) ;

// Octave code ---> bi_update = momentum * bi_update + ( epsilon_bi / num_cases ) * ( bi_grad - neg_bi_grad ) ;
bi_update = product( momentum_bi , bi_update ) + product( quotient( epsilon_bi , num_cases_matrix_bi ) , ( bi_grad - neg_bi_grad ) ) ;

// Octave code ---> bj_update = momentum * bj_update + ( epsilon_bj / num_cases ) * ( bj_grad - neg_bj_grad ) ;
bj_update = product( momentum_bj , bj_update ) + product( quotient( epsilon_bj , num_cases_matrix_bj ) , ( bj_grad - neg_bj_grad ) ) ;

// The following two Octave code loops are combined into the single .oct loop that follows them
//
// Octave code ---> for hh = 1 : nt
//                      A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
//                      B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
//                  end

// Octave code ---> for hh = 1 : nt
//                      A( : , : , hh ) = A( : , : , hh ) + A_update( : , : , hh ) ;
//                      B( : , : , hh ) = B( : , : , hh ) + B_update( : , : , hh ) ;
//                  end

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
{
p( 2 ) = hh ;

A_update_extract = A_update.page ( hh ) ;
A_extract = A.page ( hh ) ;
A_update.insert( product( momentum_A , A_update_extract ) + product( epsilon_A , ( quotient( ( A_grad_extract - neg_A_grad_extract ) , num_cases_matrix_A ) - product( w_decay_A , A_extract ) ) ) , p ) ;
A_update_extract = A_update.page ( hh ) ;
A.insert( A_extract + A_update_extract , p ) ;

B_update_extract = B_update.page ( hh ) ;
B_extract = B.page ( hh ) ;
B_update.insert( product( momentum_B , B_update_extract ) + product( epsilon_B , ( quotient( ( B_grad_extract - neg_B_grad_extract ) , num_cases_matrices_w_and_B ) - product( w_decay_B , B_extract ) ) ) , p ) ;
B_update_extract = B_update.page ( hh ) ;
B.insert( B_extract + B_update_extract , p ) ;

} // end of hh loop

// Octave code ---> w = w + w_update ;
//                  bi = bi + bi_update ;
//                  bj = bj + bj_update ;

w += w_update ;
bi += bi_update ;
bj += bj_update ;

// %%%%%%%%%%%%%%%% END OF UPDATES  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

} // end of batch loop

} // end of main epoch loop

// return the values w , bj , bi , A , B
retval_list(4) = B ;
retval_list(3) = A ;
retval_list(2) = bi ;
retval_list(1) = bj ;
retval_list(0) = w ;

return retval_list ;

} // end of function``````
The code is heavilly commented throughout.The .oct code for the binary_crbm.m function will follow in due course.