Dekalog Blog: Optimised CRBM Code for Gaussian Units

Over the last few weeks I have been working on optimising the conditional restricted boltzmann machine code, with a view to speeding it up via a C++ .oct file, and in the code box below is this .oct code for the gaussian_crbm.m code in my previous post. This gaussian_crbm.m function, plus the binary_crbm.m one, are the speed bottlenecks whilst training the crbm. In the code below I have made some adjustments for code simplification purposes, the most important of which are:
the minibatch index is a matrix rather than a cell type index, with the individual batch indexes in the columns, which means that the batch sizes are all equal in length.
as a result all data in cell arrays in the .m function are in NDArrays in the .oct function
there is no input for the variable "gsd" because I have assumed a value of 1 applies. This means the input data must be normalised prior to the function call.
DEFUN_DLD ( cc_gaussian_crbm_mersenne , args , nargout ,
"-*- texinfo -*-\n\
@deftypefn {Function File} {} cc_gaussian_crbm_mersenne (@var{ batchdata , minibatch , nt , num_epochs , num_hid })\n\n\
This function trains a real valued, gaussian crbm where the gsd is assumed to be 1, so the batchdata input must be z-score normalised.\n\
The value nt is the order of the model, i.e. how many previous values should be included, num_epochs is the number of training epochs,\n\
and num_hid is the number of nodes in the hidden layer.\n\
@end deftypefn" )

{
octave_value_list retval_list ;
int nargin = args.length () ;

// check the input arguments
if ( nargin != 5 )
   {
   error ( "Input error: there are insufficient input arguments. Type help for more details." ) ;
   return retval_list ;
   }

if ( !args(0).is_real_matrix () ) // check batchdata
   {
   error ( "Input error: the 1st argument, batchdata, is not a matrix type. Type help for more details." ) ;
   return retval_list ;
   }

if ( !args(1).is_real_matrix () ) // check minibatch
   {
   error ( "Input error: the 2nd argument, minibatch, is not a matrix type. Type help for more details." ) ;
   return retval_list ;
   }   
   
if ( args(2).length () != 1 ) // check nt
   {
   error ( "Input error: nt should be an interger value for the 'order' of the model. Type help for more details." ) ;
   return retval_list ;
   }
   
if ( args(3).length () != 1 ) // num_epochs
   {
   error ( "Input error: num_epochs should be an integer value for the number of training epochs. Type help for more details." ) ;
   return retval_list ;
   }
   
if ( args(4).length () != 1 ) // check num_hid
   {
   error ( "Input error: num_hid should be an integer for the number of nodes in hidden layer. Type help for more details." ) ;
   return retval_list ;
   }
     
if ( error_state )
   {
   error ( "Input error: type help for details." ) ;
   return retval_list ;
   }
// end of input checking
  
// inputs
Matrix batchdata = args(0).matrix_value () ;
Matrix minibatch = args(1).matrix_value () ;
int nt = args(2).int_value () ; // the "order" of the model
int num_epochs = args(3).int_value () ;
int num_hid = args(4).int_value () ;

// variables
// batchdata is a big matrix of all the data and we index it with "minibatch", a matrix of mini-batch indices in the columns                       
int num_cases = minibatch.rows () ;                                                     // Octave code ---> num_cases = length( minibatch{ batch } ) ;
int num_dims = batchdata.cols () ;                                                      // visible dimension
Matrix bi_star ( num_dims , num_cases ) ; bi_star.fill( 0.0 ) ;                         // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; 
Matrix bj_star ( num_hid , num_cases ) ; bj_star.fill( 0.0 ) ;                          // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; 
Matrix repmat_bj ( num_hid , num_cases ) ; repmat_bj.fill( 0.0 ) ;                      // for Octave code ---> repmat( bj , 1 , num_cases )
Matrix repmat_bi ( num_dims , num_cases ) ; repmat_bi.fill( 0.0 ) ;                     // for Octave code ---> repmat( bi , 1 , num_cases )
Matrix eta ( num_hid , num_cases ) ; eta.fill( 0.0 ) ;
Matrix h_posteriors ( num_hid , num_cases ) ; h_posteriors.fill( 0.0 ) ;                // for the logistic function
Matrix ones ( num_hid , num_cases ) ; ones.fill( 1.0 ) ;                                // for the logistic function
Matrix hid_states ( num_cases , num_hid ) ; hid_states.fill( 0.0 ) ;                    // for hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
Matrix w_grad ( num_hid , num_dims ) ; w_grad.fill( 0.0 ) ;                             // for w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
Matrix bi_grad ( num_dims , 1 ) ; bi_grad.fill( 0.0 ) ;                                 // for bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix bj_grad ( num_hid , 1 ) ; bj_grad.fill( 0.0 ) ;                                  // for bj_grad = sum( hid_states , 1 )' ;
Matrix topdown ( num_cases , num_dims ) ; topdown.fill( 0.0 ) ;                         // for topdown = gsd .* ( hid_states * w ) ;
Matrix negdata ( num_cases , num_dims ) ; negdata.fill( 0.0 ) ;
Matrix negdata_transpose ( num_dims , num_cases ) ; negdata_transpose.fill( 0.0 ) ;
Matrix bi_transpose ( 1 , num_dims ) ; bi_transpose.fill( 0.0 ) ;
Matrix repmat_bi_transpose ( num_cases , num_dims ) ; repmat_bi_transpose.fill( 0.0 ) ; 
Matrix neg_w_grad ( num_hid , num_dims ) ; neg_w_grad.fill( 0.0 ) ;
Matrix neg_bi_grad ( num_dims , 1 ) ; neg_bi_grad.fill( 0.0 ) ;                         // for neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
Matrix neg_bj_grad ( num_hid , 1 ) ; neg_bj_grad.fill( 0.0 ) ;                          // for neg_bj_grad = sum( h_posteriors , 2 ) ;
  
// Setting learning rates and create some utility matrices
Matrix epsilon_w ( num_hid , num_dims ) ; epsilon_w.fill( 0.001 ) ;   // undirected
Matrix epsilon_bi ( num_dims , 1 ) ; epsilon_bi.fill( 0.001 ) ;       // visibles
Matrix epsilon_bj ( num_hid , 1 ) ; epsilon_bj.fill( 0.001 ) ;        // hidden units
Matrix epsilon_A ( num_dims , num_dims ) ; epsilon_A.fill( 0.001 ) ;  // autoregressive
Matrix epsilon_B ( num_hid , num_dims ) ; epsilon_B.fill( 0.001 ) ;   // prev visibles to hidden
Matrix w_decay ( num_hid , num_dims ) ; w_decay.fill( 0.0002 ) ;      // currently we use the same weight decay for w, A, B
Matrix w_decay_A ( num_dims , num_dims ) ; w_decay_A.fill( 0.0002 ) ; // currently we use the same weight decay for w, A, B
Matrix w_decay_B ( num_hid , num_dims ) ; w_decay_B.fill( 0.0002 ) ;  // currently we use the same weight decay for w, A, B

Matrix momentum_w ( num_hid , num_dims ) ; momentum_w.fill( 0.0 ) ;   // momentum used only after 5 epochs of training, when it will be set to 0.9
Matrix num_cases_matrices_w_and_B ( num_hid , num_dims ) ; num_cases_matrices_w_and_B.fill( num_cases ) ;

Matrix momentum_bi ( num_dims , 1 ) ; momentum_bi.fill( 0.0 ) ;
Matrix num_cases_matrix_bi ( num_dims , 1 ) ; num_cases_matrix_bi.fill( num_cases ) ;

Matrix momentum_bj ( num_hid , 1 ) ; momentum_bj.fill( 0.0 ) ;
Matrix num_cases_matrix_bj ( num_hid , 1 ) ; num_cases_matrix_bj.fill( num_cases ) ;

Matrix momentum_A ( num_dims , num_dims ) ; momentum_A.fill( 0.0 ) ;
Matrix num_cases_matrix_A ( num_dims , num_dims ) ; num_cases_matrix_A.fill( num_cases ) ;

Matrix momentum_B ( num_hid , num_dims ) ; momentum_B.fill( 0.0 ) ;

// initialization of output matrices
Matrix w ( num_hid , num_dims ) ; w.fill( 0.0 ) ; // Octave code ---> w = 0.01 * randn( num_hid , num_dims ) ;
Matrix bi ( num_dims , 1 ) ; bi.fill( 0.0 ) ;     // Octave code ---> bi = 0.01 * randn( num_dims , 1 ) ;
Matrix bj( num_hid , 1 ) ; bj.fill( 0.0 ) ;       // Octave code ---> bj = -1 + 0.01 * randn( num_hid , 1 ) ; // set to favour units being "off"

// The autoregressive weights; A( : , : , j ) is the weight from t-j to the visible
NDArray A ( dim_vector( num_dims , num_dims , nt ) ) ; A.fill( 0.0 ) ; // Octave code ---> A = 0.01 * randn( num_dims ,num_dims , nt ) ;

// The weights from previous time-steps to the hiddens; B( : , : , j ) is the weight from t-j to the hidden layer
NDArray B ( dim_vector( num_hid , num_dims , nt ) ) ; B.fill( 0.0 ) ;  // Octave code ---> B = 0.01 * randn( num_hid , num_dims , nt ) ;

// Declare MersenneTwister random values
MTRand mtrand1 ;
double rand_norm_value ;
double rand_uniform_value ;

// nested loops to fill w, bi, bj, A and B with initial random values
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
    { 
    rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
    bj ( ii_nest_loop , 0 ) = -1.0 + rand_norm_value ; // set to favour units being "off"

     for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims ; jj_nest_loop++ )
         {
         rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
         w ( ii_nest_loop , jj_nest_loop ) = rand_norm_value ;
         }
    }
    
for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
    {
    rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
    bi ( ii_nest_loop , 0 ) = rand_norm_value ;
    }

for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
    {
      for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_dims ; ii_nest_loop++ )
          {
           for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
               {
               rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
               A ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
               }  // end of jj_nest_loop loop      
          }       // end of ii_nest_loop loop
    }             // end of hh loop
    
for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
    {
      for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_hid ; ii_nest_loop++ )
          {
           for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
               {
               rand_norm_value = mtrand1.randNorm( 0.0 , 0.01 ) ; // mean of zero and std of 0.01
               B ( ii_nest_loop , jj_nest_loop , hh ) = rand_norm_value ;
               }  // end of jj_nest_loop loop      
          }       // end of ii_nest_loop loop
    }             // end of hh loop        
    
// keep previous updates around for momentum
Matrix w_update ( num_hid , num_dims ) ; w_update.fill( 0.0 ) ; // Octave code ---> w_update = zeros( size( w ) ) ;
Matrix bi_update ( num_dims , 1 ) ; bi_update.fill( 0.0 ) ;     // Octave code ---> bi_update = zeros( size( bi ) ) ;
Matrix bj_update ( num_hid , 1 ) ; bj_update.fill( 0.0 ) ;      // Octave code ---> bj_update = zeros( size( bj ) ) ;

NDArray A_update ( dim_vector( num_dims , num_dims , nt ) ) ; A_update.fill( 0.0 ) ; // Octave code ---> A_update = zeros( size( A ) ) ;
Matrix A_extract ( num_dims , num_dims ) ; A_extract.fill( 0.0 ) ;                   // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix A_update_extract ( num_dims , num_dims ) ; A_update_extract.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray B_update ( dim_vector( num_hid , num_dims , nt ) ) ; B_update.fill( 0.0 ) ;  // Octave code ---> B_update = zeros( size( B ) ) ;
Matrix B_extract ( num_hid , num_dims ) ; B_extract.fill( 0.0 ) ;                    // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix B_update_extract ( num_hid , num_dims ) ; B_update_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArray

// data is a nt+1 dimensional array with current and delayed data corresponding to mini-batches
// num_cases = minibatch( batch ).length () ;
num_cases = minibatch.rows () ;
NDArray data ( dim_vector( num_cases , num_dims , nt + 1 ) ) ; data.fill( 0.0 ) ; // Octave code ---> data = zeros( num_cases , num_dims , nt + 1 ) ;
Matrix data_extract ( num_cases , num_dims ) ; data_extract.fill( 0.0 ) ;         // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_transpose ( num_dims , num_cases ) ; data_transpose.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0 ( num_cases , num_dims ) ; data_0.fill( 0.0 ) ;                     // matrix for intermediate calculations because cannot directly "slice" into a NDArray
Matrix data_0_transpose ( num_dims , num_cases ) ; data_0_transpose.fill( 0.0 ) ; // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; A_grad.fill( 0.0 ) ;  // for A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;                                    
Matrix A_grad_extract ( num_dims , num_dims ) ; A_grad_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray neg_A_grad ( dim_vector( num_dims , num_dims , nt ) ) ; neg_A_grad.fill( 0.0 ) ; // for neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
Matrix neg_A_grad_extract ( num_dims , num_dims ) ; neg_A_grad_extract.fill( 0.0 ) ;     // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; B_grad.fill( 0.0 ) ;          // for B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
Matrix B_grad_extract ( num_hid , num_dims ) ; B_grad_extract.fill( 0.0 ) ;              // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

NDArray neg_B_grad ( dim_vector( num_hid , num_dims , nt ) ) ; neg_B_grad.fill( 0.0 ) ;  // for neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;
Matrix neg_B_grad_extract ( num_hid , num_dims ) ; neg_B_grad_extract.fill( 0.0 ) ;      // matrix for intermediate calculations because cannot directly "slice" into a NDArrays

Array  p ( dim_vector ( nt , 1 ) , 0 ) ;                                // vector for writing to A_grad and B_grad

// end of initialization of matrices

// %%%%%%%%% THE MAIN CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

for ( octave_idx_type epoch ( 0 ) ; epoch < num_epochs ; epoch++ ) // main epoch loop
    { 
//     // errsum = 0 ; % keep a running total of the difference between data and recon
//       
 for ( octave_idx_type batch ( 0 ) ; batch < minibatch.cols () ; batch++ ) // Octave code ---> int num_batches = minibatch.length () ;
     {
// %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%      

// // These next nested loops fill the data NDArray with the values from batchdata indexed
// // by the column values of minibatch. Octave code equivalent given below:-
// // Octave code ---> mb = minibatch{ batch } ; % caches the indices   
// // Octave code ---> data( : , : , 1 ) = batchdata( mb , : ) ;
// // Octave code ---> for hh = 1 : nt
// // Octave code ---> data( : , : , hh + 1 ) = batchdata( mb - hh , : ) ;
// // Octave code ---> end
     for ( octave_idx_type hh ( 0 ) ; hh < nt + 1 ; hh++ )
         {
    for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ )
        {
   for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_dims  ; jj_nest_loop++ )
       {
       data( ii_nest_loop , jj_nest_loop , hh ) = batchdata( minibatch( ii_nest_loop , batch ) - 1 - hh , jj_nest_loop ) ; // -1 for .oct zero based indexing vs Octave's 1 based
       } // end of jj_nest_loop loop      
        }       // end of ii_nest_loop loop
         }             // end of hh loop

// The above data filling loop could perhaps be implemented more quickly using the fortrans vec method as below
// http://stackoverflow.com.80bola.com/questions/28900153/create-a-ndarray-in-an-oct-file-from-a-double-pointer
// NDArray a (dim_vector(dim[0], dim[1], dim[2]));
// 
// Then loop over (i, j, k) indices to copy the cube to the octave array
// 
// double* a_vec = a.fortran_vec ();
// for (int i = 0; i < dim[0]; i++) {
//     for (int j = 0; j < dim[1]; j++) {
//         for (int k = 0; k < dim[2]; k++) {
//             *a_vec++ = armadillo_cube(i, j, k);
//         }
//     }
// }

// calculate contributions from directed autoregressive connections and contributions from directed visible-to-hidden connections
            bi_star.fill( 0.0 ) ; // Octave code ---> bi_star = zeros( num_dims , num_cases ) ; ( matrix declared earlier in code above )
            bj_star.fill( 0.0 ) ; // Octave code ---> bj_star = zeros( num_hid , num_cases ) ; ( matrix declared earlier in code above )
            
            // The code below calculates two separate Octave code loops in one nested C++ loop structure, namely
            // Octave code ---> for hh = 1 : nt       
            //                      bi_star = bi_star +  A( : , : , hh ) * data( : , : , hh + 1 )' ;
            //                  end 
            // and
            // Octave code ---> for hh = 1:nt
            //                      bj_star = bj_star + B( : , : , hh ) * data( : , : , hh + 1 )' ;
            //                  end 
            
            for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
                {
                // fill the intermediate calculation matrices 
                A_extract = A.page ( hh ) ;
                B_extract = B.page ( hh ) ;
                data_transpose = ( data.page ( hh + 1 ) ).transpose () ;

                // add up the hh different matrix multiplications
                bi_star += A_extract * data_transpose ; 
                bj_star += B_extract * data_transpose ;
     
         } // end of hh loop
         
                // extract and pre-calculate to save time in later computations
                data_0 = data.page ( 0 ) ;
                data_0_transpose = data_0.transpose () ;
                  
// Calculate "posterior" probability -- hidden state being on ( Note that it isn't a true posterior )   
//          Octave code ---> eta = w * ( data( : , : , 1 ) ./ gsd )' + ... % bottom-up connections
//                                 repmat( bj , 1 , num_cases ) + ...      % static biases on unit
//                                 bj_star ;                               % dynamic biases

//          get repmat( bj , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
            for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
         {
         repmat_bj.insert( bj , 0 , jj_nest_loop ) ; 
         }

            eta = w * data_0_transpose + repmat_bj + bj_star ;       
 
            // h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ;    % logistic
            // -exponate the eta matrix
            for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
                { 
           for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
               {
               eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
               }
         }

            // element division  A./B == quotient(A,B) 
            h_posteriors = quotient( ones , ( ones + eta ) ) ;

// Activate the hidden units    
//          Octave code ---> hid_states = double( h_posteriors' > rand( num_cases , num_hid ) ) ;
     for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < hid_states.rows () ; ii_nest_loop++ )
         {
         for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < hid_states.cols () ; jj_nest_loop++ )
             {
      rand_uniform_value = mtrand1.randDblExc() ; // a real number in the range 0 to 1, excluding both 0 and 1
      hid_states( ii_nest_loop , jj_nest_loop ) = h_posteriors( jj_nest_loop , ii_nest_loop ) > rand_uniform_value ? 1.0 : 0.0 ;
             }
         } // end of hid_states loop

// Calculate positive gradients ( note w.r.t. neg energy )
// Octave code ---> w_grad = hid_states' * ( data( : , : , 1 ) ./ gsd ) ;
//                  bi_grad = sum( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
//                  bj_grad = sum( hid_states , 1 )' ;
            w_grad = hid_states.transpose () * data_0 ;

//          get repmat( bi , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
            for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < num_cases ; jj_nest_loop++ ) // loop over the columns
                {
                repmat_bi.insert( bi , 0 , jj_nest_loop ) ; 
                }  
            bi_grad = ( data_0_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
            bj_grad = ( hid_states.sum ( 0 ) ).transpose () ;
    
// Octave code ---> for hh = 1 : nt      
//                  A_grad( : , : , hh ) = ( data( : , : , 1 )' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
//                  B_grad( : , : , hh ) = hid_states' * data( : , : , hh + 1 ) ;      
//                  end
            for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
                {
                p( 2 ) = hh ;                                                                    // set Array  p to write to page hh of A_grad and B_grad NDArrays
                data_extract = data.page ( hh + 1 ) ;                                            // get the equivalent of data( : , : , hh + 1 )
                A_grad.insert( ( data_0_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
                B_grad.insert( hid_states.transpose () * data_extract , p ) ;
                }
// the above code comes from http://stackoverflow.com/questions/29572075/how-do-you-create-an-arrayoctave-idx-type-in-an-oct-file
// e.g.
//
// Array p (dim_vector (3, 1));
// int n = 2;
// dim_vector dim(n, n, 3);
// NDArray a_matrix(dim);
// 
// for (octave_idx_type i = 0; i < n; i++)
//   for (octave_idx_type j = 0; j < n; j++)
//     a_matrix(i,j, 1) = (i + 1) * 10 + (j + 1);
// 
// std::cout << a_matrix;
// 
// Matrix b_matrix = Matrix (n, n);
// b_matrix(0, 0) = 1; 
// b_matrix(0, 1) = 2; 
// b_matrix(1, 0) = 3; 
// b_matrix(1, 1) = 4; 
// std::cout << b_matrix;
// 
// Array p (dim_vector (3, 1), 0);
// p(2) = 2;
// a_matrix.insert (b_matrix, p);
// 
// std::cout << a_matrix;

// %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  
 
// Activate the visible units
// Find the mean of the Gaussian
// Octave code ---> topdown = gsd .* ( hid_states * w ) ;
   topdown = hid_states * w ;
   
// This is the mean of the Gaussian. Instead of properly sampling, negdata is just the mean
// If we want to sample from the Gaussian, we would add in gsd .* randn( num_cases , num_dims ) ;
// Octave code ---> negdata = topdown + ...                       % top down connections
//                            repmat( bi' , num_cases , 1 ) + ... % static biases
//                            bi_star' ;                          % dynamic biases
   
// get repmat( bi' , 1 , num_cases ) ( http://stackoverflow.com/questions/19273053/write-to-a-matrix-in-oct-file-without-looping?rq=1 )
   bi_transpose = bi.transpose () ;
   for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < num_cases ; ii_nest_loop++ ) // loop over the rows
       {
       repmat_bi_transpose.insert( bi_transpose , ii_nest_loop , 0 ) ; 
       } 
       negdata = topdown + repmat_bi_transpose + bi_star.transpose () ;

// Now conditional on negdata, calculate "posterior" probability for hiddens
// Octave code ---> eta = w * ( negdata ./ gsd )' + ...      % bottom-up connections
//                        repmat( bj , 1 , num_cases ) + ... % static biases on unit (no change)
//                        bj_star ;                          % dynamic biases (no change)

   negdata_transpose = negdata.transpose () ;             // to save repetition of transpose
   eta = w * negdata_transpose + repmat_bj + bj_star ;
   
// h_posteriors = 1 ./ ( 1 + exp( -eta ) ) ;    % logistic
// -exponate the eta matrix
   for ( octave_idx_type ii_nest_loop ( 0 ) ; ii_nest_loop < eta.rows () ; ii_nest_loop++ )
       { 
        for ( octave_idx_type jj_nest_loop ( 0 ) ; jj_nest_loop < eta.cols () ; jj_nest_loop++ )
            {
            eta ( ii_nest_loop , jj_nest_loop ) = exp( - eta ( ii_nest_loop , jj_nest_loop ) ) ;
            }
       }

// element division  A./B == quotient(A,B) 
   h_posteriors = quotient( ones , ( ones + eta ) ) ;

// Calculate negative gradients
// Octave code ---> neg_w_grad = h_posteriors * ( negdata ./ gsd ) ; % not using activations
   neg_w_grad = h_posteriors * negdata ; // not using activations
   
// Octave code ---> neg_bi_grad = sum( negdata' - repmat( bi , 1 , num_cases ) - bi_star , 2 ) ./ gsd^2 ;
   neg_bi_grad = ( negdata_transpose - repmat_bi - bi_star ).sum ( 1 ) ;
   
// Octave code ---> neg_bj_grad = sum( h_posteriors , 2 ) ;
   neg_bj_grad = h_posteriors.sum ( 1 ) ;
   
// Octave code ---> for hh = 1 : nt
//                      neg_A_grad( : , : , hh ) = ( negdata' - repmat( bi , 1 , num_cases ) - bi_star ) ./ gsd^2 * data( : , : , hh + 1 ) ;
//                      neg_B_grad( : , : , hh ) = h_posteriors * data( : , : , hh + 1 ) ;        
//                  end
   
   for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
       {
       p( 2 ) = hh ;                                                                            // set Array  p to write to page hh of A_grad and B_grad NDArrays
       
       data_extract = data.page ( hh + 1 ) ;                                                    // get the equivalent of data( : , : , hh + 1 )
       neg_A_grad.insert( ( negdata_transpose - repmat_bi - bi_star ) * data_extract , p ) ;
       neg_B_grad.insert( h_posteriors * data_extract , p ) ;
   
       } // end of hh loop   

// %%%%%%%%% END NEGATIVE PHASE  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

// Octave code ---> err = sum( sum( ( data( : , : , 1 ) - negdata ) .^2 ) ) ;
// Not used         errsum = err + errsum ;

// Octave code ---> if ( epoch > 5 ) % use momentum
//                  momentum = mom ;
//                  else % no momentum
//                  momentum = 0 ;
//                  end

   // momentum was initialised to 0.0, but on the 6th iteration of epoch, set momentum to 0.9
   if ( epoch == 5 ) // will only be true once, after which momentum will == 0.9
      {
        momentum_w.fill( 0.9 ) ;
        momentum_bi.fill( 0.9 ) ;
        momentum_bj.fill( 0.9 ) ;
        momentum_A.fill( 0.9 ) ;
        momentum_B.fill( 0.9 ) ;
      }
     
// %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%        

// Octave code ---> w_update = momentum * w_update + epsilon_w * ( ( w_grad - neg_w_grad ) / num_cases - w_decay * w ) ;
   w_update = product( momentum_w , w_update ) + product( epsilon_w , quotient( ( w_grad - neg_w_grad ) , num_cases_matrices_w_and_B ) - product( w_decay , w ) ) ;
   
// Octave code ---> bi_update = momentum * bi_update + ( epsilon_bi / num_cases ) * ( bi_grad - neg_bi_grad ) ;
   bi_update = product( momentum_bi , bi_update ) + product( quotient( epsilon_bi , num_cases_matrix_bi ) , ( bi_grad - neg_bi_grad ) ) ;
   
// Octave code ---> bj_update = momentum * bj_update + ( epsilon_bj / num_cases ) * ( bj_grad - neg_bj_grad ) ;
   bj_update = product( momentum_bj , bj_update ) + product( quotient( epsilon_bj , num_cases_matrix_bj ) , ( bj_grad - neg_bj_grad ) ) ;
 
// The following two Octave code loops are combined into the single .oct loop that follows them
//   
// Octave code ---> for hh = 1 : nt
//                      A_update( : , : , hh ) = momentum * A_update( : , : , hh ) + epsilon_A * ( ( A_grad( : , : , hh ) - neg_A_grad( : , : , hh ) ) / num_cases - w_decay * A( : , : , hh ) ) ;
//                      B_update( : , : , hh ) = momentum * B_update( : , : , hh ) + epsilon_B * ( ( B_grad( : , : , hh ) - neg_B_grad( : , : , hh ) ) / num_cases - w_decay * B( : , : , hh ) ) ;
//                  end
   
// Octave code ---> for hh = 1 : nt
//                      A( : , : , hh ) = A( : , : , hh ) + A_update( : , : , hh ) ;
//                      B( : , : , hh ) = B( : , : , hh ) + B_update( : , : , hh ) ;
//                  end   

   for ( octave_idx_type hh ( 0 ) ; hh < nt ; hh++ )
       {
       p( 2 ) = hh ;
   
       A_update_extract = A_update.page ( hh ) ;
       A_grad_extract = A_grad.page ( hh ) ;
       neg_A_grad_extract = neg_A_grad.page ( hh ) ;
       A_extract = A.page ( hh ) ;
       A_update.insert( product( momentum_A , A_update_extract ) + product( epsilon_A , ( quotient( ( A_grad_extract - neg_A_grad_extract ) , num_cases_matrix_A ) - product( w_decay_A , A_extract ) ) ) , p ) ;
       A_update_extract = A_update.page ( hh ) ;
       A.insert( A_extract + A_update_extract , p ) ;
   
       B_update_extract = B_update.page ( hh ) ;
       B_grad_extract = B_grad.page ( hh ) ;
       neg_B_grad_extract = neg_B_grad.page ( hh ) ;
       B_extract = B.page ( hh ) ;
       B_update.insert( product( momentum_B , B_update_extract ) + product( epsilon_B , ( quotient( ( B_grad_extract - neg_B_grad_extract ) , num_cases_matrices_w_and_B ) - product( w_decay_B , B_extract ) ) ) , p ) ;
       B_update_extract = B_update.page ( hh ) ;
       B.insert( B_extract + B_update_extract , p ) ;
       
       } // end of hh loop
  
// Octave code ---> w = w + w_update ;
//                  bi = bi + bi_update ;
//                  bj = bj + bj_update ;

   w += w_update ;
   bi += bi_update ;
   bj += bj_update ;
       
// %%%%%%%%%%%%%%%% END OF UPDATES  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

            } // end of batch loop

    } // end of main epoch loop

// return the values w , bj , bi , A , B
retval_list(4) = B ;
retval_list(3) = A ;
retval_list(2) = bi ;    
retval_list(1) = bj ;
retval_list(0) = w ;

return retval_list ;
  
} // end of function
The code is heavilly commented throughout.The .oct code for the binary_crbm.m function will follow in due course.
Dekalog Blog

Pages

Thursday, 16 April 2015

Optimised CRBM Code for Gaussian Units

No comments:

Post a Comment