Wednesday, 3 February 2016

Refactored Denoising Autoencoder Update #2

Below is the second code update.
%  select rolling window length to use - an optimisable parameter via pso?
rolling_window_length = 50 ;
batchsize = 5 ;

%  how many timesteps do we look back for directed connections - this is what we call the "order" of the model
n1 = 3 ; % first "gaussian" layer order, a best guess just for batchdata creation purposes
n2 = 3 ; % second "binary" layer order, a best guess just for batchdata creation purposes

%  taking into account rolling_window_length, n1, n2 and batchsize, get total lookback length
remainder = rem( ( rolling_window_length + n1 + n2 ) , batchsize ) ;

if ( remainder > 0 ) % number of training examples with lookback and orders n1 and n2 not exactly divisible by batchsize
lookback_length = ( rolling_window_length + n1 + n2 + ( batchsize - remainder ) ) ; % increase the lookback_length
else                 % number of training examples with lookback and orders n1 and n2 exactly divisible by batchsize
lookback_length = ( rolling_window_length + n1 + n2 ) ;
end

%  create batchdataindex using lookback_length to index bars in the features matrix
batchdataindex = ( ( training_point_index - ( lookback_length - 1 ) ) : 1 : training_point_index )' ;
batchdata = features( batchdataindex , : ) ;

%  now that the batchdata has been created, check it for autocorrelation in the features
all_ar_coeff = zeros( size( batchdata , 2 ) , 1 ) ;

  for ii = 1 : size( batchdata , 2 )
  ar_coeffs = arburg( batchdata( : , ii ) , 10 , 'FPE' ) ;
  all_ar_coeff( ii ) = length( ar_coeffs ) - 1 ;
  end
  
%  set order of gaussian_crbm, n1, to be equal to the average length of any autocorrelation in the data
n1 = round( mean( all_ar_coeff ) ) ;  

%  z-normalise the batchdata matrix with the mean and std of columns 
data_mean = mean( batchdata , 1 ) ;
data_std = std( batchdata , 1 ) ;
batchdata = ( batchdata .- repmat( data_mean , size( batchdata , 1 ) , 1 ) ) ./ repmat( data_std , size( batchdata , 1 ) , 1 ) ; % batchdata is now z-normalised by data_mean & data_std

%  create the minibatch index matrix for gaussian rbm pre-training of directed weights w
minibatch = ( 1 : 1 : size( batchdata , 1 ) ) ; minibatch( 1 : ( size( batchdata , 1 ) - rolling_window_length ) ) = [] ;
minibatch = minibatch( randperm( size( minibatch , 2 ) ) ) ; minibatch = reshape( minibatch , batchsize , size( minibatch , 2 ) / batchsize ) ; 

% PRE-TRAINING FOR THE VISIBLE TO HIDDEN AND THE VISIBLE TO VISIBLE WEIGHTS %%%%
% First create a training set and target set for the pre-training of gaussian layer
dAuto_Encode_targets = batchdata ; dAuto_Encode_training_data = [] ;
% dAuto_Encode_targets = batchdata( : , 2 : end ) ; dAuto_Encode_training_data = [] ; % if bias added to raw data
  
  % loop to create the dAuto_Encode_training_data ( n1 == "order" of the gaussian layer of crbm )
  for ii = 1 : n1
  dAuto_Encode_training_data = [ dAuto_Encode_training_data shift( batchdata , ii ) ] ;
  end

% now delete the first n1 rows due to circular shift induced mismatch of data and targets
dAuto_Encode_targets( 1 : n1 , : ) = [] ; dAuto_Encode_training_data( 1 : n1 , : ) = [] ;

% DO RBM PRE-TRAINING FOR THE BOTTOM UP DIRECTED WEIGHTS W %%%%%%%%%%%%%%%%%%%%%
% use rbm trained initial weights instead of using random initialisation for weights
% Doing this because we are not using regularisation in the autoencoder pre-training
epochs = 10000 ;
hidden_layer_size = 4 * size( dAuto_Encode_targets , 2 ) ;
[ w_weights , w_weights_hid_bias , w_weights_vis_bias ] = cc_gaussian_rbm( dAuto_Encode_targets , minibatch , epochs , hidden_layer_size , 0.05 ) ;
% keep a copy of these original w_weights
w1 = w_weights ;
[ A_weights , A_weights_hid_bias , A_weights_vis_bias ] = cc_gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , size( dAuto_Encode_targets , 2 ) , 0.05 ) ;
[ B_weights , B_weights_hid_bias , B_weights_vis_bias ] = cc_gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , hidden_layer_size , 0.05 ) ;

% END OF RBM PRE-TRAINING OF AUTOENCODER WEIGHTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

figure(1) ; surf( A_weights ) ; title( 'A Weights after RBM training' ) ;
figure(2) ; surf( B_weights ) ; title( 'B Weights after RBM training' ) ;
figure(3) ; surf( w_weights ) ; title( 'w Weights after RBM training' ) ;
figure(4) ; plot( A_weights_hid_bias , 'b' , B_weights_hid_bias , 'r' , w_weights_vis_bias , 'g' ) ; title( 'Biases after RBM training' ) ; legend( 'A' , 'B' , 'w' ) ;

% DO THE AUTOENCODER TRAINING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% create weight update matrices
A_weights_update = zeros( size( A_weights ) ) ;
A_weights_hid_bias_update = zeros( size( A_weights_hid_bias ) ) ;
B_weights_update = zeros( size( B_weights ) ) ;
B_weights_hid_bias_update = zeros( size( B_weights_hid_bias ) ) ;
w_weights_update = zeros( size( w_weights ) ) ;
w_weights_vis_bias_update = zeros( size( w_weights_vis_bias ) ) ;

% for adagrad
historical_A = zeros( size( A_weights ) ) ;
historical_A_hid_bias = zeros( size( A_weights_hid_bias ) ) ;
historical_B = zeros( size( B_weights ) ) ;
historical_B_hid_bias = zeros( size( B_weights_hid_bias ) ) ;
historical_w = zeros( size( w_weights ) ) ;
historical_w_vis_bias = zeros( size( w_weights_vis_bias ) ) ;

% set some training parameters
n = size( dAuto_Encode_training_data , 1 ) ; % number of training examples in dAuto_Encode_training_data
input_layer_size = size( dAuto_Encode_training_data , 2 ) ;
fudge_factor = 1e-6 ; % for numerical stability for adagrad
learning_rate = 0.01 ; % will be changed to 0.001 after 50 iters through epoch loop
mom = 0 ;            % will be changed to 0.9 after 50 iters through epoch loop
noise = 0.5 ;
epochs = 1000 ;
cost = zeros( epochs , 1 ) ;
lowest_cost = inf ;

  % Stochastic Gradient Descent training over dAuto_Encode_training_data 
  for iter = 1 : epochs
   
      % change momentum and learning_rate after 50 iters
      if iter == 50
      mom = 0.9 ;
      learning_rate = 0.001 ;
      end
  
      index = randperm( n ) ; % randomise the order of training examples
     
      for training_example = 1 : n
      
      % Select data for this training batch
      tmp_X = dAuto_Encode_training_data( index( training_example ) , : ) ;
      tmp_T = dAuto_Encode_targets( index( training_example ) , : ) ;
      
      % Randomly black out some of the input training data
      tmp_X( rand( size( tmp_X ) ) < noise ) = 0 ;
      
      % feedforward tmp_X through B_weights and get sigmoid e.g ret = 1.0 ./ ( 1.0 + exp(-input) )
      tmp_X_through_sigmoid = 1.0 ./ ( 1.0 .+ exp( - ( tmp_X * B_weights .+ B_weights_hid_bias ) ) ) ;
      
      % Randomly black out some of tmp_X_through_sigmoid for dropout training
      tmp_X_through_sigmoid( rand( size( tmp_X_through_sigmoid ) ) < noise ) = 0 ;
    
      % feedforward tmp_X through A_weights and add to tmp_X_through_sigmoid * w_weights for linear output layer
      final_output_layer = ( tmp_X * A_weights .+ A_weights_hid_bias ) .+ ( tmp_X_through_sigmoid * w_weights' .+ w_weights_vis_bias ) ;
    
      % now do backpropagation
      % this is the derivative of weights for the linear final_output_layer
      delta_out = ( tmp_T - final_output_layer ) ;
      
      % NOTE! gradient of sigmoid function g = sigmoid(z) .* ( 1.0 .- sigmoid(z) )
      sig_grad = tmp_X_through_sigmoid .* ( 1 .- tmp_X_through_sigmoid ) ; 
      
      % backpropagation only through the w_weights that are connected to tmp_X_through_sigmoid
      delta_hidden = ( delta_out * w_weights ) .* sig_grad ;
      
      % apply deltas from backpropagation with adagrad for the weight updates
      historical_A = historical_A .+ ( tmp_X' * delta_out ).^2 ;    
      A_weights_update = mom .* A_weights_update .+ ( learning_rate .* ( tmp_X' * delta_out ) ) ./ ( fudge_factor .+ sqrt( historical_A ) ) ;
      
      historical_A_hid_bias = historical_A_hid_bias .+ delta_out.^2 ;
      A_weights_hid_bias_update = mom .* A_weights_hid_bias_update .+ ( learning_rate .* delta_out ) ./ ( fudge_factor .+ sqrt( historical_A_hid_bias ) ) ;
      
      historical_w = historical_w .+ ( delta_out' * tmp_X_through_sigmoid ).^2 ;
      w_weights_update = mom .* w_weights_update .+ ( learning_rate .* ( delta_out' * tmp_X_through_sigmoid ) ) ./ ( fudge_factor .+ sqrt( historical_w ) ) ;
      
      historical_w_vis_bias = historical_w_vis_bias .+ delta_out.^2 ;
      w_weights_vis_bias_update = mom .* w_weights_vis_bias_update .+ ( learning_rate .* delta_out ) ./ ( fudge_factor .+ sqrt( historical_w_vis_bias ) ) ;
      
      historical_B = historical_B .+ ( tmp_X' * delta_hidden ).^2 ;
      B_weights_update = mom .* B_weights_update .+ ( learning_rate .* ( tmp_X' * delta_hidden ) ) ./ ( fudge_factor .+ sqrt( historical_B ) ) ;
      
      historical_B_hid_bias = historical_B_hid_bias .+ delta_hidden.^2 ;
      B_weights_hid_bias_update = mom .* B_weights_hid_bias_update .+ ( learning_rate .* delta_hidden ) ./ ( fudge_factor .+ sqrt( historical_B_hid_bias ) ) ;
      
      % update the weight matrices with weight_updates
      A_weights = A_weights + A_weights_update ;
      A_weights_hid_bias = A_weights_hid_bias + A_weights_hid_bias_update ;
      B_weights = B_weights + B_weights_update ;
      B_weights_hid_bias = B_weights_hid_bias + B_weights_hid_bias_update ;
      w_weights = w_weights + w_weights_update ;
      w_weights_vis_bias = w_weights_vis_bias + w_weights_vis_bias_update ;
      
      end % end of training_example loop
  
  % feedforward with this epoch's updated weights
  epoch_trained_tmp_X_through_sigmoid = 1.0 ./ ( 1.0 .+ exp( -( dAuto_Encode_training_data * B_weights .+ repmat( B_weights_hid_bias , size( dAuto_Encode_training_data , 1 ) , 1 ) ) ) ) ;
  epoch_trained_output = ( dAuto_Encode_training_data * A_weights .+ repmat( A_weights_hid_bias , size( dAuto_Encode_training_data , 1 ) , 1 ) )...
                          .+ ( epoch_trained_tmp_X_through_sigmoid * w_weights' .+ repmat( w_weights_vis_bias , size( epoch_trained_tmp_X_through_sigmoid , 1 ) , 1 ) ) ;
 
  % get sum squared error cost
  cost( iter , 1 ) = sum( sum( ( dAuto_Encode_targets .- epoch_trained_output ) .^ 2 ) ) ;
  
    % record best so far
    if cost( iter , 1 ) <= lowest_cost
       lowest_cost = cost( iter , 1 ) ;
       iter_min = iter ;
       best_A = A_weights ;
       best_B = B_weights ;
       best_w = w_weights ;
    end
  
  end % end of backpropagation epoch loop

% plot weights
figure(5) ; surf( best_A ) ; title( 'Best A Weights' ) ;
figure(6) ; surf( best_B ) ; title( 'Best B Weights' ) ;
figure(7) ; surf( best_w ) ; title( 'Best w Weights' ) ;
figure(8) ; plot( A_weights_hid_bias , 'b' , B_weights_hid_bias , 'r' , w_weights_vis_bias , 'g' ) ; title( 'Biases after Autoencoder training' ) ; legend( 'A' , 'B' , 'w' ) ;
figure(9) ; plot( cost ) ; title( 'Evolution of Autoencoder cost' ) ;

% END OF CRBM WEIGHT PRE-TRAINING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The changes from the previous code update are: a slightly different way of handling the bias units, the introduction of hidden and visible bias units from the Restricted Boltzmann Machine (RBM) pre-training, and an automated way of selecting the "order" of the Conditional Restricted Boltzmann Machine (CRBM).

The order of a CRBM is how many time steps we look back in order to model the autoregressive components. This could be decided heuristically or through cross validation, but I have decided to use the Octave "arburg" function to "auto-magically" select this lookback length, the idea being that the data itself informs this decision and makes the whole CRBM training algorithm adaptive to current conditions. Since the ultimate point of the CRBM will be to make predictions of future OHLC values I have chosen to use the final prediction error (FPE) model selection criterion for the arburg function.
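To make this selection step explicit, the per-column logic in the code above boils down to something like the following minimal sketch, assuming the Octave signal package is installed and x holds a single z-normalised feature column:
pkg load signal ;                              % arburg is provided by the signal package
max_order = 10 ;                               % upper bound on the AR order considered
ar_coeffs = arburg( x , max_order , 'FPE' ) ;  % the FPE criterion may return fewer coefficients than max_order + 1
selected_order = length( ar_coeffs ) - 1 ;     % the first returned coefficient is always 1, so order = length - 1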

Now that the bulk of this coding has been completed I think it would be useful to describe the proposed workflow of the various components.
  • the data and its derived inputs, such as indicators etc., are input to a Gaussian RBM as a weight initialisation step for the denoising autoencoder training. A Gaussian RBM is used because the data are real valued and not binary. This step is typical of what happens in deep learning and helps to extract meaningful features from the raw data in an unsupervised manner
  • the data and RBM initialised weights are then input to the denoising autoencoder to further model the weights and to take into account the autoregressive components of the data
  • these twice modelled weights are then used as the initial weights for the CRBM training of a Gaussian-Binary CRBM layer
  • the hidden layer of the above Gaussian-Binary CRBM is then used as data for a second Binary-Binary CRBM layer which will be stacked. The training for this second layer will follow the format above, i.e. RBM and denoising autoencoder pre-training of weights
The next step will be for me to compile the denoising autoencoder code into an Octave C++ .oct function for speed optimisation purposes. 


Thursday, 21 January 2016

Refactored Denoising Autoencoder Code Update

This code box contains updated code from my previous post. The main change is the inclusion of bias units for the directed auto-regressive weights and the visible to hidden weights. In addition there is code showing how data is pre-processed into batches/targets for the pre-training and code showing how the weight matrices are manipulated into a form suitable for my existing optimised crbm code for gaussian units.
%  select rolling window length to use - an optimisable parameter via pso?
rolling_window_length = 50 ;

%  how many timesteps do we look back for directed connections - this is what we call the "order" of the model
n1 = 3 ; % first "gaussian" layer order
n2 = 3 ; % second "binary" layer order
batchsize = 5 ;

%  taking into account rolling_window_length, n1, n2 and batchsize, get total lookback length
remainder = rem( ( rolling_window_length + n1 + n2 ) , batchsize ) ;

if ( remainder > 0 ) % number of training examples with lookback and orders n1 and n2 not exactly divisible by batchsize
lookback_length = ( rolling_window_length + n1 + n2 + ( batchsize - remainder ) ) ; % increase the lookback_length
else                 % number of training examples with lookback and orders n1 and n2 exactly divisible by batchsize
lookback_length = ( rolling_window_length + n1 + n2 ) ;
end

%  create batchdataindex using lookback_length to index bars in the features matrix
batchdataindex = ( ( training_point_index - ( lookback_length - 1 ) ) : 1 : training_point_index )' ;
batchdata = features( batchdataindex , : ) ;

%  z-normalise the batchdata matrix with the mean and std of columns 
data_mean = mean( batchdata , 1 ) ;
data_std = std( batchdata , 1 ) ;
batchdata = ( batchdata .- repmat( data_mean , size( batchdata , 1 ) , 1 ) ) ./ repmat( data_std , size( batchdata , 1 ) , 1 ) ; % batchdata is now z-normalised by data_mean & data_std
% add bias neurons
batchdata = [ ones( size( batchdata , 1 ) , 1 ) batchdata ] ;

%  create the minibatch index matrix for gaussian rbm pre-training of directed weights w
minibatch = ( 1 : 1 : size( batchdata , 1 ) ) ; minibatch( 1 : ( size( batchdata , 1 ) - rolling_window_length ) ) = [] ;
minibatch = minibatch( randperm( size( minibatch , 2 ) ) ) ; minibatch = reshape( minibatch , batchsize , size( minibatch , 2 ) / batchsize ) ; 

% PRE-TRAINING FOR THE VISIBLE TO HIDDEN AND THE VISIBLE TO VISIBLE WEIGHTS %%%%
% First create a training set and target set for the pre-training
dAuto_Encode_targets = batchdata( : , 2 : end ) ; dAuto_Encode_training_data = [] ;
  
  % loop to create the dAuto_Encode_training_data ( n1 == "order" of the gaussian layer of crbm )
  for ii = 1 : n1
  dAuto_Encode_training_data = [ dAuto_Encode_training_data shift( batchdata , ii ) ] ;
  end

% now delete the first n1 rows due to circular shift induced mismatch of data and targets
dAuto_Encode_targets( 1 : n1 , : ) = [] ; dAuto_Encode_training_data( 1 : n1 , : ) = [] ;
% add bias
%dAuto_Encode_training_data = [ ones( size( dAuto_Encode_training_data , 1 ) , 1 ) dAuto_Encode_training_data ] ; 
% bias units idx
bias_idx = ( 1 : size( batchdata , 2 ) : size( dAuto_Encode_training_data , 2 ) ) ;

% DO RBM PRE-TRAINING FOR THE BOTTOM UP DIRECTED WEIGHTS W %%%%%%%%%%%%%%%%%%%%%
% use rbm trained initial weights instead of using random initialisation for weights
% Doing this because we are not using regularisation in the autoencoder pre-training
epochs = 10000 ;
hidden_layer_size = 2 * size( dAuto_Encode_targets , 2 ) ;
w_weights = gaussian_rbm( dAuto_Encode_targets , minibatch , epochs , hidden_layer_size ) ;
% keep a copy of these original w_weights
w1 = w_weights ;
A_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , size( dAuto_Encode_targets , 2 ) ) ;
B_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , hidden_layer_size ) ;

% create weight update matrices
A_weights_update = zeros( size( A_weights ) ) ;
B_weights_update = zeros( size( B_weights ) ) ;
w_weights_update = zeros( size( w_weights ) ) ;

% for adagrad
historical_A = zeros( size( A_weights ) ) ;
historical_B = zeros( size( B_weights ) ) ;
historical_w = zeros( size( w_weights ) ) ;

% set some training parameters
n = size( dAuto_Encode_training_data , 1 ) ; % number of training examples in dAuto_Encode_training_data
input_layer_size = size( dAuto_Encode_training_data , 2 ) ;
fudge_factor = 1e-6 ; % for numerical stability for adagrad
learning_rate = 0.1 ; % will be changed to 0.01 after 50 iters through epoch loop
mom = 0 ;             % will be changed to 0.9 after 50 iters through epoch loop
noise = 0.5 ;
epochs = 1000 ;
cost = zeros( epochs , 1 ) ;
lowest_cost = inf ;

  % Stochastic Gradient Descent training over dAuto_Encode_training_data 
  for iter = 1 : epochs
   
      % change momentum and learning_rate after 50 iters
      if iter == 50
      mom = 0.9 ;
      learning_rate = 0.01 ;
      end
  
      index = randperm( n ) ; % randomise the order of training examples
     
      for training_example = 1 : n
      
      % Select data for this training batch
      tmp_X = dAuto_Encode_training_data( index( training_example ) , : ) ;
      tmp_T = dAuto_Encode_targets( index( training_example ) , : ) ;
      
      % Randomly black out some of the input training data
      tmp_X( rand( size( tmp_X ) ) < noise ) = 0 ;
      % but keep bias units
      tmp_X( bias_idx ) = 1 ;
      
      % feedforward tmp_X through B_weights and get sigmoid e.g ret = 1.0 ./ ( 1.0 + exp(-input) )
      tmp_X_through_sigmoid = 1.0 ./ ( 1.0 .+ exp( - B_weights * tmp_X' ) ) ;
      
      % Randomly black out some of tmp_X_through_sigmoid for dropout training
      tmp_X_through_sigmoid( rand( size( tmp_X_through_sigmoid ) ) < noise ) = 0 ;
    
      % feedforward tmp_X through A_weights and add to tmp_X_through_sigmoid * w_weights for linear output layer
      final_output_layer = ( tmp_X * A_weights' ) .+ ( tmp_X_through_sigmoid' * w_weights ) ;
    
      % now do backpropagation
      % this is the derivative of weights for the linear final_output_layer
      delta_out = ( tmp_T - final_output_layer ) ;
      
      % NOTE! gradient of sigmoid function g = sigmoid(z) .* ( 1.0 .- sigmoid(z) )
      sig_grad = tmp_X_through_sigmoid .* ( 1 .- tmp_X_through_sigmoid ) ; 
      
      % backpropagation only through the w_weights that are connected to tmp_X_through_sigmoid
      delta_hidden = ( w_weights * delta_out' ) .* sig_grad ;
      
      % apply deltas from backpropagation with adagrad for the weight updates
      historical_A = historical_A .+ ( delta_out' * tmp_X ).^2 ;    
      A_weights_update = mom .* A_weights_update .+ ( learning_rate .* ( delta_out' * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_A ) ) ;
      
      historical_w = historical_w .+ ( tmp_X_through_sigmoid * delta_out ).^2 ;
      w_weights_update = mom .* w_weights_update .+ ( learning_rate .* ( tmp_X_through_sigmoid * delta_out ) ) ./ ( fudge_factor .+ sqrt( historical_w ) ) ;
      
      historical_B = historical_B .+ ( delta_hidden * tmp_X ).^2 ;
      B_weights_update = mom .* B_weights_update .+ ( learning_rate .* ( delta_hidden * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_B ) ) ;
      
      % update the weight matrices with weight_updates
      A_weights = A_weights + A_weights_update ;
      B_weights = B_weights + B_weights_update ;
      w_weights = w_weights + w_weights_update ;
      
      end % end of training_example loop
  
  % feedforward with this epoch's updated weights
  epoch_trained_tmp_X_through_sigmoid = ( 1.0 ./ ( 1.0 .+ exp( -( ( B_weights ) * dAuto_Encode_training_data' ) ) ) ) ;
  epoch_trained_output = ( dAuto_Encode_training_data * ( A_weights )' ) .+ ( epoch_trained_tmp_X_through_sigmoid' * ( w_weights ) ) ;
 
  % get sum squared error cost
  cost( iter , 1 ) = sum( sum( ( dAuto_Encode_targets .- epoch_trained_output ) .^ 2 ) ) ;
  
    % record best so far
    if cost( iter , 1 ) <= lowest_cost
       lowest_cost = cost( iter , 1 ) ;
       iter_min = iter ;
       best_A = A_weights ;
       best_B = B_weights ;
       best_w = w_weights ;
    end
  
  end % end of backpropagation loop

lowest_cost                                        % print final cost to terminal
iter_min                                             % and the iter it occurred on
graphics_toolkit( "qt" ) ;
figure(1) ; plot( cost , 'r' , 'linewidth' , 3 ) ; % and plot the cost curve

% plot weights
graphics_toolkit( "gnuplot" ) ;
figure(2) ; surf( best_A ) ; title( 'Best A Weights' ) ;
figure(3) ; surf( best_B ) ; title( 'Best B Weights' ) ;
figure(4) ; surf( best_w ) ; title( 'Best w Weights' ) ;

% END OF CRBM WEIGHT PRE-TRAINING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% extract bias weights from best_A and best_B
A_bias = best_A( : , bias_idx ) ; best_A( : , bias_idx ) = [] ; A_bias = sum( A_bias , 2 ) ; 
B_bias = best_B( : , bias_idx ) ; best_B( : , bias_idx ) = [] ; B_bias = sum( B_bias , 2 ) ;

% now delete bias units from batchdata
batchdata( : , 1 ) = [] ;

% create reshaped structures to hold A_weights and B_weights
A1 = reshape( best_A , size( best_A , 1 ) , size( best_A , 2 ) / n1 , n1 ) ;
B1 = reshape( best_B , size( best_B , 1 ) , size( best_B , 2 ) / n1 , n1 ) ;
The following video shows the evolution of the weights whilst training over 20 consecutive price bars. The top three panes are the weights after the denoising autoencoder training and the bottom three are the same weights after being used as initialisation weights for the CRBM training and then being modified by this CRBM training. It is this final set of weights that would typically be used for CRBM generation.
non-embedded view

Monday, 18 January 2016

Refactored Denoising Autoencoder Code

It has been quite a challenge but I now have a working first attempt at Octave code for autoencoder pre-training of weights for a Conditional Restricted Boltzmann Machine, which is shown in the code box below.
% PRE-TRAINING FOR THE VISIBLE TO HIDDEN AND THE VISIBLE TO VISIBLE WEIGHTS %%%%
% First create a training set and target set for the pre-training
dAuto_Encode_targets = batchdata ; dAuto_Encode_training_data = [] ;
  
  % loop to create the dAuto_Encode_training_data ( n1 == "order" of the gaussian layer of crbm )
  for ii = 1 : n1
  dAuto_Encode_training_data = [ dAuto_Encode_training_data shift( batchdata , ii ) ] ;
  end

% now delete the first n1 rows due to circular shift induced mismatch of data and targets
dAuto_Encode_targets( 1 : n1 , : ) = [] ; dAuto_Encode_training_data( 1 : n1 , : ) = [] ; 

% DO RBM PRE-TRAINING FOR THE BOTTOM UP DIRECTED WEIGHTS W %%%%%%%%%%%%%%%%%%%%%
% use rbm trained initial weights instead of using random initialisation for weights
% Doing this because we are not using regularisation in the autoencoder pre-training
epochs = 5000 ;
hidden_layer_size = 2 * size( dAuto_Encode_targets , 2 ) ;
w_weights = gaussian_rbm( dAuto_Encode_targets , minibatch , epochs , hidden_layer_size ) ;
A_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , size( dAuto_Encode_targets , 2 ) ) ;
B_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , hidden_layer_size ) ;

% create weight update matrices
A_weights_update = zeros( size( A_weights ) ) ;
B_weights_update = zeros( size( B_weights ) ) ;
w_weights_update = zeros( size( w_weights ) ) ;

% for adagrad
historical_A = zeros( size( A_weights ) ) ;
historical_B = zeros( size( B_weights ) ) ;
historical_w = zeros( size( w_weights ) ) ;

% set some training parameters
n = size( dAuto_Encode_training_data , 1 ) ; % number of training examples in dAuto_Encode_training_data
input_layer_size = size( dAuto_Encode_training_data , 2 ) ;
fudge_factor = 1e-6 ; % for numerical stability for adagrad
learning_rate = 0.1 ; % will be changed to 0.01 after 50 iters through epoch loop
mom = 0 ;             % will be changed to 0.9 after 50 iters through epoch loop
noise = 0.5 ;
epochs = 1000 ;
cost = zeros( epochs , 1 ) ;
lowest_cost = inf ;

  % Stochastic Gradient Descent training over dAuto_Encode_training_data 
  for iter = 1 : epochs
   
      % change momentum and learning_rate after 50 iters
      if iter == 50
      mom = 0.9 ;
      learning_rate = 0.01 ;
      end
  
      index = randperm( n ) ; % randomise the order of training examples
     
      for training_example = 1 : n
      
      % Select data for this training batch
      tmp_X = dAuto_Encode_training_data( index( training_example ) , : ) ;
      tmp_T = dAuto_Encode_targets( index( training_example ) , : ) ;
      
      % Randomly black out some of the input training data
      tmp_X( rand( size( tmp_X ) ) < noise ) = 0 ;
      
      % feedforward tmp_X through B_weights and get sigmoid e.g ret = 1.0 ./ ( 1.0 + exp(-input) )
      tmp_X_through_sigmoid = 1.0 ./ ( 1.0 .+ exp( - B_weights * tmp_X' ) ) ;
      
      % Randomly black out some of tmp_X_through_sigmoid for dropout training
      tmp_X_through_sigmoid( rand( size( tmp_X_through_sigmoid ) ) < noise ) = 0 ;
    
      % feedforward tmp_X through A_weights and add to tmp_X_through_sigmoid * w_weights for linear output layer
      final_output_layer = ( tmp_X * A_weights' ) .+ ( tmp_X_through_sigmoid' * w_weights ) ;
    
      % now do backpropagation
      % this is the derivative of weights for the linear final_output_layer
      delta_out = ( tmp_T - final_output_layer ) ;
      
      % NOTE! gradient of sigmoid function g = sigmoid(z) .* ( 1.0 .- sigmoid(z) )
      sig_grad = tmp_X_through_sigmoid .* ( 1 .- tmp_X_through_sigmoid ) ; 
      
      % backpropagation only through the w_weights that are connected to tmp_X_through_sigmoid
      delta_hidden = ( w_weights * delta_out' ) .* sig_grad ;
      
      % apply deltas from backpropagation with adagrad for the weight updates
      historical_A = historical_A .+ ( delta_out' * tmp_X ).^2 ;    
      A_weights_update = mom .* A_weights_update .+ ( learning_rate .* ( delta_out' * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_A ) ) ;
      
      historical_w = historical_w .+ ( tmp_X_through_sigmoid * delta_out ).^2 ;
      w_weights_update = mom .* w_weights_update .+ ( learning_rate .* ( tmp_X_through_sigmoid * delta_out ) ) ./ ( fudge_factor .+ sqrt( historical_w ) ) ;
      
      historical_B = historical_B .+ ( delta_hidden * tmp_X ).^2 ;
      B_weights_update = mom .* B_weights_update .+ ( learning_rate .* ( delta_hidden * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_B ) ) ;
      
      % update the weight matrices with weight_updates
      A_weights = A_weights + A_weights_update ;
      B_weights = B_weights + B_weights_update ;
      w_weights = w_weights + w_weights_update ;
      
      end % end of training_example loop
  
  % feedforward with this epoch's updated weights
  epoch_trained_tmp_X_through_sigmoid = ( 1.0 ./ ( 1.0 .+ exp( -( ( B_weights./2 ) * dAuto_Encode_training_data' ) ) ) ) ;
  epoch_trained_output = ( dAuto_Encode_training_data * ( A_weights./2 )' ) .+ ( epoch_trained_tmp_X_through_sigmoid' * ( w_weights./2 ) ) ;
 
  % get sum squared error cost
  cost( iter , 1 ) = sum( sum( ( dAuto_Encode_targets .- epoch_trained_output ) .^ 2 ) ) ;
  
    % record best so far
    if cost( iter , 1 ) <= lowest_cost
       lowest_cost = cost( iter , 1 ) ;
       best_A = A_weights ./ 2 ;
       best_B = B_weights ./ 2 ;
       best_w = w_weights ./ 2;
    end
  
  end % end of backpropagation loop

lowest_cost                                        % print final cost to terminal
graphics_toolkit( "qt" ) ;
figure(1) ; plot( cost , 'r' , 'linewidth' , 3 ) ; % and plot the cost curve

% plot weights
graphics_toolkit( "gnuplot" ) ;
figure(2) ; surf( best_A ) ; title( 'Best A Weights' ) ;
figure(3) ; surf( best_B ) ; title( 'Best B Weights' ) ;
figure(4) ; surf( best_w ) ; title( 'Best w Weights' ) ;

% END OF CRBM WEIGHT PRE-TRAINING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Prior to this code being run there is a bit of data pre-processing to create data batches and their indexes, and of course afterwards the generated weights will be used as initial starting weights for the CRBM training.
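For reference, that pre-processing amounts to something like the following condensed sketch, assuming features, training_point_index, rolling_window_length, n1, n2 and batchsize already exist, and ignoring the padding of the lookback to an exact multiple of batchsize:
lookback_length = rolling_window_length + n1 + n2 ;                                    % total bars needed for this training window
batchdataindex = ( ( training_point_index - ( lookback_length - 1 ) ) : training_point_index )' ;
batchdata = features( batchdataindex , : ) ;                                           % select the bars from the features matrix
batchdata = ( batchdata .- mean( batchdata ) ) ./ std( batchdata ) ;                   % z-normalise column-wise
minibatch = ( size( batchdata , 1 ) - rolling_window_length + 1 ) : size( batchdata , 1 ) ;
minibatch = reshape( minibatch( randperm( length( minibatch ) ) ) , batchsize , [] ) ; % batchsize rows of shuffled indexes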

I will not discuss the code very much in this post as it will probably be heavily revised in the near future. However, I will point out a few things about it, namely
  • I have used Restricted Boltzmann machine pre-training as initial weights for the autoencoder training
  • I have applied dropout to the single hidden layer in the autoencoder
  • early stopping, momentum and AdaGrad are employed
  • bias units are not included
  • the code plots some charts for visual inspection (see below), which probably will not be present in the code's final version
My reason for using RBM pre-training instead of random initialisation of weights is simple - RBM training generally seems to result in a lower average cost when the trained autoencoder itself is applied to the training data. 

Below is a short video of a desktop slide show which shows typical, final, pre-trained weights from a sequence of 20 training sessions. The first 10 slides are random initialisation and the last 10 are initialisation with RBM training. Note the smoother cost curve for the RBM initialisation. Apart from this initialisation all other training parameters were exactly the same for all training sessions.
Non embedded view
The lower pane shows the evolution of cost on the Y axis vs. epoch on the X axis. The upper three panes show what are called the A, B and w weights respectively: A are the direct autoregressive weights from training data to target data without any intervening hidden layers, B are the weights to the hidden sigmoid layer and w are the weights from the hidden layer to the targets. It is interesting to see how the A weights remain relatively consistent over the different sessions.
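In code terms, the mapping these three weight sets implement for a single input row x is roughly the following, with the names as used in the code above and the sigmoid applied element-wise:
sigmoid = @( z ) 1.0 ./ ( 1.0 + exp( -z ) ) ;
hidden = sigmoid( B_weights * x' ) ;                    % B : inputs to the hidden sigmoid layer
output = ( x * A_weights' ) + ( hidden' * w_weights ) ; % A : direct autoregressive path , w : hidden layer to targets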

Monday, 21 December 2015

John Ehlers' Sinewave Indicator Code

A reader recently inquired about my use of this indicator, and so below I provide my Octave C++ .oct version, which I have been using for the past few years.
#include <octave/oct.h>
#include <cmath>

#define PI 3.14159265358979323846

DEFUN_DLD ( sinewave_indicator, args, nargout, "Ehlers' sinewave indicator. Input is a single price column vector; outputs are the sinewave, the one bar leading sinewave, the smoothed dominant cycle period and the dominant cycle phase." )
{
octave_value_list retval_list ;
int nargin = args.length () ;
int vec_length = args(0).length () ;

// check the input argument
if ( nargin != 1 )
   {
   error ("Invalid argument. Input is a single price vector.") ;
   return retval_list ;
   }

if ( vec_length < 50 )
   {
   error ("Invalid argument. Input is a single price vector.") ;
   return retval_list ;
   }

if ( error_state )
   {
   error ("Invalid argument. Input is a single price vector.") ;
   return retval_list ;
   }
// end of input checking

// inputs
ColumnVector price = args(0).column_vector_value () ;

// outputs
ColumnVector sinewave( vec_length ) ;
ColumnVector sinewave_lead_1( vec_length ) ;
ColumnVector smoothperiod_out( vec_length ) ;
ColumnVector dcphase_vec( vec_length ) ;
ColumnVector sumperiod( vec_length ) ;
ColumnVector sum_period( vec_length ) ;
ColumnVector deltaphase( vec_length ) ;

// Declarations for calculations of period, phase & sine wave measurements
ColumnVector smooth( vec_length ) ;
ColumnVector period( vec_length ) ;          
ColumnVector smoothperiod( vec_length ) ; 
ColumnVector detrender( vec_length ) ; 
ColumnVector Q1( vec_length ) ; 
ColumnVector I1( vec_length ) ; 
ColumnVector jI( vec_length ) ; 
ColumnVector jQ( vec_length ) ; 
ColumnVector I2( vec_length ) ;  
ColumnVector Q2( vec_length ) ;  
ColumnVector sI2( vec_length ) ; 
ColumnVector sQ2( vec_length ) ; 
ColumnVector Re( vec_length ) ;
ColumnVector Im( vec_length ) ; 
ColumnVector sRe( vec_length ) ; 
ColumnVector sIm( vec_length ) ;  
int dcperiod ; 
double realpart ;
double imagpart ;  
double dcphase ;
double sum_deltaphase ;
int count ;

// unrolled loop to fill the first 5 elements of above calculation vectors ( unrolled for speed optimisation )
sinewave(0) = 0.0 ; sinewave(1) = 0.0 ; sinewave(2) = 0.0 ; sinewave(3) = 0.0 ; sinewave(4) = 0.0 ;
sinewave_lead_1(0) = 0.0 ; sinewave_lead_1(1) = 0.0 ; sinewave_lead_1(2) = 0.0 ; sinewave_lead_1(3) = 0.0 ; sinewave_lead_1(4) = 0.0 ;
smoothperiod_out(0) = 0.0 ; smoothperiod_out(1) = 0.0 ; smoothperiod_out(2) = 0.0 ; smoothperiod_out(3) = 0.0 ; smoothperiod_out(4) = 0.0 ;
dcphase_vec(0) = 0.0 ; dcphase_vec(1) = 0.0 ; dcphase_vec(2) = 0.0 ; dcphase_vec(3) = 0.0 ; dcphase_vec(4) = 0.0 ;

smooth(0) = 0.0 ; smooth(1) = 0.0 ; smooth(2) = 0.0 ; smooth(3) = 0.0 ; smooth(4) = 0.0 ;
period(0) = 0.0 ; period(1) = 0.0 ; period(2) = 0.0 ; period(3) = 0.0 ; period(4) = 0.0 ;            
smoothperiod(0) = 0.0 ; smoothperiod(1) = 0.0 ; smoothperiod(2) = 0.0 ; smoothperiod(3) = 0.0 ; smoothperiod(4) = 0.0 ;
detrender(0) = 0.0 ; detrender(1) = 0.0 ; detrender(2) = 0.0 ; detrender(3) = 0.0 ; detrender(4) = 0.0 ; 
Q1(0) = 0.0 ; Q1(1) = 0.0 ; Q1(2) = 0.0 ; Q1(3) = 0.0 ; Q1(4) = 0.0 ;
I1(0) = 0.0 ; I1(1) = 0.0 ; I1(2) = 0.0 ; I1(3) = 0.0 ; I1(4) = 0.0 ; 
jI(0) = 0.0 ; jI(1) = 0.0 ; jI(2) = 0.0 ; jI(3) = 0.0 ; jI(4) = 0.0 ;
jQ(0) = 0.0 ; jQ(1) = 0.0 ; jQ(2) = 0.0 ; jQ(3) = 0.0 ; jQ(4) = 0.0 ;
I2(0) = 0.0 ; I2(1) = 0.0 ; I2(2) = 0.0 ; I2(3) = 0.0 ; I2(4) = 0.0 ; 
Q2(0) = 0.0 ; Q2(1) = 0.0 ; Q2(2) = 0.0 ; Q2(3) = 0.0 ; Q2(4) = 0.0 ;
sI2(0) = 0.0 ; sI2(1) = 0.0 ; sI2(2) = 0.0 ; sI2(3) = 0.0 ; sI2(4) = 0.0 ;
sQ2(0) = 0.0 ; sQ2(1) = 0.0 ; sQ2(2) = 0.0 ; sQ2(3) = 0.0 ; sQ2(4) = 0.0 ;
Re(0) = 0.0 ; Re(1) = 0.0 ; Re(2) = 0.0 ; Re(3) = 0.0 ; Re(4) = 0.0 ;
Im(0) = 0.0 ; Im(1) = 0.0 ; Im(2) = 0.0 ; Im(3) = 0.0 ; Im(4) = 0.0 ;
sRe(0) = 0.0 ; sRe(1) = 0.0 ; sRe(2) = 0.0 ; sRe(3) = 0.0 ; sRe(4) = 0.0 ;
sIm(0) = 0.0 ; sIm(1) = 0.0 ; sIm(2) = 0.0 ; sIm(3) = 0.0 ; sIm(4) = 0.0 ;
            
 for ( octave_idx_type ii (5) ; ii < vec_length ; ii++ ) // Start the main loop
     {
       
     // smooth the price for hilbert calculations
     smooth(ii) = (4.0 * price(ii) + 3.0 * price(ii-1) + 2.0 * price(ii-2) + price(ii-3) ) / 10.0 ; 
     
     // Detrend the input
     detrender(ii) = (0.0962 * smooth(ii) + 0.5769 * smooth(ii-2) - 0.5769 * smooth(ii-4) - 0.0962 * smooth(ii-6)) * (0.075 * period(ii-1) + 0.54) ;

     // Compute  InPhase and Quadrature components 
     Q1(ii) = (0.0962 * detrender(ii) + 0.5769 * detrender(ii-2) - 0.5769 * detrender(ii-4) - 0.0962 * detrender(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
     I1(ii) = detrender(ii-3) ;

     // Advance the phase of  I1 and Q1 by 90 degrees
     jI(ii) = (0.0962 * I1(ii) + 0.5769 * I1(ii-2) - 0.5769 * I1(ii-4) - 0.0962 * I1(ii-6)) * (0.075 * period(ii-1) + 0.54) ;
     jQ(ii) = (0.0962 * Q1(ii) + 0.5769 * Q1(ii-2) - 0.5769 * Q1(ii-4) - 0.0962 * Q1(ii-6)) * (0.075 * period(ii-1) + 0.54) ;

     // Phasor addition for 3 bar averaging
     I2(ii) = I1(ii) - jQ(ii) ;
     Q2(ii) = Q1(ii) + jI(ii) ;

     // Smooth the  I and Q components before applying the discriminator
     sI2(ii) = 0.2 * I2(ii) + 0.8 * sI2(ii-1) ;
     sQ2(ii) = 0.2 * Q2(ii) + 0.8 * sQ2(ii-1) ;

     // Homodyne Discriminator
     Re(ii) = sI2(ii) * sI2(ii-1) + sQ2(ii) * sQ2(ii-1) ;
     Im(ii) = sI2(ii) * sQ2(ii-1) - sQ2(ii) * sI2(ii-1) ;
     sRe(ii) = 0.2 * Re(ii) + 0.8 * sRe(ii-1) ;
     sIm(ii) = 0.2 * Im(ii) + 0.8 * sIm(ii-1) ; 

       if ( (sIm(ii) > 0.0 || sIm(ii) < 0.0) && (sRe(ii) > 0.0 || sRe(ii) < 0.0) )
       { 
       period(ii) = 360.0 / ( ((atan(sIm(ii) / sRe(ii))) * 180.0) / PI ) ;
       }
       else
       {
       period(ii) = period(ii-1) ;
       }

       if ( period(ii) > 1.5 * period(ii-1) )
       {
       period(ii) = 1.5 * period(ii-1) ;
       }

       if ( period(ii) < 0.67 * period(ii-1) )
       {
       period(ii) = 0.67 * period(ii-1) ;
       }

       if ( period(ii) < 6.0 )
       {
       period(ii) = 6.0 ;
       }

       if ( period(ii) > 50.0 )
       {
       period(ii) = 50.0 ;
       }
 
     period(ii) = 0.2 * period(ii) + 0.8 * period(ii-1) ;
     smoothperiod(ii) = 0.33 * period(ii) + 0.67 * smoothperiod(ii-1) ;

     // Compute Dominant Cycle
     dcperiod = int ( smoothperiod(ii) + 0.5 ) ;
     realpart = 0.0 ;
     imagpart = 0.0 ;
     dcphase = 0.0 ;

      for ( octave_idx_type jj (0) ; jj <= ( dcperiod - 1 ) ; jj++ )
          {
          realpart += sin( PI/180.0 * 360.0 * jj / dcperiod ) * ( smooth(ii-jj) ) ;
          imagpart += cos( PI/180.0 * 360.0 * jj / dcperiod ) * ( smooth(ii-jj) ) ;
          }

      if ( fabs( imagpart ) > 0.0 )
         {
         dcphase = atan( realpart / imagpart ) * 180.0 / PI ;
         }
      else if ( fabs( imagpart ) < 0.001 )
              {
              if ( realpart < 0.0 )
                 {
                 dcphase -= 90.0 ;
                 }
              else if ( realpart > 0.0 )
                      {
                      dcphase += 90.0 ;
                      }
              }
      dcphase += 90.0 ;

      // Compensate for one bar lag of the 4 bar weighted moving average
      dcphase += 360.0 / smoothperiod(ii) ;

      if ( imagpart < 0.0 )
         dcphase += 180.0 ;

      if ( dcphase > 315.0 )
         dcphase -= 360.0 ;
     
     // phase output 
     dcphase_vec(ii) = dcphase ;
     
     //Now compute a differential phase, resolve phase wraparound, and limit delta phase errors
     deltaphase(ii) = dcphase_vec(ii) - dcphase_vec(ii-1) ;
     
      if ( dcphase_vec(ii-1) > 270.0 && dcphase_vec(ii) < 90.0 )
         {
         deltaphase(ii) = 360.0 - dcphase_vec(ii-1) + dcphase_vec(ii) ;
         }

      if ( deltaphase(ii) < 1.0 )
         {
         deltaphase(ii) = 1.0 ;
         }

      if ( deltaphase(ii) > 60.0 )
         {
         deltaphase(ii) = 60.0 ;
         }

     // Sum Deltaphases to reach 360 degrees. The sum is the instantaneous period.
     sum_period(ii) = 0.0 ;
     sum_deltaphase = 0.0 ;
     count = 0 ;
     
      while ( sum_deltaphase < 360.0 )
            {
            sum_deltaphase += deltaphase(ii-count) ;
            count++ ;
            sum_period(ii) = count ;
            }

      // Resolve Instantaneous Period errors and smooth
       if ( sum_period(ii) == 0.0 )
          {
          sum_period(ii) = sum_period(ii-1) ;
          }
  
      sumperiod(ii) = 0.25 * sum_period(ii) + 0.75 * sum_period(ii-1) ;

     // sinewave output
     sinewave(ii) = sin( dcphase * PI / 180.0 ) ;
     
     // one bar leading function
     sinewave_lead_1(ii) = sin( ( dcphase + 360.0 / smoothperiod(ii) ) * PI / 180.0 ) ;
     
     // period output
     smoothperiod_out(ii) = floor ( smoothperiod(ii) + 0.5 ) ;
        
     } // end of main ii loop
      
 retval_list(3) = dcphase_vec ;
 retval_list(2) = smoothperiod_out ;
 retval_list(1) = sinewave_lead_1 ;                                                                  
 retval_list(0) = sinewave ;

return retval_list ; 
                                                                       
} // end of function
This is a straightforward conversion of the code available from here. A nice intro to how it can be used is here, and Ehlers' own website can be found here.
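As a quick usage example, assuming the above is saved as sinewave_indicator.cc, compiled with mkoctfile, and that price is a column vector of at least 50 closes, a call and a typical sine/lead-sine crossover check might look like this:
[ sinewave , sinewave_lead_1 , smooth_period , dc_phase ] = sinewave_indicator( price ) ;
crossover_up = ( sinewave_lead_1 > sinewave ) ;                                       % lead line above the sine line
figure(1) ; plot( sinewave , 'b' , sinewave_lead_1 , 'r' ) ; legend( 'sinewave' , 'lead sinewave' ) ;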

Sunday, 6 December 2015

Denoising Autoencoder MATLAB/Octave Code

Following on from my last post I have been looking for Octave code for the denoising autoencoder to avoid reinventing the wheel and writing it myself from scratch, and luckily I have found two options. The first is a tutorial on autoencoders, by Piotr Mirowski, which has a link to a Github page with code. The second is a toolbox for dimensionality reduction, by Laurens van der Maaten, which has autoencoders as one of its options.

As of now I'm not sure which one I'll use, and perhaps I might yet write my own code heavily reusing elements from both of the above. More in due course.

Sunday, 22 November 2015

Recent Readings and New Directions.

Since my last post I have been doing a fair bit of online research and fortunately I have discovered the following papers, which mesh nicely with what I am trying to do with Conditional Restricted Boltzmann Machines to model time series:-

Deep Learning Architecture for Univariate Time Series Forecasting
Temporal Autoencoding Restricted Boltzmann Machine
Temporal Autoencoding Improves Generative Models of Time Series
Deep Modelling Complex Couplings Within Financial Markets
Predicting Time Series of Railway Speed Restrictions with Time Dependent Machine Learning Techniques

The big takeaway from these readings is, firstly, to explicitly model the autoregressive components via a denoising autoencoder and, secondly, not to model a univariate time series in isolation, but to model a multivariate time series where the "other" time series are either informative measures taken from the univariate series itself (informative indicators?) and/or related time series, e.g. in forex one could use concepts similar to fractional product inefficiency, or on all markets the concept of Intermarket analysis.

For the nearest future I have therefore set myself the task of adapting my CRBM code to include the denoising autoencoder and to investigate the multivariate time series approach.

Tuesday, 13 October 2015

Giving up on Runge-Kutta Methods (for now?)

Over the last few weeks I have been looking at using Runge-Kutta methods for the creation of features, but I have decided to give up on this for now, simply because I think I have found a better way to accomplish what I want. I was alerted to this possible approach by this post over at http://dsp.stackexchange.com/ and, following up on this, I remembered that a few years ago I coded up John Ehlers' Linear Prediction Filter (the white paper for which might still be available here); my crudely transcribed code is given below:
#include <octave/oct.h>

DEFUN_DLD (linearpredict, args, , "Help String")
{
octave_value retval;

 ColumnVector a = args(0).column_vector_value ();                                   // The input price                          
 ColumnVector b(a);                                                                 // This will be the output column vector returned to Octave by "retval"
 const int n = args(1).int_value();
 int lngth = 10;                                                                 
 
 double g[30];
 int zz;
 for (zz = 0; zz < 30; zz++)
     g[zz] = 0.0;

 double sigPredict[30];
 for (zz = 0; zz < 30; zz++)
     sigPredict[zz] = 0.0;

 double sigPower = 0.0;
 double mu = 0.0;
 double xBar = 0.0;
 int jj = 0;                                                          
 for (octave_idx_type ii (10); ii < a.length (); ii++)    // start the main loop
     {
      // compute the average signal power over the lookback as a normalization / convergence factor
      sigPower = 0.0;
      for (jj = 1; jj <= lngth; jj++)
          sigPower = sigPower + a(ii - jj) * a(ii - jj) / lngth;

      if (sigPower > 0)
          mu = 0.25 / (sigPower * 10);

      //Compute signal estimate
      xBar = 0;
      for (jj = 1; jj <= lngth; jj++)
          xBar = xBar + a(ii - jj) * g[jj];

     //Compute gain coefficients
     for (jj = 1; jj <= lngth; jj++)
         g[jj] = g[jj] + (mu * (a(ii) - xBar) * a(ii - jj));

     //Compute signal prediction waveform
     for (jj = 0; jj <= lngth; jj++)
         sigPredict[jj] = a(ii - (10 - jj));

     //Extend signal prediction into the future
     int kk = 0;
     for (jj = lngth + 1; jj <= lngth + 5; jj++)
     {
         sigPredict[jj] = 0;

         for (kk = 1; kk <= lngth; kk++)
	     sigPredict[jj] = sigPredict[jj] + sigPredict[jj - kk] * g[kk];
     }

     b(ii) = sigPredict[lngth + n];

     }
 
retval = b;                                                                         // Assign the output column vector to the return value

return retval;                                                                       // Return the output to Octave
}
which is very similar in concept to Burg's method. I think some application of these methods shows more promise than continuing to concentrate on my naive implementation of Runge-Kutta methods.
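For comparison, a rough sketch of the same idea using Burg coefficients from the Octave signal package, fitting once rather than adapting the gains bar by bar as the Ehlers filter does, and assuming price is a vector of closes:
pkg load signal ;
order = 10 ;                                             % matches the lngth lookback above
ar_coeffs = arburg( price , order ) ;                    % AR coefficients, leading coefficient is 1
x = price(:)' ;                                          % work on a row vector copy
for step = 1 : 5                                         % extend the series 5 steps into the future
  x( end + 1 ) = -ar_coeffs( 2 : end ) * x( end : -1 : end - order + 1 )' ;
end
predictions = x( end - 4 : end ) ;                       % the 5 predicted values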