Below is the simple Octave function I am using, with the Akaike Information Criterion (AIC = 2k - 2ln(L), where k is the number of model parameters and L is the maximised likelihood) as the minimisation objective.
## Copyright (C) 2019 dekalog
##
## This program is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see
## <https://www.gnu.org/licenses/>.
## -*- texinfo -*-
## @deftypefn {} {@var{J} =} wann_training_of_cyclic_embedding (@var{x})
## Returns @var{J}, the average Akaike Information Criterion value of the
## weight agnostic neural network architecture encoded in @var{x}.
##
## @seealso{}
## @end deftypefn
## Author: dekalog
## Created: 2019-10-26
function J = wann_training_of_cyclic_embedding( x )
  global sample_features ; global sample_targets ;
  epsilon = 1e-15 ; ## added inside log() so that log( 0 ) = -Inf cannot occur and produce a NaN in the likelihood sum

  ## get the parameters from input x
  activation_funcs = floor( x( 1 : 5 ) ) ; ## the activations, 1 == sigmoid, 2 == tanh, 3 == LeCun sigmoid
  layer_size = floor( x( 6 : 10 ) ) ;

  [ min_layer_size , ix_min ] = min( layer_size ) ;
  if ( min_layer_size > 0 ) ## to be expected most of the time
    nn_depth = length( layer_size ) ;
  elseif ( min_layer_size == 0 ) ## a layer has no nodes, which limits the effective depth of the nn
    nn_depth = ix_min - 1 ;
  endif
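
  ## Worked example (illustrative values only): x = [ 1.2 2.7 3.9 1.1 2.5 , 10.7 5.2 0.4 8.1 3.3 ]
  ## gives activations [ 1 2 3 1 2 ] and layer sizes [ 10 5 0 8 3 ] ; the first
  ## zero-node layer is the 3rd, so the effective depth of the network is 2.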

  ## NB: as written, the forward passes below are deterministic, so every pass
  ## of this loop produces the same AIC value; the averaging only becomes
  ## meaningful if the weight initialisation is randomised per pass
  length_jj_loop = 25 ;
  all_aic_values = zeros( length_jj_loop , 1 ) ;
  for jj = 1 : length_jj_loop

    previous_layer_out = sample_features ;
    sum_of_k = 0 ;

    for ii = 1 : nn_depth
      ## weight agnostic: all weights are fixed at 1 / sqrt( fan-in ) rather than trained
      new_weight_matrix = ones( size( previous_layer_out , 2 ) , layer_size( ii ) ) ./ sqrt( size( previous_layer_out , 2 ) ) ;
      sum_of_k = sum_of_k + numel( new_weight_matrix ) ; ## running count of parameters, the k in the AIC formula
      prior_to_activation_input = previous_layer_out * new_weight_matrix ;
      ## select the activation function
      if ( activation_funcs( ii ) == 1 ) ## sigmoid activation
        previous_layer_out = 1.0 ./ ( 1.0 + exp( -prior_to_activation_input ) ) ;
      elseif ( activation_funcs( ii ) == 2 ) ## tanh activation
        previous_layer_out = tanh( prior_to_activation_input ) ;
      elseif ( activation_funcs( ii ) == 3 ) ## LeCun sigmoid activation
        previous_layer_out = sigmoid_lecun_m( prior_to_activation_input ) ;
      endif
    endfor
    ## the final logistic output
    new_weight_matrix = ones( size( previous_layer_out , 2 ) , 1 ) ./ sqrt( size( previous_layer_out , 2 ) ) ;
    sum_of_k = sum_of_k + numel( new_weight_matrix ) ;
    final_output = previous_layer_out * new_weight_matrix ;
    final_output = 1.0 ./ ( 1.0 + exp( -final_output ) ) ;

    ## the maximised log-likelihood, ln(L), of the Bernoulli/logistic model
    max_likelihood = sum( log( final_output + epsilon ) .* sample_targets + log( 1 - final_output + epsilon ) .* ( 1 - sample_targets ) ) ;

    ## the Akaike Information Criterion, AIC = 2k - 2ln(L)
    all_aic_values( jj ) = 2 * sum_of_k - 2 * max_likelihood ;

  endfor ## end of jj loop

  J = mean( all_aic_values ) ;
endfunction
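The helper sigmoid_lecun_m called above is not listed in this post. For completeness, here is a minimal sketch of a plausible implementation, assuming it is the scaled hyperbolic tangent, f( x ) = 1.7159 * tanh( 2 * x / 3 ), recommended in LeCun et al.'s "Efficient BackProp" paper; the actual helper may differ:

## Hypothetical stand-in for the sigmoid_lecun_m helper, assuming it is
## LeCun's recommended scaled tanh from "Efficient BackProp"; the version
## actually used may differ.
function out = sigmoid_lecun_m( x )
  out = 1.7159 .* tanh( ( 2 / 3 ) .* x ) ;
endfunction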
Using this function and the Octave interface of the BayesOpt library, I am currently iterating over different architectures (up to 5 hidden layers deep, with a maximum of 100 nodes per layer and a choice of 3 hidden activation functions) for a simple logistic regression model to predict turning points in different sets of statistical mechanics synthetic data, using just features based on Takens' embedding.
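As an illustration only, here is a minimal sketch of how such a call might be wired up, assuming the bayesoptcont entry point of the library's MATLAB/Octave interface; the parameter fields, bounds and iteration counts below are my own placeholder assumptions, not the settings actually used:

## hypothetical driver script for the optimisation described above
global sample_features ; global sample_targets ;
## ... load sample_features and sample_targets here ...

params.n_iterations = 100 ;    ## placeholder optimisation budget
params.n_init_samples = 10 ;   ## placeholder initial design size
params.verbose_level = 1 ;

## x( 1 : 5 ) are floored to activation choices 1 to 3, so bound them in [ 1 , 4 ) ;
## x( 6 : 10 ) are floored to layer sizes of 0 to 100 nodes
lb = [ ones( 1 , 5 ) , zeros( 1 , 5 ) ] ;
ub = [ 3.99 .* ones( 1 , 5 ) , 100.99 .* ones( 1 , 5 ) ] ;

[ xmin , fmin ] = bayesoptcont( 'wann_training_of_cyclic_embedding' , 10 , params , lb , ub ) ;

More in due course.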