Showing posts with label RStudio. Show all posts
Showing posts with label RStudio. Show all posts

Saturday, 14 November 2020

Temporal Clustering, Part 3

Continuing on with the subject matter of my last post, in the code box below there is R code which is a straightforward refactoring of the Octave code contained in the second code box of my last post. This code is my implementation of the cross validation routine described in the paper Cluster Validation by Prediction Strength, but adapted for use in the one dimensional case. I have refactored this into R code so that I can use the Ckmeans.1d.dp package for optimal, one dimensional clustering.

library( Ckmeans.1d.dp )

## Load the training data from Octave output (comment out as necessary).
data <- read.csv( "~/path/to/all_data_matrix" , header = FALSE )

## Comment out as necessary.
adjust <- 0 ## default adjust value
sum_seq <- seq( from = 1 , to = 198 , by = 1 ) ; adjust <- 1 ; sum_seq_l <- length( sum_seq ) ## Monday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Tuesday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Wednesday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Thursday
##sum_seq <- seq( from = 547 , to = 720 , by = 1 ) ; adjust <- 2 ; sum_seq_l <- length( sum_seq ) ## Friday

## Intraday --- comment out or adjust as necessary.
##sum_seq <- seq( from = 25 , to = 100 , by = 1 ) ; sum_seq_l <- length( sum_seq )

## Mask keeping only the strict upper triangle of the pairwise
## co-membership matrices built below, so each pair is counted once.
upper_tri_mask <- 1 * upper.tri( matrix( 0L , nrow = sum_seq_l , ncol = sum_seq_l ) , diag = FALSE )
no_sample_iters <- 1000 ## number of random half-splits of the data
max_K <- 20             ## largest number of clusters K to assess
all_k_ps <- matrix( 0 , nrow = 1 , ncol = max_K ) ## accumulated prediction strength per K

for ( iters in seq_len( no_sample_iters ) ) {

  ## Randomly split the rows of data into train/test halves.
  train_ix <- sample( nrow( data ) , size = round( nrow( data ) / 2 ) , replace = FALSE )
  train_data_sum <- colSums( data[ train_ix , sum_seq ] )  ## column sums of training half
  test_data_sum  <- colSums( data[ -train_ix , sum_seq ] ) ## column sums of test half

  ## Adjust for weekend gap artifacts if necessary.
  if ( adjust == 1 ) { ## Monday, so correct artifacts of weekend gap
    train_data_sum[ 1 : 5 ] <- mean( train_data_sum[ 1 : 48 ] )
    test_data_sum[ 1 : 5 ]  <- mean( test_data_sum[ 1 : 48 ] )
  } else if ( adjust == 2 ) { ## Friday, so correct artifacts of weekend gap
    train_data_sum[ ( sum_seq_l - 4 ) : sum_seq_l ] <- mean( train_data_sum[ ( sum_seq_l - 47 ) : sum_seq_l ] )
    test_data_sum[  ( sum_seq_l - 4 ) : sum_seq_l ] <- mean( test_data_sum[ ( sum_seq_l - 47 ) : sum_seq_l ] )
  }

  for ( k in seq_len( max_K ) ) {

    ## Optimal one-dimensional K-segmentation of train and test sums.
    train_res <- Ckmeans.1d.dp( sum_seq , k , train_data_sum )
    test_res  <- Ckmeans.1d.dp( sum_seq , k , test_data_sum )

    ## Pairwise co-membership indicators: entry (i, j) is 1 when points
    ## i and j fall in the same cluster.
    train_out_pairs_mat <- matrix( 0L , nrow = sum_seq_l , ncol = sum_seq_l )
    test_out_pairs_mat  <- matrix( 0L , nrow = sum_seq_l , ncol = sum_seq_l )

    for ( ii in seq_along( train_res$centers ) ) {
      ix <- which( train_res$cluster == ii )
      train_out_pairs_mat[ ix , ix ] <- 1
      ix <- which( test_res$cluster == ii )
      test_out_pairs_mat[ ix , ix ] <- 1
    }

    ## Coerce to upper triangular so each unordered pair appears once.
    train_out_pairs_mat <- train_out_pairs_mat * upper_tri_mask
    test_out_pairs_mat  <- test_out_pairs_mat * upper_tri_mask

    ## For each test cluster, the proportion of its co-member pairs that
    ## are also co-members under the training clustering.
    sample_min_vec <- matrix( 0 , nrow = 1 , ncol = length( test_res$centers ) )
    for ( ii in seq_along( test_res$centers ) ) {
      ix <- which( test_res$cluster == ii )
      test_cluster_sum  <- sum( test_out_pairs_mat[ ix , ix ] )
      train_cluster_sum <- sum( test_out_pairs_mat[ ix , ix ] * train_out_pairs_mat[ ix , ix ] )
      sample_min_vec[ , ii ] <- train_cluster_sum / test_cluster_sum
    }

    ## Prediction strength for this K is the worst (minimum) proportion;
    ## NaNs arise from singleton clusters with zero pairs and are removed.
    min_val <- min( sample_min_vec[ !is.nan( sample_min_vec ) ] )
    all_k_ps[ , k ] <- all_k_ps[ , k ] + min_val

  } ## end of K for loop

} ## end of sample loop

all_k_ps <- all_k_ps / no_sample_iters ## average over all random splits
plot( 1 : max_K , all_k_ps , "b" , xlab = "Number of Clusters K" , ylab = "Prediction Strength Value" )
abline( h = 0.8 , col = "red" ) ## suggested prediction strength threshold

The purpose of the cross validation routine is to select the number of clusters K, in the model selection sense, that is best supported by the available data. The above linked paper suggests that the optimal number of clusters K is the highest number K that has a prediction strength value over some given threshold (e.g. 0.8 or 0.9). The last part of the code plots the values of prediction strength for K (x-axis) vs. prediction strength (y-axis), along with the threshold value of 0.8 in red. For the particular set of data in question, it can be seen that the optimal K value for the number of clusters is 8.

This second code box shows code, re-using some of the above code, to visualise the clusters for a given K,
library( Ckmeans.1d.dp )

## Load the training data from Octave output (comment out as necessary).
data <- read.csv( "~/path/to/all_data_matrix" , header = FALSE )
data_sum <- colSums( data ) ## sum down the columns of data
data_sum[ 1 : 5 ] <- mean( data_sum[ 1 : 48 ] ) ## correct artifacts of weekend gap
## NOTE(review): the tail correction below also uses the mean of the FIRST 48
## values; the equivalent Friday adjustment elsewhere uses the LAST 48 --
## confirm whether mean( data_sum[ 673 : 720 ] ) was intended.
data_sum[ 716 : 720 ] <- mean( data_sum[ 1 : 48 ] ) ## correct artifacts of weekend gap

## Comment out as necessary.
adjust <- 0 ## default adjust value
sum_seq <- seq( from = 1 , to = 198 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Monday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Tuesday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Wednesday
##sum_seq <- seq( from = 115 , to = 342 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Thursday
##sum_seq <- seq( from = 547 , to = 720 , by = 1 ) ; sum_seq_l <- length( sum_seq ) ## Friday

## Intraday --- comment out or adjust as necessary.
##sum_seq <- seq( from = 25 , to = 100 , by = 1 ) ; sum_seq_l <- length( sum_seq )

k <- 8 ## number of clusters, as selected by the prediction strength routine
res <- Ckmeans.1d.dp( sum_seq , k , data_sum[ sum_seq ] )

## Plot the summed counts coloured/symbolised by cluster membership.
plot( sum_seq , data_sum[ sum_seq ], main = "Cluster centres. Cluster centre ix is a predicted turning point",
     col = res$cluster,
     pch = res$cluster, type = "h", xlab = "Count from beginning ix at ix = 1",
     ylab = "Total Counts per ix" )

## Dashed vertical lines at the cluster centres (predicted turning points).
abline( v = res$centers, col = "chocolate" , lty = "dashed" )

## Label each centre with its (rounded) ix value near the top of the plot.
text( res$centers, max( data_sum[ sum_seq ] ) * 0.95, cex = 0.75, font = 2,
      paste( round( res$centers ) ) )
a typical plot for which is shown below.
The above plot can be thought of as a clustering at a particular scale, and one can go down in scale by selecting smaller ranges of the data. For example, taking all the datum clustered in the 3 clusters centred at x-axis ix values 38, 63 and 89 and re-running the code in the first code box on just this data gives this prediction strength plot, which suggests a K value of 6.
Re-running the code in the second code box plots these 6 clusters thus.

Looking at this last plot, it can be seen that there is a cluster at x-axis ix value 58, which corresponds to 7.30 a.m. London time, and within this green cluster there are 2 distinct peaks which correspond to 7.00 a.m. and 8.00 a.m. A similar, visual analysis of the far right cluster, centre ix = 94, shows a peak at the time of the New York open.

My hypothesis is that by clustering in the above manner it will be possible to identify distinct, intraday times at which the probability of a market turn is greater than at other times. More in due course.

Tuesday, 31 October 2017

Prepending Historical Data with Oanda's R API

As a follow on to my previous post, which was about appending data, the script below prepends historical data to an assumed existing data record.
## cd to the hourly data directory
setwd("~/Documents/octave/oanda_data/hourly")

all_current_historical_data_list <- read.table( "instrument_hourly_update_file" , header = FALSE , sep = "" ,
                                                colClasses = c( "character" , "Date" , "numeric" ) )

for ( ii in seq_len( nrow( all_current_historical_data_list ) ) ) {

   instrument <- all_current_historical_data_list[ ii , 1 ]

   current_ohlc_record <- read.table( file = paste( instrument , "raw_OHLC_hourly" , sep = "_" ) , header = FALSE , na = "" , sep = "," ,
                                      stringsAsFactors = FALSE )

   current_ohlc_record_begin_date_time <- as.character( current_ohlc_record[ 1 , 1 ] ) # the date/time value to be matched
   last_date_ix <- as.Date( current_ohlc_record[ 1 , 1 ] )                             # the end date for new data to be downloaded

   # last 40 weeks of hourly data approx = 5000 hourly bars
   begin_date_ix <- as.Date( last_date_ix - 280 )                     # the begin date for new data to be downloaded

   # download the missing historical data from begin_date_ix to last_date_ix
   new_historical_data <- HisPricesDates( Granularity = "H1", DayAlign, TimeAlign, AccountToken, instrument,
                          begin_date_ix , last_date_ix + 2 ) # +2 to ensure that the end of the new downloaded data will
                                                             # overlap with the beginning of current_ohlc_record

   # having ensured no data is missed by overlapping with the current_ohlc_record, delete duplicated OHLC information
   new_historical_data_date_times <- as.character( new_historical_data[ , 1 ] ) # vector to search for the above date value

   ix <- charmatch( current_ohlc_record_begin_date_time , new_historical_data_date_times ) # get the matching index value

   # delete that part of new_historical_data which is already contained in the file
   new_historical_data <- new_historical_data[ -( ix : nrow( new_historical_data ) ) , ]

   # before prepending new_historical_data in front of current_ohlc_record, need to give names to current_ohlc_record as
   # rbind needs to bind by named attributes
   names( current_ohlc_record ) <- names( new_historical_data )

   # see https://stackoverflow.com/questions/11785710/rbind-function-changing-my-entries for reason for following
   # also need to coerce the dates in new_historical_data from POSIXct to character
   new_historical_data$TimeStamp <- as.character( new_historical_data$TimeStamp )

   # and now prepend new_historical_data to current_ohlc_record
   combined_records <- rbind( new_historical_data , current_ohlc_record , stringsAsFactors = FALSE )

   # and coerce character dates back to a POSIXct date format prior to printing
   combined_records$TimeStamp <- as.POSIXct( combined_records$TimeStamp )

   # write combined_records to file
   write.table( combined_records , file = paste( instrument , "raw_OHLC_hourly" , sep = "_" ) , row.names = FALSE ,
                col.names = FALSE , sep = "," )

   added_data_length <- nrow( new_historical_data ) # length of added new data

   # and amend the instrument update file with the latest update information
   all_current_historical_data_list[ ii , 3 ] <- all_current_historical_data_list[ ii , 3 ] + added_data_length

   # write the updated instrument_hourly_update_file to file
   write.table( all_current_historical_data_list , file = "instrument_hourly_update_file" , row.names = FALSE , col.names = FALSE )

} # end of for all_current_historical_data_list loop
As described in the previous post the function HisPricesDates is called to do the actual downloading, with the relevant dates for function input being read and calculated from the existing data file ( I have hard coded for hourly data but this can, of course, be changed or implemented as user input in the R session). As usual I have commented the script to explain what is going on.

However, one important caveat is that it is assumed that there is actually some Oanda data to download and prepend prior to the earliest date in the existing record, and there are no checks of this assumption. Therefore, the script might fail in unexpected ways if one attempts to reach too far back in history for the prependable data.

Friday, 27 October 2017

Updating Historical Data Using Oanda's API and R

Following on from my previous post about downloading historical data, this post shows how previously downloaded data may be updated and appended with new, more recent data without having to re-download all the old data all over again.

The main function to do this, HisPricesDates, downloads data between given dates as function inputs and is shown below.
## Download OHLC candles for one instrument between two dates from Oanda's
## v1 REST API and return them as a data frame.
##
## Args:
##   Granularity:  candle granularity string, e.g. "D" or "H1".
##   DayAlign:     value for the dailyAlignment query parameter.
##   TimeAlign:    timezone to which the returned TimeStamp is converted.
##   AccountToken: Oanda API bearer token.
##   Instrument:   instrument name, e.g. "EUR_USD".
##   Start, End:   start and end dates for the request.
##
## Returns: a data frame with columns TimeStamp (POSIXct), Open, High,
##          Low, Close, TickVolume and Complete.
HisPricesDates <- function( Granularity, DayAlign, TimeAlign, AccountToken, Instrument, Start, End ){

# a typical Oanda API call might look like
# https://api-fxtrade.oanda.com/v1/candles?instrument=EUR_USD&granularity=D&start=2014-03-21&end=2014-04-21&candleFormat=midpoint&includeFirst=false
# which is slowly built up by using the R paste function, commented at end of each line below

  httpaccount    <- "https://api-fxtrade.oanda.com"
  auth           <- c( Authorization = paste( "Bearer", AccountToken, sep = " " ) )
  QueryHistPrec  <- paste( httpaccount, "/v1/candles?instrument=", sep = "" ) # https://api-fxtrade.oanda.com/v1/candles?instrument=
  QueryHistPrec1 <- paste( QueryHistPrec, Instrument, sep = "" )              # https://api-fxtrade.oanda.com/v1/candles?instrument=EUR_USD
  qstart         <- paste( "start=", Start, sep = "" )                        # start=2014-03-21
  qend           <- paste( "end=", End, sep = "" )                            # end=2014-04-21
  qcandleFormat  <- "candleFormat=midpoint"                                   # candleFormat=midpoint
  qgranularity   <- paste( "granularity=", Granularity, sep = "" )            # granularity=D
  qdailyalignment <- paste( "dailyAlignment=", DayAlign, sep = "" )           # dailyAlignment=0
  qincludeFirst  <- "includeFirst=false"                                      # includeFirst=false
  QueryHistPrec2 <- paste( QueryHistPrec1, qgranularity, qstart, qend, qcandleFormat, qincludeFirst, qdailyalignment, sep = "&" )
  # perform the authenticated GET and parse the JSON response
  InstHistP     <- getURL( QueryHistPrec2, cainfo = system.file( "CurlSSL", "cacert.pem", package = "RCurl" ), httpheader = auth )
  InstHistPjson <- fromJSON( InstHistP, simplifyDataFrame = TRUE )
  Prices        <- data.frame( InstHistPjson[[ 3 ]] )
  # reformat "YYYY-MM-DDTHH:MM:SS..." into "YYYY-MM-DD HH:MM:SS"
  Prices$time   <- paste( substr( Prices$time, 1, 10 ), substr( Prices$time, 12, 19 ), sep = " " )
  colnames( Prices ) <- c( "TimeStamp", "Open", "High", "Low", "Close", "TickVolume", "Complete" )
  Prices$TimeStamp <- as.POSIXct( strptime( Prices$TimeStamp, "%Y-%m-%d %H:%M:%OS" ), origin = "1970-01-01", tz = "UTC" )
  attributes( Prices$TimeStamp )$tzone <- TimeAlign # convert display timezone to TimeAlign
  return( Prices )

}
This function is called by two R scripts, one for downloading daily data and one for intraday data.

The daily update script, which is shown next,
## cd to the daily data directory
setwd("~/Documents/octave/oanda_data/daily")

all_current_historical_data_list <- read.table( "instrument_daily_update_file" , header = FALSE , sep = "" ,
                                                colClasses = c( "character" , "Date" , "numeric" ) )

for ( ii in seq_len( nrow( all_current_historical_data_list ) ) ) {

  instrument <- all_current_historical_data_list[ ii , 1 ]
  # read second column of dates in all_current_historical_data_list as a date index
  date_ix <- as.Date( all_current_historical_data_list[ ii , 2 ] )
  todays_date <- as.Date( Sys.time() )

  # download the missing historical data from date_ix to todays_date, if and only if, date_ix != todays_date
  if ( date_ix + 1 != todays_date ) {

  new_historical_data <- HisPricesDates( Granularity = "D", DayAlign, TimeAlign, AccountToken, instrument,
                         date_ix , todays_date )

  # the new_historical_data might only try to add incomplete OHLC data, in which case do not actually
  # want to update, so only update if we will be adding new, complete OHLC information
  # ( && short-circuits, so row 2 is never indexed when fewer than 2 rows exist )
  if ( nrow( new_historical_data ) >= 2 && new_historical_data[ 2 , 7 ] == TRUE ) {

    # now do some data manipulation
    # expect date of last line in the update file == date of first line in new_historical_data
    if ( date_ix == as.Date( new_historical_data[ 1 , 1 ] ) ) { # this is the case if true
       new_historical_data <- new_historical_data[ -1 , ]       # so delete first row of new_historical_data
    }

    # similarly, expect last line of new_historical_data to be an incomplete OHLC bar
    if ( new_historical_data[ nrow( new_historical_data ) , 7 ] == FALSE ) {       # if so,
       new_historical_data <- new_historical_data[ -nrow( new_historical_data ) , ] # delete this last line
    }

    # append new_historical_data to the relevant raw data file
    write.table( new_historical_data , file = paste( instrument , "raw_OHLC_daily" , sep = "_" ) , row.names = FALSE , na = "" ,
                 col.names = FALSE , sep = "," , append = TRUE )

    added_data_length <- nrow( new_historical_data )
    new_last_date <- as.Date( new_historical_data[ added_data_length , 1 ] )

    # and amend the instrument update file with the latest update information
    all_current_historical_data_list[ ii , 2 ] <- new_last_date
    all_current_historical_data_list[ ii , 3 ] <- all_current_historical_data_list[ ii , 3 ] + added_data_length

    } # end of download if statement

  } # end of ( date_ix != todays_date ) if statement

} # end of for all_current_historical_data_list loop

# write the updated instrument_daily_update_file to file
write.table( all_current_historical_data_list , file = "instrument_daily_update_file" , row.names = FALSE , col.names = FALSE , na = "" )
has if statements as control structures to check that there is likely to be new daily data to actually download. It does this by checking a last_update date contained in an "instrument_daily_update_file" and comparing this with the current OS system time. If there is likely to be new data, the script runs and then updates this "instrument_daily_update_file." If not, the script exits with nothing having been done.

The intraday update script does not have the checks the daily script has because I assume there will always be some new intraday data available for download. In this case, the last_update date is read from the "instrument_update_file" purely to act as an input to the above HisPricesDates function. As a result, this script involves some data manipulation to ensure that duplicate data is not printed to file. This script is shown next and is heavily commented to explain what is happening.
## cd to the hourly data directory
setwd("~/Documents/octave/oanda_data")

all_current_historical_data_list <- read.table( "instrument_hourly_update_file" , header = FALSE , sep = "" ,
                                                colClasses = c( "character" , "Date" , "numeric" ) )

for ( ii in seq_len( nrow( all_current_historical_data_list ) ) ) {

   instrument <- all_current_historical_data_list[ ii , 1 ]

   # read second column of dates in all_current_historical_data_list as a date index
   date_ix <- as.Date( all_current_historical_data_list[ ii , 2 ] )

   todays_date <- as.Date( Sys.time() )

   # download the missing historical data from date_ix to todays_date. If date_ix == todays_date, will download all
   # hourly bars for today only.
   new_historical_data <- HisPricesDates( Granularity = "H1", DayAlign, TimeAlign, AccountToken, instrument,
                          date_ix , todays_date + 1 )

   # the new_historical_data will almost certainly have incomplete hourly OHLC data in its last line,
   # so delete this incomplete OHLC information
   if ( new_historical_data[ nrow( new_historical_data ) , 7 ] == FALSE ) {
        new_historical_data <- new_historical_data[ -nrow( new_historical_data ) , ]
   }

   # read the last line only of the current OHLC file for this instrument
   file <- paste( instrument , "raw_OHLC_hourly" , sep = "_" ) # get the filename

   system_command <- paste( "tail -1" , file , sep = " " )     # a Unix system command to read the last line of this file

   # read the file's last line
   old_historical_data <- read.csv( textConnection( system( system_command , intern = TRUE ) ) , header = FALSE , sep = "," ,
                                    stringsAsFactors = FALSE )

   old_historical_data_end_date_time <- old_historical_data[ 1 , 1 ]            # get the date value to be matched

   new_historical_data_date_times <- as.character( new_historical_data[ , 1 ] ) # vector to search for the above date value

   ix <- charmatch( old_historical_data_end_date_time , new_historical_data_date_times ) # get the matching index value

   # delete that part of new_historical_data which is already contained in the file
   new_historical_data <- new_historical_data[ -( 1 : ix ) , ]

   # append new_historical_data to the relevant raw data file
   write.table( new_historical_data , file = paste( instrument , "raw_OHLC_hourly" , sep = "_" ) , row.names = FALSE , na = "" ,
                 col.names = FALSE , sep = "," , append = TRUE )

   added_data_length <- nrow( new_historical_data )                          # length of added new data
   new_last_date <- as.Date( new_historical_data[ added_data_length , 1 ] )  # date of last update

   # and amend the instrument update file with the latest update information
   all_current_historical_data_list[ ii , 2 ] <- new_last_date
   all_current_historical_data_list[ ii , 3 ] <- all_current_historical_data_list[ ii , 3 ] + added_data_length

} # end of for all_current_historical_data_list loop

# finally, write the updated instrument_hourly_update_file to file
write.table( all_current_historical_data_list , file = "instrument_hourly_update_file" , row.names = FALSE , col.names = FALSE , na = "" )
There is one important thing to point out on lines 29 to 33, which is that this section of code relies on a Unix based command, which in turn means that this almost certainly will not work on Windows based OSes. Windows users will have to find their own hack to load just the last line of the relevant file, or put up with loading the whole historical data file and indexing just the last line.