Predict Upcoming Day's Volume and Range
The goal of this project is to develop a model that can predict the upcoming day's volume and range.
Read in End of Day Data and End of Night Data
## Read End of Night and End of Day Data
data <- read.csv("C:/Users/Analytics/Desktop/Work/Eurodollar Futures/EoN_EoD_Scrape/EoN_EoD_Scrape.csv", sep = " ", row.names = NULL)
names(data) <- c("Contract", "Last", "Change", "High", "Low", "Volume", "Day", "Updated")
## End of Day Data
eod <- data[as.numeric(substring(data$Updated,1,2)) > 12,]
eod <- eod[,-3]
eod$Volume <- gsub(",","",eod$Volume)
for(i in 2:5) {
eod[,i] <- as.numeric(as.character(eod[,i]))
}
eod$Last <- eod$Last * 100
eod$High <- eod$High * 100
eod$Low <- eod$Low * 100
eod <- eod[complete.cases(eod),]
## End of Night Data
eoN <- data[as.numeric(substring(data$Updated,1,2)) < 12,]
eoN <- eoN[,-3]
eoN$Volume <- gsub(",","",eoN$Volume)
for(i in 2:5) {
eoN[,i] <- as.numeric(as.character(eoN[,i]))
}
eoN$Last <- eoN$Last * 100
eoN$High <- eoN$High * 100
eoN$Low <- eoN$Low * 100
eoN <- eoN[complete.cases(eoN),]
End of Day Data
dayFrame <- data.frame()
for (u in unique(eod$Day)) {
df <- eod[eod$Day == u,][1:6,]
dFrame <- data.frame()
selVars <- c("Contract", "Last", "High", "Low", "Volume", "Range")
for(i in 1:6) {
for(s in selVars) {
if(s == "Range") {
dFrame[1,paste("Outright",i,paste(".",s, sep = ""), sep = "")] <- dFrame[1,paste("Outright",i,".High", sep = "")] - dFrame[1,paste("Outright",i,".Low", sep = "")]
} else {
dFrame[1,paste("Outright",i,paste(".",s, sep = ""), sep = "")] <- df[i,s]
}
}
}
dayFrame <- rbind(dayFrame, dFrame)
}
dayFrame$Date <- unique(eod$Day)
dayFrame <- dayFrame[complete.cases(dayFrame),]
Add Economic Schedule
library(xlsx)
econSchedule <- read.xlsx("C:/Users/Analytics/Google Drive/RFC Quant/CME Data/Economic Schedule.xlsx", 1)
dayFrame$Event <- NA
for(i in 1:length(dayFrame$Event)) {
dayFrame$Event[i] <- as.character(econSchedule$Event[as.Date(econSchedule$Date) == as.Date(dayFrame$Date[i])])
}
Add Previous Trading Day Info and MVA
todayVars <- names(dayFrame)
yestVars <- paste("y", todayVars, sep = "")
mva5Vars <- paste("mva", todayVars, sep = "")
mva5Vars <- mva5Vars[-grep("mvaEvent", mva5Vars)]
mva5Vars <- mva5Vars[-grep("mvaDate", mva5Vars)]
dayFrame[,yestVars] <- NA
dayFrame[,mva5Vars] <- NA
for(i in 2:length(dayFrame$Event)) {
dayFrame[i,yestVars] <- dayFrame[i-1, todayVars]
}
for(i in 6:length(dayFrame$Event)) {
for(mva in 1:length(mva5Vars)) {
dayFrame[i, mva5Vars[mva]] <- mean(dayFrame[(i-5):(i-1), todayVars[mva]])
}
}
Combine Historical Data with Predicted Day
myDate <- Sys.Date()
myDay <- dayFrame[length(dayFrame[,1]),]
myDay$Date <- myDate
myDay$yEvent[1] <- myDay$Event[1]
myDay$Event[1] <- as.character(econSchedule$Event[econSchedule$Date == myDate])
myDay[1,yestVars] <- dayFrame[length(dayFrame[,1]), todayVars]
for(mva in 1:length(mva5Vars)) {
myDay[1, mva5Vars[mva]] <- mean(dayFrame[(length(dayFrame[,1])-4):length(dayFrame[,1]), todayVars[mva]])
}
dayFrame$Date <- as.character(dayFrame$Date)
myDay$Date <- as.character(myDay$Date)
dayFrame <- rbind(dayFrame, myDay)
Filter Out Unwanted Columns
volCols <- names(dayFrame)[grepl("Volume", names(dayFrame))]
rangeCols <- names(dayFrame)[grepl("Range", names(dayFrame))]
predCols <- c(volCols, rangeCols)
predFrame <- dayFrame[,predCols]
predFrame$Event <- dayFrame$Event
predFrame$yEvent <- dayFrame$yEvent
predFrame <- predFrame[complete.cases(predFrame),]
Scale Columns
predFrame_s <- scale(predFrame[,predCols])
## Create pcenter and pscale to unscale data later
pcenter <- attr(predFrame_s, "scaled:center")
pscale <- attr(predFrame_s, "scaled:scale")
## Create Final data frame
predFrame <- as.data.frame(cbind(predFrame$Event, predFrame$yEvent,predFrame_s))
names(predFrame)[1] <- "Event"
names(predFrame)[2] <- "yEvent"
predFrame$Event <- as.factor(predFrame$Event)
predFrame$yEvent <- as.factor(predFrame$yEvent)
for(pc in predCols) {
predFrame[,pc] <- as.numeric(as.character(predFrame[,pc]))
}
Build Prediction Model
library(caret)
## inTrain <- createDataPartition(y = predFrame$Outright5.Volume, p = .7, list = F)
## training <- predFrame[inTrain,]
## testing <- predFrame[-inTrain,]
training <- predFrame[1:(length(predFrame[,1])-1),]
testing <- predFrame[length(predFrame[,1]),]
unscale <- function(scaledpt, centervec, scalevec) {
scaledpt*scalevec + centervec
}
formVars <- names(training)[-c(3,4,5,6,7,8,21,22,23,24,25,26)]
library(randomForest)
finalFrame <- data.frame()
for(i in 1:6) {
predVarRange <- paste("Outright",i,".Range", sep = "")
predVarVol <- paste("Outright",i,".Volume", sep = "")
formulaRange <- paste(predVarRange, paste(formVars, collapse = "+"), sep = "~")
formulaVol <- paste(predVarVol, paste(formVars, collapse = "+"), sep = "~")
## RPart Fit Range
rpfitRange <- train(as.formula(formulaRange), method = "rpart", data = training)
rpPredictRange <- predict(rpfitRange, testing)
##msre_rpart_range <- mean(sqrt((testing$Outright5.Range-rpPredictRange)^2))
rp_result_range <- unscale(rpPredictRange, pcenter, pscale)[predVarRange]
## RPart Fit Volume
rpfitVol <- train(as.formula(formulaVol), method = "rpart", data = training)
rpPredictVol <- predict(rpfitVol, testing)
##msre_rpart_vol <- mean(sqrt((testing$Outright5.Volume-rpPredictVol)^2))
rp_result_vol <- unscale(rpPredictVol, pcenter, pscale)[predVarVol]
rp_result_range
## Linear Regression Range
glm_fit_range <- train(as.formula(formulaRange), method = "glm", data = training)
glmPredictRange <- predict(glm_fit_range, testing)
## msre_glm_range <- mean(sqrt((testing$Outright5.Range-glmPredictRange)^2))
glm_result_range <- unscale(glmPredictRange, pcenter, pscale)[predVarRange]
## Linear Regression Volume
glm_fit_vol <- train(as.formula(formulaVol), method = "glm", data = training)
glmPredictVol <- predict(glm_fit_vol, testing)
## msre_glm_vol <- mean(sqrt((testing$Outright5.Volume-glmPredictVol)^2))
glm_result_vol <- unscale(glmPredictVol, pcenter, pscale)[predVarVol]
glm_result_range
glm_result_vol
## Boosting Model Range
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 10)
gbm_fit_range <- train(as.formula(formulaRange), method = "gbm",
data = training,
trControl = fitControl,
verbose = FALSE)
gbmPredictRange <- predict(gbm_fit_range, testing)
##msre_gbm_range <- mean(sqrt((testing$Outright5.Range-gbmPredictRange)^2))
gbm_result_range <- unscale(gbmPredictRange, pcenter, pscale)[predVarRange]
## Boosting Model Volume
gbm_fit_vol <- train(as.formula(formulaVol), method = "gbm", data = training)
gbmPredictVol <- predict(gbm_fit_vol, testing)
## msre_gbm_vol <- mean(sqrt((testing$Outright5.Volume-gbmPredictVol)^2))
gbm_result_vol <- unscale(gbmPredictVol, pcenter, pscale)[predVarVol]
gbm_result_range
gbm_result_vol
## Random Forest Range
rf_fit_range <- randomForest(as.formula(formulaRange), data = training)
rfPredictRange <- predict(rf_fit_range, testing)
##msre_rf_range <- mean(sqrt((testing$Outright5.Range-rfPredictRange)^2))
rf_result_range <- unscale(rfPredictRange, pcenter, pscale)[predVarRange]
## Random Forest Volume
rf_fit_vol <- randomForest(as.formula(formulaVol), data = training)
rfPredictVol <- predict(rf_fit_vol, testing)
##msre_rf_vol <- mean(sqrt((testing$Outright5.Volume-rfPredictVol)^2))
rf_result_vol <- unscale(rfPredictVol, pcenter, pscale)[predVarVol]
rf_result_range
rf_result_vol
r <- c("Volume", "Range")
cls <- c("Linear", "DecisionTree", "Boosting", "RandomForest")
vec <- c(glm_result_vol, rp_result_vol, gbm_result_vol,rf_result_vol, glm_result_range,
gbm_result_range, rp_result_range, rf_result_range)
df <- as.data.frame(matrix(vec, nrow = 4))
df <- cbind(cls, df)
names(df) <- c("Model", "Volume(Predicted)", "Range(Predicted)")
df$`Volume(Predicted)` <- round(df$`Volume(Predicted)`, 0)
df$`Range(Predicted)` <- round(df$`Range(Predicted)`*2,0)/2
df$Model <- as.character(df$Model)
finalFrame <- rbind(finalFrame,df)
}
## Neural Network
## library(neuralnet)
Prediction Frame For 2015-12-01
options(scipen = 999)
library(htmlTable)
htmlTable(finalFrame,
rnames = c(rep("",24)),
n.rgroup = c(4,4,4,4,4,4),
rgroup = c("Outright 1", "Outright 2",
"Outright 3", "Outright 4",
"Outright 5", "Outright 6"))
Model | Volume(Predicted) | Range(Predicted) | |
---|---|---|---|
Outright 1 | |||
Linear | 139868 | 1.5 | |
DecisionTree | 163255 | 1 | |
Boosting | 163338 | 1.5 | |
RandomForest | 141511 | 1.5 | |
Outright 2 | |||
Linear | 149644 | 2.5 | |
DecisionTree | 193775 | 3 | |
Boosting | 185633 | 2.5 | |
RandomForest | 143209 | 2 | |
Outright 3 | |||
Linear | 170680 | 4 | |
DecisionTree | 196389 | 4 | |
Boosting | 185591 | 4 | |
RandomForest | 147524 | 3 | |
Outright 4 | |||
Linear | 171702 | 5 | |
DecisionTree | 166944 | 5 | |
Boosting | 129565 | 5 | |
RandomForest | 140906 | 4 | |
Outright 5 | |||
Linear | 217326 | 5.5 | |
DecisionTree | 224915 | 6 | |
Boosting | 229856 | 5.5 | |
RandomForest | 201921 | 4.5 | |
Outright 6 | |||
Linear | 158204 | 6 | |
DecisionTree | 134894 | 7 | |
Boosting | 136534 | 6.5 | |
RandomForest | 128232 | 5 |
No comments:
Post a Comment