Tuesday, December 1, 2015

How to Build a Daily Range and Volume Prediction Model in R

Predict Upcoming Day's Volume and Range

Predict Upcoming Day's Volume and Range

The goal of this project is to develop a model that can predict the upcoming day's volume and range.

Read in End of Day Data and End of Night Data

  ## Read End of Night and End of Day Data
  data <- read.csv("C:/Users/Analytics/Desktop/Work/Eurodollar Futures/EoN_EoD_Scrape/EoN_EoD_Scrape.csv", sep = " ", row.names = NULL)
  names(data) <- c("Contract", "Last", "Change", "High", "Low", "Volume", "Day", "Updated")

  ## End of Day Data
  eod <- data[as.numeric(substring(data$Updated,1,2)) > 12,]
  eod <- eod[,-3]
  eod$Volume <- gsub(",","",eod$Volume)

  for(i in 2:5) {
    eod[,i] <- as.numeric(as.character(eod[,i]))
  }
  eod$Last <- eod$Last * 100
  eod$High <- eod$High * 100
  eod$Low <- eod$Low * 100
  eod <- eod[complete.cases(eod),]

  ## End of Night Data
  eoN <- data[as.numeric(substring(data$Updated,1,2)) < 12,]
  eoN <- eoN[,-3]
  eoN$Volume <- gsub(",","",eoN$Volume)

  for(i in 2:5) {
    eoN[,i] <- as.numeric(as.character(eoN[,i]))
  }
  eoN$Last <- eoN$Last * 100
  eoN$High <- eoN$High * 100
  eoN$Low <- eoN$Low * 100
  eoN <- eoN[complete.cases(eoN),]

End of Day Data

  dayFrame <- data.frame()
  for (u in unique(eod$Day)) {
    df <- eod[eod$Day == u,][1:6,]
    dFrame <- data.frame()
    selVars <- c("Contract", "Last", "High", "Low", "Volume", "Range")
    for(i in 1:6) {
      for(s in selVars) {
        if(s == "Range") {
          dFrame[1,paste("Outright",i,paste(".",s, sep = ""), sep = "")] <- dFrame[1,paste("Outright",i,".High", sep = "")] - dFrame[1,paste("Outright",i,".Low", sep = "")]  
        } else {
          dFrame[1,paste("Outright",i,paste(".",s, sep = ""), sep = "")] <- df[i,s]
        }      
      }
    }
    dayFrame <- rbind(dayFrame, dFrame)
  }
  dayFrame$Date <- unique(eod$Day)

  dayFrame <- dayFrame[complete.cases(dayFrame),]

Add Economic Schedule

  library(xlsx)
  econSchedule <- read.xlsx("C:/Users/Analytics/Google Drive/RFC Quant/CME Data/Economic Schedule.xlsx", 1)
  dayFrame$Event <- NA
  for(i in 1:length(dayFrame$Event)) {
    dayFrame$Event[i] <- as.character(econSchedule$Event[as.Date(econSchedule$Date) == as.Date(dayFrame$Date[i])])
  }

Add Previous Trading Day Info and MVA

  todayVars <- names(dayFrame)
  yestVars <- paste("y", todayVars, sep = "")
  mva5Vars <- paste("mva", todayVars, sep = "")
  mva5Vars <- mva5Vars[-grep("mvaEvent", mva5Vars)]
  mva5Vars <- mva5Vars[-grep("mvaDate", mva5Vars)]
  dayFrame[,yestVars] <- NA
  dayFrame[,mva5Vars] <- NA
  for(i in 2:length(dayFrame$Event)) {
    dayFrame[i,yestVars] <- dayFrame[i-1, todayVars]
  }

  for(i in 6:length(dayFrame$Event)) {
    for(mva in 1:length(mva5Vars)) {
      dayFrame[i, mva5Vars[mva]] <- mean(dayFrame[(i-5):(i-1), todayVars[mva]])
    }
  }

Combine Historical Data with Predicted Day

  myDate <- Sys.Date()
  myDay <- dayFrame[length(dayFrame[,1]),]
  myDay$Date <- myDate
  myDay$yEvent[1] <- myDay$Event[1]
  myDay$Event[1] <- as.character(econSchedule$Event[econSchedule$Date == myDate])

  myDay[1,yestVars] <- dayFrame[length(dayFrame[,1]), todayVars]

  for(mva in 1:length(mva5Vars)) {
    myDay[1, mva5Vars[mva]] <- mean(dayFrame[(length(dayFrame[,1])-4):length(dayFrame[,1]), todayVars[mva]])
  }
  dayFrame$Date <- as.character(dayFrame$Date)
  myDay$Date <- as.character(myDay$Date)
  dayFrame <- rbind(dayFrame, myDay)

Filter Out Unwanted Columns

  volCols <- names(dayFrame)[grepl("Volume", names(dayFrame))]
  rangeCols <- names(dayFrame)[grepl("Range", names(dayFrame))]

  predCols <- c(volCols, rangeCols)
  predFrame <- dayFrame[,predCols]
  predFrame$Event <- dayFrame$Event
  predFrame$yEvent <- dayFrame$yEvent
  predFrame <- predFrame[complete.cases(predFrame),]

Scale Columns

  predFrame_s <- scale(predFrame[,predCols])

  ## Create pcenter and pscale to unscale data later
  pcenter <- attr(predFrame_s, "scaled:center")
  pscale <- attr(predFrame_s, "scaled:scale")

  ## Create Final data frame
  predFrame <- as.data.frame(cbind(predFrame$Event, predFrame$yEvent,predFrame_s))
  names(predFrame)[1] <- "Event"
  names(predFrame)[2] <- "yEvent"
  predFrame$Event <- as.factor(predFrame$Event)
  predFrame$yEvent <- as.factor(predFrame$yEvent)

  for(pc in predCols) {
    predFrame[,pc] <- as.numeric(as.character(predFrame[,pc]))
  }

Build Prediction Model

  library(caret)
##  inTrain <- createDataPartition(y = predFrame$Outright5.Volume, p = .7, list = F)
##  training <- predFrame[inTrain,]
##  testing <- predFrame[-inTrain,]
  training <- predFrame[1:(length(predFrame[,1])-1),]
  testing <- predFrame[length(predFrame[,1]),]

  unscale <- function(scaledpt, centervec, scalevec) {
    scaledpt*scalevec + centervec
  }

  formVars <- names(training)[-c(3,4,5,6,7,8,21,22,23,24,25,26)]

  library(randomForest)
  finalFrame <- data.frame()
  for(i in 1:6) {
    predVarRange <- paste("Outright",i,".Range", sep = "")
    predVarVol <- paste("Outright",i,".Volume", sep = "")
    formulaRange <- paste(predVarRange, paste(formVars, collapse = "+"), sep = "~")
    formulaVol <- paste(predVarVol, paste(formVars, collapse = "+"), sep = "~")

    ## RPart Fit Range
    rpfitRange <- train(as.formula(formulaRange), method = "rpart", data = training)
    rpPredictRange <- predict(rpfitRange, testing)
    ##msre_rpart_range <- mean(sqrt((testing$Outright5.Range-rpPredictRange)^2))
    rp_result_range <- unscale(rpPredictRange, pcenter, pscale)[predVarRange]
    ## RPart Fit Volume
    rpfitVol <- train(as.formula(formulaVol), method = "rpart", data = training)
    rpPredictVol <- predict(rpfitVol, testing)
    ##msre_rpart_vol <- mean(sqrt((testing$Outright5.Volume-rpPredictVol)^2))
    rp_result_vol <- unscale(rpPredictVol, pcenter, pscale)[predVarVol]
    rp_result_range

    ## Linear Regression Range
    glm_fit_range <- train(as.formula(formulaRange), method = "glm", data = training)
    glmPredictRange <- predict(glm_fit_range, testing)
    ## msre_glm_range <- mean(sqrt((testing$Outright5.Range-glmPredictRange)^2))
    glm_result_range <- unscale(glmPredictRange, pcenter, pscale)[predVarRange]
    ## Linear Regression Volume
    glm_fit_vol <- train(as.formula(formulaVol), method = "glm", data = training)
    glmPredictVol <- predict(glm_fit_vol, testing)
    ## msre_glm_vol <- mean(sqrt((testing$Outright5.Volume-glmPredictVol)^2))
    glm_result_vol <- unscale(glmPredictVol, pcenter, pscale)[predVarVol]
    glm_result_range
    glm_result_vol

    ## Boosting Model Range
    fitControl <- trainControl(method = "repeatedcv",
                               number = 10, 
                               repeats = 10)
    gbm_fit_range <- train(as.formula(formulaRange), method = "gbm", 
                           data = training,
                           trControl = fitControl,
                           verbose = FALSE)
    gbmPredictRange <- predict(gbm_fit_range, testing)
    ##msre_gbm_range <- mean(sqrt((testing$Outright5.Range-gbmPredictRange)^2))
    gbm_result_range <- unscale(gbmPredictRange, pcenter, pscale)[predVarRange]
    ## Boosting Model Volume
    gbm_fit_vol <- train(as.formula(formulaVol), method = "gbm", data = training)
    gbmPredictVol <- predict(gbm_fit_vol, testing)
    ## msre_gbm_vol <- mean(sqrt((testing$Outright5.Volume-gbmPredictVol)^2))
    gbm_result_vol <- unscale(gbmPredictVol, pcenter, pscale)[predVarVol]
    gbm_result_range
    gbm_result_vol

    ## Random Forest Range
    rf_fit_range <- randomForest(as.formula(formulaRange), data = training)
    rfPredictRange <- predict(rf_fit_range, testing)
    ##msre_rf_range <- mean(sqrt((testing$Outright5.Range-rfPredictRange)^2))
    rf_result_range <- unscale(rfPredictRange, pcenter, pscale)[predVarRange]
    ## Random Forest Volume
    rf_fit_vol <- randomForest(as.formula(formulaVol), data = training)
    rfPredictVol <- predict(rf_fit_vol, testing)
    ##msre_rf_vol <- mean(sqrt((testing$Outright5.Volume-rfPredictVol)^2))
    rf_result_vol <- unscale(rfPredictVol, pcenter, pscale)[predVarVol]
    rf_result_range
    rf_result_vol

    r <- c("Volume", "Range")
    cls <- c("Linear", "DecisionTree", "Boosting", "RandomForest")
    vec <- c(glm_result_vol, rp_result_vol, gbm_result_vol,rf_result_vol, glm_result_range,
                  gbm_result_range, rp_result_range, rf_result_range)
    df <- as.data.frame(matrix(vec, nrow = 4))
    df <- cbind(cls, df)
    names(df) <- c("Model", "Volume(Predicted)", "Range(Predicted)")
    df$`Volume(Predicted)` <- round(df$`Volume(Predicted)`, 0)
    df$`Range(Predicted)` <- round(df$`Range(Predicted)`*2,0)/2
    df$Model <- as.character(df$Model)
    finalFrame <- rbind(finalFrame,df)
  }



  ## Neural Network
##  library(neuralnet)

Prediction Frame For 2015-12-01

  options(scipen = 999)
  library(htmlTable)
  htmlTable(finalFrame,
            rnames = c(rep("",24)),
            n.rgroup = c(4,4,4,4,4,4),
            rgroup = c("Outright 1", "Outright 2",
                       "Outright 3", "Outright 4",
                       "Outright 5", "Outright 6"))
Model Volume(Predicted) Range(Predicted)
Outright 1
   Linear 139868 1.5
   DecisionTree 163255 1
   Boosting 163338 1.5
   RandomForest 141511 1.5
Outright 2
   Linear 149644 2.5
   DecisionTree 193775 3
   Boosting 185633 2.5
   RandomForest 143209 2
Outright 3
   Linear 170680 4
   DecisionTree 196389 4
   Boosting 185591 4
   RandomForest 147524 3
Outright 4
   Linear 171702 5
   DecisionTree 166944 5
   Boosting 129565 5
   RandomForest 140906 4
Outright 5
   Linear 217326 5.5
   DecisionTree 224915 6
   Boosting 229856 5.5
   RandomForest 201921 4.5
Outright 6
   Linear 158204 6
   DecisionTree 134894 7
   Boosting 136534 6.5
   RandomForest 128232 5

No comments:

Post a Comment