# Install missing packages automatically install.packages(setdiff(c("quantmod", "tseries", "randomForest", "gbm"), installed.packages()[, "Package"])) # Library library(quantmod) library(tseries) library(randomForest) library(gbm) # analyze_stock() ------------------------------------------------------------- # Download a stock, compute daily log returns, check stationarity, fit a # next-day random forest and gradient boosting model on lagged returns, compare # them (RMSE / MPE), then produce a RECURSIVE multi-step forecast: the model is # iterated forward `horizon` days, each prediction fed back as the newest lag, # yielding a full predicted price PATH for the next `horizon` trading days. # # symbol, from, to - ticker and date range # horizon - number of days in the forecast path (5 = one week) # lags - past return lags used as predictors # train_frac - chronological train fraction (no shuffle) # seed, show_plots - reproducibility / plotting toggle # # Note: metrics in `results` are 1-DAY (next-day) errors. The path is the # iterated forecast; recursive multi-step errors compound and the path reflects # the conditional mean (smooth), not realistic volatility. # ----------------------------------------------------------------------------- ## recursive helper: iterate a 1-day model forward h steps forecast_path <- function(model, ret_vec, lags, h, gbm_iter = NULL) { s <- as.numeric(ret_vec) out <- numeric(h) for (i in seq_len(h)) { L <- length(s) newx <- as.data.frame(setNames( lapply(lags, function(k) s[L - (k - 1)]), paste0("lag", lags))) out[i] <- if (is.null(gbm_iter)) as.numeric(predict(model, newx)) else as.numeric(predict(model, newx, n.trees = gbm_iter)) s <- c(s, out[i]) # feed prediction back as next step's lag1 } out } analyze_stock <- function(symbol = "AAPL", from = "2018-01-01", to = Sys.Date(), horizon = 5, lags = c(1, 2, 3, 5, 10), train_frac = 0.90, seed = 1, show_plots = TRUE) { ## 1. Download -- auto.assign = FALSE so it works for ANY symbol quotes <- getSymbols(symbol, src = "yahoo", from = from, to = to, auto.assign = FALSE) adj <- na.omit(Ad(quotes)) ret <- na.omit(diff(log(adj))) colnames(ret) <- "log_return" ## 2. Stationarity of the daily returns adf <- suppressWarnings(adf.test(as.numeric(ret))) ## 3. Next-day target (y = r_t) + lagged-return predictors lagged <- lapply(lags, function(k) Lag(ret, k)) dat <- na.omit(do.call(merge, c(list(ret), lagged))) colnames(dat) <- c("y", paste0("lag", lags)) d <- data.frame(coredata(dat)) ## 4. Chronological split (no shuffle; 1-day target needs no purge) n <- nrow(d) cut <- floor(train_frac * n) train <- d[1:cut, ] test <- d[(cut + 1):n, ] ## 5. Random forest: tune mtry (OOB) + ntree, then fit set.seed(seed) p <- length(lags) best_mtry <- if (p > 1) { tt <- tuneRF(train[, -1], train$y, ntreeTry = 500, stepFactor = 1.5, improve = 0.01, trace = FALSE, plot = FALSE) tt[which.min(tt[, 2]), 1] } else 1 set.seed(seed) rf_big <- randomForest(y ~ ., data = train, mtry = best_mtry, ntree = 1000) best_ntree <- which.min(rf_big$mse) set.seed(seed) rf <- randomForest(y ~ ., data = train, mtry = best_mtry, ntree = best_ntree) ## 6. Gradient boosting: tune n.trees by CV, then fit set.seed(seed) gbm_fit <- gbm(y ~ ., data = train, distribution = "gaussian", n.trees = 1000, interaction.depth = 3, shrinkage = 0.01, bag.fraction = 0.8, cv.folds = 5) best_iter <- gbm.perf(gbm_fit, method = "cv", plot.it = FALSE) ## 7. One-day predictions + metrics rmse <- function(a, p) sqrt(mean((a - p)^2)) mpe <- function(a, p) mean((a - p) / a) * 100 # unstable near 0 denominators pr_tr_rw <- rep(0, nrow(train)) # random walk baseline: E[next return] = 0 pr_te_rw <- rep(0, nrow(test)) pr_tr_rf <- predict(rf, train); pr_te_rf <- predict(rf, test) pr_tr_gb <- predict(gbm_fit, train, n.trees = best_iter) pr_te_gb <- predict(gbm_fit, test, n.trees = best_iter) results <- data.frame( Model = c("Random Walk", "Random Forest", "Gradient Boosting"), Train_RMSE = c(rmse(train$y, pr_tr_rw), rmse(train$y, pr_tr_rf), rmse(train$y, pr_tr_gb)), Test_RMSE = c(rmse(test$y, pr_te_rw), rmse(test$y, pr_te_rf), rmse(test$y, pr_te_gb)), Train_MPE = c(mpe(train$y, pr_tr_rw), mpe(train$y, pr_tr_rf), mpe(train$y, pr_tr_gb)), Test_MPE = c(mpe(test$y, pr_te_rw), mpe(test$y, pr_te_rf), mpe(test$y, pr_te_gb)) ) ## 8. Recursive multi-step forecast -> predicted price PATH ret_vec <- as.numeric(ret) path_rf <- forecast_path(rf, ret_vec, lags, horizon) path_gb <- forecast_path(gbm_fit, ret_vec, lags, horizon, gbm_iter = best_iter) last_price <- as.numeric(last(adj)) forecast_tbl <- data.frame( Step = seq_len(horizon), RW_Price = rep(last_price, horizon), # flat: zero expected return RF_Return = path_rf, RF_Price = last_price * exp(cumsum(path_rf)), GBM_Return = path_gb, GBM_Price = last_price * exp(cumsum(path_gb)) ) ## 9. Summary (printed first) cat("Symbol:", symbol, "| from", as.character(from), "| horizon:", horizon, "day(s) | obs:", n, "\n") cat("ADF p-value:", signif(adf$p.value, 3), "(small => daily returns stationary)\n") cat("RF -> mtry:", best_mtry, " ntree:", best_ntree, " | GBM best n.trees:", best_iter, "\n\n") print(results, row.names = FALSE) cat("\nLast actual close:", round(last_price, 2), "| recursive forecast path (", horizon, "days):\n") print(forecast_tbl, row.names = FALSE) ## 10. Graphs LAST (drawn after the printed summary) if (show_plots) { ## (a) daily log returns plot(ret, main = paste(symbol, "Daily Log Returns")) ## (b) next-day RMSE: RF vs GBM rmse_mat <- rbind(Train = results$Train_RMSE, Test = results$Test_RMSE) colnames(rmse_mat) <- results$Model barplot(rmse_mat, beside = TRUE, col = c("steelblue", "tomato"), ylab = "RMSE", main = paste(symbol, "- RF vs GBM RMSE (next-day)"), legend.text = rownames(rmse_mat), args.legend = list(x = "topright", bty = "n")) ## (c) recursive forecast price path yr <- range(last_price, forecast_tbl$RF_Price, forecast_tbl$GBM_Price) plot(0:horizon, c(last_price, forecast_tbl$RF_Price), type = "b", col = "forestgreen", ylim = yr, xlab = "Days ahead", ylab = "Predicted price", main = paste0(symbol, " - recursive ", horizon, "-day forecast path")) lines(0:horizon, c(last_price, forecast_tbl$GBM_Price), type = "b", col = "tomato") abline(h = last_price, col = "gray40", lty = 2) # random walk (flat) legend("topleft", c("Random Walk", "Random Forest", "Gradient Boosting"), col = c("gray40", "forestgreen", "tomato"), lty = c(2, 1, 1), pch = c(NA, 1, 1), bty = "n") } invisible(list(symbol = symbol, horizon = horizon, returns = ret, adf = adf, train = train, test = test, rf = rf, gbm = gbm_fit, best_iter = best_iter, results = results, forecast_path = forecast_tbl)) } # ---- example usage ---- # analyze_stock("AAPL", from = "2018-01-01") # 5-day path (default) # analyze_stock("MSFT", from = "2015-06-01", horizon = 10) # 10-day path