library(ggplot2)
library(ggpmisc)
library(tidyverse)
library(data.table)

# start with a clean workspace
rm(list = ls())

# helper: sum of squared deviations from the mean
ss <- function(x) {
  return(sum((x - mean(x))^2))
}

# data preparation
set.seed(101)
nx <- 50            # sample size of x
mx <- 4.5           # mean of x
sdx <- mx * 0.56    # sd of x
x <- rnorm(nx, mx, sdx)   # generate x
slp <- 4            # slope of x = coefficient, b

# y variable: linear in x plus noise
y <- slp * x + rnorm(nx, 0, slp * 3 * sdx)
data <- data.frame(x, y)
head(data)

# check with regression
mo <- lm(y ~ x, data = data)
summary(mo)

# graph
ggplot(data = data, aes(x = x, y = y)) +
  geom_point() +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq", "R2"))) +
  theme_classic()

# from what we know:
# get the covariance value (sum of cross-products over its df)
sp.yx <- sum((x - mean(x)) * (y - mean(y)))
df.yx <- length(y) - 1
sp.yx / df.yx
# check with the cov function
cov(x, y)

# correlation value
cov(x, y) / (sd(x) * sd(y))
cor(x, y)

# regression by hand: b and a
b <- sp.yx / ss(x)
# b2 <- cov(x, y) / var(x)   # equivalent route to b
a <- mean(y) - b * mean(x)
a
b

# check a and b against the lm fit
summary(mo)$coefficients[1]
summary(mo)$coefficients[2]
summary(mo)

fit.yx <- a + b * x       # predicted values of y from x
res <- y - fit.yx         # residual (error) component
reg <- fit.yx - mean(y)   # regression (explained) component
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.res + ss.reg           # should equal the total sum of squares
ss.tot <- ss(y)
ss.tot

plot(x, y)
abline(a, b, col = "red", lwd = 2)
plot(x, fit.yx)
plot(x, res)

# degrees of freedom
df.y <- length(y) - 1
df.reg <- 2 - 1           # two parameters (a, b) minus one
df.res <- df.y - df.reg
df.res

# R-squared: explained over total sum of squares
r.sq <- ss.reg / ss.tot
r.sq
summary(mo)$r.squared

# F-test and t-test for the slope
ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
f.cal <- ms.reg / ms.res
f.cal
pf(f.cal, df.reg, df.res, lower.tail = FALSE)
t.cal <- sqrt(f.cal)      # with 1 regression df, t = sqrt(F)
t.cal
se.b <- sqrt(ms.res / ss(x))
se.b
t.cal <- (b - 0) / se.b
t.cal
pt(t.cal, df = df.res, lower.tail = FALSE) * 2   # two-tailed p-value
summary(mo)

# getting a and b from gradient descent
a <- rnorm(1)   # random starting values
b <- rnorm(1)
a.start <- a
b.start <- b
a.start
b.start

# predict function (note: masks stats::predict for this session):
predict <- function(x, a, b) {
  return(a + b * x)
}

# and the loss pieces (residuals masks stats::residuals):
residuals <- function(fit, y) {
  return(y - fit)
}

# partial derivatives of the mean squared residual w.r.t. b and a
gradient <- function(x, res) {
  db <- -2 * mean(x * res)
  da <- -2 * mean(res)
  return(list("b" = db, "a" = da))
}

# to track the mean squared residual (the loss)
msrloss <- function(fit, y) {
  res <- residuals(fit, y)
  return(mean(res^2))
}

# train the model with scaled features
learning.rate <- 1e-1   # 0.1

# record the estimates and loss for each epoch:
as <- c()
bs <- c()
msrs <- c()
ssrs <- c()   # (unused)
mres <- c()

zx <- (x - mean(x)) / sd(x)   # z-score x
nlen <- 75
for (epoch in 1:nlen) {
  fit.val <- predict(zx, a, b)
  residual <- residuals(fit.val, y)
  loss <- msrloss(fit.val, y)
  mres <- append(mres, mean(residual))
  msrs <- append(msrs, loss)
  grad <- gradient(zx, residual)
  step.b <- grad$b * learning.rate
  step.a <- grad$a * learning.rate
  b <- b - step.b
  a <- a - step.a
  as <- append(as, a)
  bs <- append(bs, b)
}
msrs
mres
as
bs

# scaled a, b:
# unscale the coefficients to make them comprehensible
# see http://commres.net/wiki/gradient_descent#why_normalize_scale_or_make_z-score_xi
# and
# http://commres.net/wiki/gradient_descent#how_to_unnormalize_unscale_a_and_b
a <- a - (mean(x) / sd(x)) * b
b <- b / sd(x)
a
b

# changes of the estimators over the epochs, unscaled the same way
as <- as - (mean(x) / sd(x)) * bs
bs <- bs / sd(x)
as
bs
mres
msrs

parameters <- data.frame(as, bs, mres, msrs)
cat(paste0("Intercept: ", a, "\n", "Slope: ", b, "\n"))
summary(mo)$coefficients

# loss (mean squared residual) per epoch
msrs <- data.frame(msrs)
msrs.log <- data.table(epoch = 1:nlen, msrs)
ggplot(msrs.log, aes(epoch, msrs)) +
  geom_line(color = "blue") +
  theme_classic()

# mean residual per epoch
mres <- data.frame(mres)
mres.log <- data.table(epoch = 1:nlen, mres)
ggplot(mres.log, aes(epoch, mres)) +
  geom_line(color = "red") +
  theme_classic()

ch <- data.frame(mres, msrs)
ch
max(y)
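# A minimal sketch of the closed-form route to the same a and b: the normal
# equations, beta = (X'X)^(-1) X'y, in base R matrix algebra. `X` and
# `beta.hat` are names introduced here for illustration only.
X <- cbind(1, x)                            # design matrix: intercept column + x
beta.hat <- solve(t(X) %*% X, t(X) %*% y)   # solve the normal equations
beta.hat                                    # row 1 ~ a, row 2 ~ b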
ggplot(data, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_abline(aes(intercept = as, slope = bs), data = parameters,
              linewidth = 0.5, color = "green") +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq", "R2"))) +
  theme_classic() +
  geom_abline(aes(intercept = as, slope = bs),
              data = parameters %>% slice_head(),
              linewidth = 1, color = "blue") +
  geom_abline(aes(intercept = as, slope = bs),
              data = parameters %>% slice_tail(),
              linewidth = 1, color = "red") +
  labs(title = "Gradient descent. blue: start, red: end, green: per-epoch fits")

summary(mo)
a.start
b.start
a
b
summary(mo)$coefficients[1]
summary(mo)$coefficients[2]
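# A short sketch comparing the two routes numerically: lm()'s coefficients
# versus the unscaled gradient-descent estimates. The 1e-2 tolerance is an
# arbitrary assumption; raising nlen tightens the agreement.
coef(mo)          # closed-form (lm) estimates
c(a = a, b = b)   # gradient-descent estimates
all.equal(unname(coef(mo)), c(a, b), tolerance = 1e-2)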