====== Gradient Descent ======
====== explanation ======
점차하강 = 조금씩 깍아서 원하는 기울기 (미분값) 찾기
prerequisite:
[[estimated_standard_deviation#실험적_수학적_이해|표준편차 추론에서 평균을 사용하는 이유: 실험적_수학적_이해]]
[[:deriviation of a and b in a simple regression]]
위의 문서는 a, b에 대한 값을 미분법을 이용해서 직접 구하였다. 컴퓨터로는 이렇게 하기가 쉽지 않다. 그렇다면 이 값을 반복계산을 이용해서 추출하는 방법은 없을까? gradient descent
우선 위의 문서에서 (두번째) 최소값이 되는 SS값을 찾는다고 설명했는데, 이는 MS값으로 대체해서 생각해도 된다.
\begin{eqnarray*}
\text{MS} & = & \frac {\text{SS}}{n}
\end{eqnarray*}
\begin{eqnarray*}
\text{for a (constant)} \\
\\
\dfrac{\text{d}}{\text{dv}} \text{MSE (Mean Square Error)} & = & \dfrac{\text{d}}{\text{dv}} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum \dfrac{\text{d}}{\text{dv}} \frac{{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-1) \;\;\;\; \\
& \because & \dfrac{\text{d}}{\text{dv for a}} (Y_i - (a+bX_i)) = -1 \\
& = & -2 \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 * \text{mean of residuals} \\
\end{eqnarray*}
아래 R code에서 gradient function을 참조.
\begin{eqnarray*}
\text{for b, (coefficient)} \\
\\
\dfrac{\text{d}}{\text{dv}} \frac{\sum{(Y_i - (a + bX_i))^2}}{N} & = & \sum \dfrac{\text{d}}{\text{dv}} \frac{{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-X_i) \;\;\;\; \\
& \because & \dfrac{\text{d}}{\text{dv for b}} (Y_i - (a+bX_i)) = -X_i \\
& = & -2 X_i \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 * X_i * \text{mean of residuals} \\
\\
\end{eqnarray*}
(미분을 이해한다는 것을 전제로) 위의 식은 b값이 변할 때 msr (mean square residual) 값이 어떻게 변하는가를 알려주는 것이다. 그리고 그것은 b값에 대한 residual의 총합에 (-2/N)*X값을 곱한 값이다.
====== R code ======
# d statquest explanation
# x <- c(0.5, 2.3, 2.9)
# y <- c(1.4, 1.9, 3.2)
rm(list=ls())
# set.seed(191)
n <- 300
x <- rnorm(n, 5, 1.2)
y <- 2.14 * x + rnorm(n, 0, 4)
# data <- data.frame(x, y)
data <- tibble(x = x, y = y)
mo <- lm(y~x)
summary(mo)
# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)
b1.init <- b1
b0.init <- b0
# Predict function:
predict <- function(x, b0, b1){
return (b0 + b1 * x)
}
# And loss function is:
residuals <- function(predictions, y) {
return(y - predictions)
}
loss_mse <- function(predictions, y){
residuals = y - predictions
return(mean(residuals ^ 2))
}
predictions <- predict(x, b0, b1)
residuals <- residuals(predictions, y)
loss = loss_mse(predictions, y)
data <- tibble(data.frame(x, y, predictions, residuals))
print(paste0("Loss is: ", round(loss)))
gradient <- function(x, y, predictions){
dinputs = y - predictions
db1 = -2 * mean(x * dinputs)
db0 = -2 * mean(dinputs)
return(list("db1" = db1, "db0" = db0))
}
gradients <- gradient(x, y, predictions)
print(gradients)
# Train the model with scaled features
x_scaled <- (x - mean(x)) / sd(x)
learning_rate = 1e-1
# Record Loss for each epoch:
# logs = list()
# bs=list()
b0s = c()
b1s = c()
mse = c()
nlen <- 80
for (epoch in 1:nlen){
# Predict all y values:
predictions = predict(x_scaled, b0, b1)
loss = loss_mse(predictions, y)
mse = append(mse, loss)
# logs = append(logs, loss)
if (epoch %% 10 == 0){
print(paste0("Epoch: ",epoch, ", Loss: ", round(loss, 5)))
}
gradients <- gradient(x_scaled, y, predictions)
db1 <- gradients$db1
db0 <- gradients$db0
b1 <- b1 - db1 * learning_rate
b0 <- b0 - db0 * learning_rate
b0s <- append(b0s, b0)
b1s <- append(b1s, b1)
}
# unscale coefficients to make them comprehensible
b0 = b0 - (mean(x) / sd(x)) * b1
b1 = b1 / sd(x)
# changes of estimators
b0s <- b0s - (mean(x) /sd(x)) * b1s
b1s <- b1s / sd(x)
parameters <- tibble(data.frame(b0s, b1s, mse))
cat(paste0("Slope: ", b1, ", \n", "Intercept: ", b0, "\n"))
summary(lm(y~x))$coefficients
ggplot(data, aes(x = x, y = y)) +
geom_point(size = 2) +
geom_abline(aes(intercept = b0s, slope = b1s),
data = parameters, linewidth = 0.5,
color = 'green') +
theme_classic() +
geom_abline(aes(intercept = b0s, slope = b1s),
data = parameters %>% slice_head(),
linewidth = 1, color = 'blue') +
geom_abline(aes(intercept = b0s, slope = b1s),
data = parameters %>% slice_tail(),
linewidth = 1, color = 'red') +
labs(title = 'Gradient descent. blue: start, red: end, green: gradients')
b0.init
b1.init
data
parameters
====== R output =====
> rm(list=ls())
> # set.seed(191)
> n <- 300
> x <- rnorm(n, 5, 1.2)
> y <- 2.14 * x + rnorm(n, 0, 4)
>
> # data <- data.frame(x, y)
> data <- tibble(x = x, y = y)
>
> mo <- lm(y~x)
> summary(mo)
Call:
lm(formula = y ~ x)
Residuals:
Min 1Q Median 3Q Max
-9.754 -2.729 -0.135 2.415 10.750
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.7794 0.9258 -0.842 0.401
x 2.2692 0.1793 12.658 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.951 on 298 degrees of freedom
Multiple R-squared: 0.3497, Adjusted R-squared: 0.3475
F-statistic: 160.2 on 1 and 298 DF, p-value: < 2.2e-16
>
> # set.seed(191)
> # Initialize random betas
> b1 = rnorm(1)
> b0 = rnorm(1)
>
> b1.init <- b1
> b0.init <- b0
>
> # Predict function:
> predict <- function(x, b0, b1){
+ return (b0 + b1 * x)
+ }
>
> # And loss function is:
> residuals <- function(predictions, y) {
+ return(y - predictions)
+ }
>
> loss_mse <- function(predictions, y){
+ residuals = y - predictions
+ return(mean(residuals ^ 2))
+ }
>
> predictions <- predict(x, b0, b1)
> residuals <- residuals(predictions, y)
> loss = loss_mse(predictions, y)
>
> data <- tibble(data.frame(x, y, predictions, residuals))
>
> print(paste0("Loss is: ", round(loss)))
[1] "Loss is: 393"
>
> gradient <- function(x, y, predictions){
+ dinputs = y - predictions
+ db1 = -2 * mean(x * dinputs)
+ db0 = -2 * mean(dinputs)
+
+ return(list("db1" = db1, "db0" = db0))
+ }
>
> gradients <- gradient(x, y, predictions)
> print(gradients)
$db1
[1] -200.6834
$db0
[1] -37.76994
>
> # Train the model with scaled features
> x_scaled <- (x - mean(x)) / sd(x)
>
> learning_rate = 1e-1
>
> # Record Loss for each epoch:
> # logs = list()
> # bs=list()
> b0s = c()
> b1s = c()
> mse = c()
>
> nlen <- 80
> for (epoch in 1:nlen){
+ # Predict all y values:
+ predictions = predict(x_scaled, b0, b1)
+ loss = loss_mse(predictions, y)
+ mse = append(mse, loss)
+ # logs = append(logs, loss)
+
+ if (epoch %% 10 == 0){
+ print(paste0("Epoch: ",epoch, ", Loss: ", round(loss, 5)))
+ }
+
+ gradients <- gradient(x_scaled, y, predictions)
+ db1 <- gradients$db1
+ db0 <- gradients$db0
+
+ b1 <- b1 - db1 * learning_rate
+ b0 <- b0 - db0 * learning_rate
+ b0s <- append(b0s, b0)
+ b1s <- append(b1s, b1)
+ }
[1] "Epoch: 10, Loss: 18.5393"
[1] "Epoch: 20, Loss: 15.54339"
[1] "Epoch: 30, Loss: 15.50879"
[1] "Epoch: 40, Loss: 15.50839"
[1] "Epoch: 50, Loss: 15.50839"
[1] "Epoch: 60, Loss: 15.50839"
[1] "Epoch: 70, Loss: 15.50839"
[1] "Epoch: 80, Loss: 15.50839"
>
> # unscale coefficients to make them comprehensible
> b0 = b0 - (mean(x) / sd(x)) * b1
> b1 = b1 / sd(x)
>
> # changes of estimators
> b0s <- b0s - (mean(x) /sd(x)) * b1s
> b1s <- b1s / sd(x)
>
> parameters <- tibble(data.frame(b0s, b1s, mse))
>
> cat(paste0("Slope: ", b1, ", \n", "Intercept: ", b0, "\n"))
Slope: 2.26922511738252,
Intercept: -0.779435058320381
> summary(lm(y~x))$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.7794352 0.9258064 -0.8418986 4.005198e-01
x 2.2692252 0.1792660 12.6584242 1.111614e-29
>
> ggplot(data, aes(x = x, y = y)) +
+ geom_point(size = 2) +
+ geom_abline(aes(intercept = b0s, slope = b1s),
+ data = parameters, linewidth = 0.5,
+ color = 'green') +
+ theme_classic() +
+ geom_abline(aes(intercept = b0s, slope = b1s),
+ data = parameters %>% slice_head(),
+ linewidth = 1, color = 'blue') +
+ geom_abline(aes(intercept = b0s, slope = b1s),
+ data = parameters %>% slice_tail(),
+ linewidth = 1, color = 'red') +
+ labs(title = 'Gradient descent. blue: start, red: end, green: gradients')
>
> b0.init
[1] -1.67967
> b1.init
[1] -1.323992
>
> data
# A tibble: 300 × 4
x y predictions residuals
1 4.13 6.74 -7.14 13.9
2 7.25 14.0 -11.3 25.3
3 6.09 13.5 -9.74 23.3
4 6.29 15.1 -10.0 25.1
5 4.40 3.81 -7.51 11.3
6 6.03 13.9 -9.67 23.5
7 6.97 12.1 -10.9 23.0
8 4.84 12.8 -8.09 20.9
9 6.85 17.2 -10.7 28.0
10 3.33 3.80 -6.08 9.88
# ℹ 290 more rows
# ℹ Use `print(n = ...)` to see more rows
> parameters
# A tibble: 80 × 3
b0s b1s mse
1 2.67 -0.379 183.
2 1.99 0.149 123.
3 1.44 0.571 84.3
4 1.00 0.910 59.6
5 0.652 1.18 43.7
6 0.369 1.40 33.6
7 0.142 1.57 27.1
8 -0.0397 1.71 22.9
9 -0.186 1.82 20.2
10 -0.303 1.91 18.5
# ℹ 70 more rows
#
{{:pasted:20250801-185727.png}}