# (The section above this chunk presumably did not use a gradient; here
# we minimize the mean-squared-residual loss by gradient descent.)

# Gradient of the mean-squared-residual loss with respect to v.
#
# Loss: y = mean((x - v)^2)  (equivalently sum((x - v)^2) / n).
# Differentiating with respect to v needs the chain rule:
#   dy/dv = (2 * (x - v) * -1) / n = -2 * mean(x - v)
# Derivation: see the differentiation section of
# http://commres.net/wiki/estimated_standard_deviation
#
# @param x numeric vector of observations
# @param v current scalar estimate
# @return scalar gradient dy/dv (the "ds" value)
gradient <- function(x, v) {
  residuals <- x - v
  # return(list("ds" = dx))  # (kept from original: list form not used)
  -2 * mean(residuals)
}

# Mean squared residual — the loss being minimized.
#
# @param x numeric vector of observations
# @param v current scalar estimate
# @return mean((x - v)^2)
msr <- function(x, v) {
  residuals <- x - v
  mean(residuals^2)
}

# Standardize the features.  NOTE(review): `x` is assumed to be defined
# earlier in this file — confirm against the preceding chunk.
zx <- (x - mean(x)) / sd(x)

# Pick one random starting value for v in (x - v).
v <- rnorm(1)

# Train the model with scaled features.
learning.rate <- 1e-1
nlen <- 75

# Preallocate the per-epoch traces instead of growing them with append()
# inside the loop (avoids O(n^2) copying).
msrs <- numeric(nlen)
vs <- numeric(nlen)

for (epoch in seq_len(nlen)) {
  # BUG FIX: the original called stats::residuals(zx, v), which extracts
  # residuals from a fitted-model object and errors on a plain numeric
  # vector; compute the residuals directly instead.
  residual <- zx - v
  msr.x <- msr(zx, v)
  msrs[epoch] <- msr.x        # record this epoch's loss
  grad <- gradient(zx, v)
  step.v <- grad * learning.rate
  v <- v - step.v             # next estimate of v
  vs[epoch] <- v              # record the trajectory of v
}

tail(msrs)
tail(vs)
plot(vs, msrs, type = "b")    # loss against the (scaled) v trajectory

# Map the scaled v values back to the original feature scale.
vs.orig <- (vs * sd(x)) + mean(x)
vs.orig                       # the last value is close to the minimizer

v.orig <- (v * sd(x)) + mean(x)
v.orig

plot(vs.orig, msrs, type = "b")