====== Print ====== > pi [1] 3.141593 > sqrt(2) [1] 1.414214 When you enter expressions like that, R evaluates the expression and then implicitly calls the print function. So the previous example is identical to this: > print(pi) [1] 3.141593 > print(sqrt(2)) [1] 1.414214 The print function has __a significant limitation__, however: it prints __only one object at a time__. Trying to print multiple items gives this mind-numbing error message: > print("The zero occurs at", 2*pi, "radians.") Error in print.default("The zero occurs at", 2 * pi, "radians.") : unimplemented type 'character' in 'asLogical' Instead, use **cat**: > cat("The zero occurs at", 2*pi, "radians.", "\n") The zero occurs at 6.283185 radians. Note: cat puts a space between the items but does not append a line feed, which is why "\n" is passed explicitly. cat can also print a simple vector: > fib <- c(0,1,1,2,3,5,8,13,21,34) > cat("The first few Fibonacci numbers are:", fib, "...\n") The first few Fibonacci numbers are: 0 1 1 2 3 5 8 13 21 34 ... A serious limitation of cat, however, is that it __cannot print compound data structures such as matrices and lists__. ====== Variables ====== > variable_name <- 3 Variables need no declaration, and the same name can later be assigned a value of a different type: > x <- 3 > print(x) [1] 3 > x <- c("fee", "fie", "foe", "fum") > print(x) [1] "fee" "fie" "foe" "fum" ====== List ====== > ls() character(0) > x <- 10 > y <- 50 > z <- c("three", "blind", "mice") > f <- function(n,p) sqrt(p*(1-p)/n) > ls() [1] "f" "x" "y" "z" > ls.str() f : function (n, p) x : num 10 y : num 50 z : chr [1:3] "three" "blind" "mice" A variable whose name begins with "." is hidden from ls() unless all.names=TRUE is given: > .hidvar <- 10 > ls() [1] "f" "x" "y" "z" > ls(all.names=TRUE) [1] ".hidvar" "f" "x" "y" "z" ====== Deleting Variable ====== > x <- 2*pi > x [1] 6.283185 > rm(x) > x Error: object "x" not found Note: there is no "undo" for rm. Wipe out all variables in a session: > ls() [1] "f" "x" "y" "z" > rm(list=ls()) > ls() character(0) ====== Vector ====== > c(1,1,2,3,5,8,13,21) [1] 1 1 2 3 5 8 13 21 > c(1*pi, 2*pi, 3*pi, 4*pi) [1] 3.141593 6.283185 9.424778 12.566371 > c("Everyone", "loves", "stats.") [1] "Everyone" "loves" "stats." 
> c(TRUE,TRUE,FALSE,TRUE) [1] TRUE TRUE FALSE TRUE If the arguments to c(...) are themselves vectors, it __flattens them and combines them into one single vector__: > v1 <- c(1,2,3) > v2 <- c(4,5,6) > c(v1,v2) [1] 1 2 3 4 5 6 > v1 <- c(1,2,3) > v3 <- c("A","B","C") > c(v1,v3) [1] "1" "2" "3" "A" "B" "C" > c(3.1415, "foo") [1] "3.1415" "foo" > mode(c(3.1415, "foo")) [1] "character" ====== Basic (descriptive) Statistics ====== mean, median, standard deviation, variance, correlation, or covariance. mean(x) median(x) sd(x) var(x) cor(x, y) cov(x, y) Variable x, y should be numeric (number variable, see [[:level of measurement]]) > x <- c(0,1,1,2,3,5,8,13,21,34) > mean(x) [1] 8.8 > median(x) [1] 4 > sd(x) [1] 11.03328 > var(x) [1] 121.7333 > x <- c(0,1,1,2,3,5,8,13,21,34) > y <- log(x+1) > cor(x,y) [1] 0.9068053 > cov(x,y) [1] 11.49988 $$ r = \frac {\text{covariance (x, y)}} {sd(x) * sd(y)} $$ > x <- c(0,1,1,2,3,5,8,13,21,34) > y <- log(x+1) > cor(x,y) [1] 0.9068053 > cov(x,y)/(sd(x)*sd(y)) [1] 0.9068053 > cov(x,y)/sqrt(var(x)*var(y)) [1] 0.9068053 > x <- c(0,1,1,2,3,NA) > mean(x) [1] NA > sd(x) [1] NA > x <- c(0,1,1,2,3,NA) > mean(x, na.rm=TRUE) [1] 1.4 > sd(x, na.rm=TRUE) [1] 1.140175 data small <- c(0.6739635, 1.5524619, 0.3250562, 1.2143595, 1.3107692, 2.1739663, 1.6187899, 0.8872657, 1.9170283, 0.7767406) medium <- c(10.526448, 9.205156, 11.427756, 8.53318, 9.763317, 9.806662, 9.150245, 10.058465, 9.18233, 7.949692) big <- c(99.83624, 100.70852, 99.73202, 98.53608, 100.74444, 98.58961, 100.46707, 99.88068, 100.46724, 100.49814) dframe <- data.frame(small, medium, big) > dlist <- list(small,medium,big) > dlist [[1]] [1] 0.6739635 1.5524619 0.3250562 1.2143595 1.3107692 2.1739663 [7] 1.6187899 0.8872657 1.9170283 0.7767406 [[2]] [1] 10.526448 9.205156 11.427756 8.533180 9.763317 9.806662 [7] 9.150245 10.058465 9.182330 7.949692 [[3]] [1] 99.83624 100.70852 99.73202 98.53608 100.74444 98.58961 [7] 100.46707 99.88068 100.46724 100.49814 > lapply (dlist,mean) [[1]] [1] 
1.24504 [[2]] [1] 9.560325 [[3]] [1] 99.946 > sapply(dlist, sd) [1] 0.5844025 0.9920282 0.8135503 > print(dframe) small medium big 1 0.6739635 10.526448 99.83624 2 1.5524619 9.205156 100.70852 3 0.3250562 11.427756 99.73202 4 1.2143595 8.533180 98.53608 5 1.3107692 9.763317 100.74444 6 2.1739663 9.806662 98.58961 7 1.6187899 9.150245 100.46707 8 0.8872657 10.058465 99.88068 9 1.9170283 9.182330 100.46724 10 0.7767406 7.949692 100.49814 > mean(dframe) # This does not work. > colMeans(dframe) # This works. Note the function name: col+Means. small medium big 1.245040 9.560325 99.946003 > sd(dframe) # Not work. > sd(dframe$small) # Instead, do separately. > sd(dframe$medium) > sd(dframe$big) > # OR . . . . > sapply(dframe, sd) small medium big 0.5844025 0.9920282 0.8135503 # then . . . > sapply(dframe, mean) small medium big 1.245040 9.560325 99.946004 > var(dframe) small medium big small 0.34152627 -0.21516416 -0.04005275 medium -0.21516416 0.98411974 -0.09253855 big -0.04005275 -0.09253855 0.66186326 > cor(dframe) small medium big small 1.00000000 -0.3711367 -0.08424345 medium -0.37113670 1.0000000 -0.11466070 big -0.08424345 -0.1146607 1.00000000 > cov(dframe) small medium big small 0.34152627 -0.21516416 -0.04005275 medium -0.21516416 0.98411974 -0.09253855 big -0.04005275 -0.09253855 0.66186326 ====== Sequence ====== > 1:5 [1] 1 2 3 4 5 > seq(from=1, to=5, by=2) [1] 1 3 5 > rep(1, times=5) [1] 1 1 1 1 1 > seq(from=0, to=20, length.out=5) [1] 0 5 10 15 20 > seq(from=0, to=100, length.out=5) [1] 0 25 50 75 100 sequence (''seq'') 는 x축의 구성을 임의적으로 만들 때 유용. 예를 들면, normal distribution graph 등. 
x <- seq(-4, 4, length=10000) y <- dnorm(x, mean=0, sd=1) plot(x, y, type="l", lwd=1) ====== Comparing Vectors ====== > a <- 3 > a == pi # Test for equality [1] FALSE > a != pi # Test for inequality [1] TRUE > a < pi [1] TRUE > a > pi [1] FALSE > a <= pi [1] TRUE > a >= pi [1] FALSE > a <- var(dframe) > b <- cov(dframe) > a == b small medium big small TRUE TRUE TRUE medium TRUE TRUE TRUE big TRUE TRUE TRUE > > v <- c( 3, pi, 4) > w <- c(pi, pi, pi) > v == w # Compare two 3-element vectors [1] FALSE TRUE FALSE # Result is a 3-element vector > v != w [1] TRUE FALSE TRUE > v < w [1] TRUE FALSE FALSE > v <= w [1] TRUE TRUE FALSE > v > w [1] FALSE FALSE TRUE > v >= w [1] FALSE TRUE TRUE > v <- c(3, pi, 4) > v == pi # Compare a 3-element vector against one number [1] FALSE TRUE FALSE > v != pi [1] TRUE FALSE TRUE . . . > v <- c(3, pi, 4) > any(v == pi) # Return TRUE if any element of v equals pi [1] TRUE > all(v == 0) # Return TRUE if all elements of v are zero [1] FALSE ====== Selecting Vector Elements ====== > fib <- c(0,1,1,2,3,5,8,13,21,34) > fib [1] 0 1 1 2 3 5 8 13 21 34 > fib[1] [1] 0 > fib[2] [1] 1 > fib[3] [1] 1 > fib[4] [1] 2 > fib[5] [1] 3 > fib[1:3] # Select elements 1 through 3 [1] 0 1 1 > fib[4:9] # Select elements 4 through 9 [1] 2 3 5 8 13 21 > fib[c(1,2,4,8)] [1] 0 1 2 13 > fib[-1] # Ignore first element [1] 1 1 2 3 5 8 13 21 34 > fib[1:3] # As before [1] 0 1 1 > fib[-(1:3)] # Invert sign of index to exclude instead of select [1] 2 3 5 8 13 21 34 > fib < 10 # This vector is TRUE wherever fib is less than 10 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE > fib[fib < 10] # Use that vector to select elements less than 10 [1] 0 1 1 2 3 5 8 > fib %% 2 == 0 # This vector is TRUE wherever fib is even [1] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE > fib[fib %% 2 == 0] # Use that vector to select the even elements [1] 0 2 8 34 Select all elements greater than the median v[ v > median(v) ] Select all elements in the lower and upper 5% v[ (v < quantile(v,0.05)) | (v > quantile(v,0.95)) 
] Select all elements that exceed ±2 standard deviations from the mean v[ abs(v-mean(v)) > 2*sd(v) ] Select all elements that are neither NA nor NULL v[ !is.na(v) & !is.null(v) ] > years <- c(1960, 1964, 1976, 1994) > names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton") > years Kennedy Johnson Carter Clinton 1960 1964 1976 1994 > years["Carter"] Carter 1976 > years["Clinton"] Clinton 1994 > years[c("Carter","Clinton")] Carter Clinton 1976 1994 ====== Performing Vector Arithmetic ====== see [[:social network analysis]] > v <- c(11,12,13,14,15) > w <- c(1,2,3,4,5) > v + w [1] 12 14 16 18 20 > v - w [1] 10 10 10 10 10 > v * w [1] 11 24 39 56 75 > v / w [1] 11.000000 6.000000 4.333333 3.500000 3.000000 > w ^ v [1] 1 4096 1594323 268435456 30517578125 > w [1] 1 2 3 4 5 > mean(w) [1] 3 > w - mean(w) [1] -2 -1 0 1 2 > w [1] 1 2 3 4 5 > sd(w) [1] 1.581139 > (w - mean(w)) / sd(w) [1] -1.2649111 -0.6324555 0.0000000 0.6324555 1.2649111 get variance of v without using var() function. > w [1] 1 2 3 4 5 > sqrt(w) [1] 1.000000 1.414214 1.732051 2.000000 2.236068 > log(w) [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379 > sin(w) [1] 0.8414710 0.9092974 0.1411200 -0.7568025 -0.9589243 ^ Operator ^ Meaning ^ See also ^ | [ [[ | Indexing | Recipe 2.9 | | :: ::: | Access variables in a name space | | | $ @ | Component extraction, slot extraction | | | ''^'' | Exponentiation (right to left) | | | - + | Unary minus and plus | | | : | Sequence creation | Recipe 2.7, Recipe 7.14 | | %any% | Special operators | Discussion | | * / | Multiplication, division | Discussion | | + - | Addition, subtraction | | | == != < > <= >= | Comparison | Recipe 2.8 | | ! | Logical negation | | | & && | Logical "and", short-circuit "and" | | | ''|'' ''||'' | Logical "or", short-circuit "or" | | | ~ | Formula | Recipe 11.1 | | -> ->> | Rightward assignment | Recipe 2.2 | | = | Assignment (right to left) | Recipe 2.2 | | <- <<- | Assignment (right to left) | Recipe 2.2 | | ? 
| Help | Recipe 1.7 | %% Modulo operator %/% Integer division %*% Matrix multiplication %in% Returns TRUE if the left operand occurs in its right operand; FALSE otherwise classtaken = matrix(0,8,10) edge.list = matrix ( c(1,1,1,2,1,3,1,4,1,9, 2,2,2,5,2,7,2,8, 3,1,3,5,3,6,3,7,3,8, 4,2,4,6,4,9,4,10, 5,1,5,2,5,5,5,7,5,8, 6,2,6,3,6,4,6,7, 7,3,7,4,7,7,7,8, 8,1,8,2,8,6,8,9,8,10), byrow=T, nrow=36,ncol=2) classtaken[edge.list] = 1 rownames(classtaken) = c("a","b", "c", "d","e", "f", "g", "h") colnames(classtaken) = c("writer", "comtheo", "pr","adv", "broadc","internet","camshoot", "edit", "newmedia", "cmc") classtaken c = classtaken tc = t(classtaken) stu = c %*% tc class = tc %*% c stu class Replace every value less than 3 in the stu matrix with zero: stu[stu < 3] <- 0 ====== Defining a Function ====== function(param1, ..., paramN) { expr1 . . . exprM } > cv <- function(x) sd(x)/mean(x) > cv(1:10) [1] 0.5504819 > cv <- function(x) sd(x)/mean(x) > lapply(lst, cv) > gcd <- function(a,b) { + if (b == 0) return(a) + else return(gcd(b, a %% b)) + } ====== Using a script (file) ====== {{tag> statistics r "r basics"}}