====== Print ======
> pi
[1] 3.141593
> sqrt(2)
[1] 1.414214
When you enter expressions like that, R evaluates the expression and then implicitly calls the print function. So the previous example is identical to this:
> print(pi)
[1] 3.141593
> print(sqrt(2))
[1] 1.414214
The print function has __a significant limitation__, however: it prints __only one object at a time__. Trying to print multiple items gives this mind-numbing error message:
> print("The zero occurs at", 2*pi, "radians.")
Error in print.default("The zero occurs at", 2 * pi, "radians.") :
unimplemented type 'character' in 'asLogical'
Instead, use **cat**
> cat("The zero occurs at", 2*pi, "radians.", "\n")
The zero occurs at 6.283185 radians.
Note: cat inserts a space between items by default, but it does not add a line feed; you must supply "\n" yourself.
cat can also print a simple vector:
> fib <- c(0,1,1,2,3,5,8,13,21,34)
> cat("The first few Fibonacci numbers are:", fib, "...\n")
The first few Fibonacci numbers are: 0 1 1 2 3 5 8 13 21 34 ...
A serious limitation of cat, however, is that it __cannot print compound data structures such as matrices and lists__.
====== Variables ======
> variable_name <- 3
Variables need no declaration, and the same name can later hold a value of a different type:
> x <- 3
> print(x)
[1] 3
> x <- c("fee", "fie", "foe", "fum")
> print(x)
[1] "fee" "fie" "foe" "fum"
====== List ======
> ls()
character(0)
> x <- 10
> y <- 50
> z <- c("three", "blind", "mice")
> f <- function(n,p) sqrt(p*(1-p)/n)
> ls()
[1] "f" "x" "y" "z"
> ls.str()
f : function (n, p)
x : num 10
y : num 50
z : chr [1:3] "three" "blind" "mice"
A variable whose name begins with "." is hidden; ls() does not list it unless all.names=TRUE is given:
> .hidvar <- 10
> ls()
[1] "f" "x" "y" "z"
> ls(all.names=TRUE)
[1] ".hidvar" "f" "x" "y" "z"
====== Deleting Variable ======
> x <- 2*pi
> x
[1] 6.283185
> rm(x)
> x
Error: object "x" not found
Note: there is no "undo"; once a variable is removed, it is gone.
Wipe out variables in a session:
> ls()
[1] "f" "x" "y" "z"
> rm(list=ls())
> ls()
character(0)
====== Vector ======
> c(1,1,2,3,5,8,13,21)
[1] 1 1 2 3 5 8 13 21
> c(1*pi, 2*pi, 3*pi, 4*pi)
[1] 3.141593 6.283185 9.424778 12.566371
> c("Everyone", "loves", "stats.")
[1] "Everyone" "loves" "stats."
> c(TRUE,TRUE,FALSE,TRUE)
[1] TRUE TRUE FALSE TRUE
If the arguments to c(...) are themselves vectors, it __flattens them and combines them into one single vector__:
> v1 <- c(1,2,3)
> v2 <- c(4,5,6)
> c(v1,v2)
[1] 1 2 3 4 5 6
> v1 <- c(1,2,3)
> v3 <- c("A","B","C")
> c(v1,v3)
[1] "1" "2" "3" "A" "B" "C"
> c(3.1415, "foo")
[1] "3.1415" "foo"
> mode(c(3.1415, "foo"))
[1] "character"
====== Basic (descriptive) Statistics ======
mean, median, standard deviation, variance, correlation, or covariance.
mean(x)
median(x)
sd(x)
var(x)
cor(x, y)
cov(x, y)
The variables x and y must be numeric (see [[:level of measurement]]).
> x <- c(0,1,1,2,3,5,8,13,21,34)
> mean(x)
[1] 8.8
> median(x)
[1] 4
> sd(x)
[1] 11.03328
> var(x)
[1] 121.7333
> x <- c(0,1,1,2,3,5,8,13,21,34)
> y <- log(x+1)
> cor(x,y)
[1] 0.9068053
> cov(x,y)
[1] 11.49988
$$ r = \frac {\text{covariance (x, y)}} {sd(x) * sd(y)} $$
> x <- c(0,1,1,2,3,5,8,13,21,34)
> y <- log(x+1)
> cor(x,y)
[1] 0.9068053
> cov(x,y)/(sd(x)*sd(y))
[1] 0.9068053
> cov(x,y)/sqrt(var(x)*var(y))
[1] 0.9068053
> x <- c(0,1,1,2,3,NA)
> mean(x)
[1] NA
> sd(x)
[1] NA
> x <- c(0,1,1,2,3,NA)
> mean(x, na.rm=TRUE)
[1] 1.4
> sd(x, na.rm=TRUE)
[1] 1.140175
Sample data:
# Three numeric sample vectors of 10 values each.
small <- c(0.6739635, 1.5524619, 0.3250562, 1.2143595, 1.3107692, 2.1739663, 1.6187899, 0.8872657, 1.9170283, 0.7767406)
medium <- c(10.526448, 9.205156, 11.427756, 8.53318, 9.763317, 9.806662, 9.150245, 10.058465, 9.18233, 7.949692)
big <- c(99.83624, 100.70852, 99.73202, 98.53608, 100.74444, 98.58961, 100.46707, 99.88068, 100.46724, 100.49814)
# Combine the vectors as the columns of one data frame; the columns are
# named after the variables (small, medium, big).
dframe <- data.frame(small, medium, big)
> dlist <- list(small,medium,big)
> dlist
[[1]]
[1] 0.6739635 1.5524619 0.3250562 1.2143595 1.3107692 2.1739663
[7] 1.6187899 0.8872657 1.9170283 0.7767406
[[2]]
[1] 10.526448 9.205156 11.427756 8.533180 9.763317 9.806662
[7] 9.150245 10.058465 9.182330 7.949692
[[3]]
[1] 99.83624 100.70852 99.73202 98.53608 100.74444 98.58961
[7] 100.46707 99.88068 100.46724 100.49814
> lapply (dlist,mean)
[[1]]
[1] 1.24504
[[2]]
[1] 9.560325
[[3]]
[1] 99.946
> sapply(dlist, sd)
[1] 0.5844025 0.9920282 0.8135503
> print(dframe)
small medium big
1 0.6739635 10.526448 99.83624
2 1.5524619 9.205156 100.70852
3 0.3250562 11.427756 99.73202
4 1.2143595 8.533180 98.53608
5 1.3107692 9.763317 100.74444
6 2.1739663 9.806662 98.58961
7 1.6187899 9.150245 100.46707
8 0.8872657 10.058465 99.88068
9 1.9170283 9.182330 100.46724
10 0.7767406 7.949692 100.49814
> mean(dframe) # This does not work.
> colMeans(dframe) # This works. Note the function name: col+Means.
small medium big
1.245040 9.560325 99.946003
> sd(dframe) # This does not work either.
> sd(dframe$small) # Instead, do separately.
> sd(dframe$medium)
> sd(dframe$big)
> # OR . . . .
> sapply(dframe, sd)
small medium big
0.5844025 0.9920282 0.8135503
# then . . .
> sapply(dframe, mean)
small medium big
1.245040 9.560325 99.946004
> var(dframe)
small medium big
small 0.34152627 -0.21516416 -0.04005275
medium -0.21516416 0.98411974 -0.09253855
big -0.04005275 -0.09253855 0.66186326
> cor(dframe)
small medium big
small 1.00000000 -0.3711367 -0.08424345
medium -0.37113670 1.0000000 -0.11466070
big -0.08424345 -0.1146607 1.00000000
> cov(dframe)
small medium big
small 0.34152627 -0.21516416 -0.04005275
medium -0.21516416 0.98411974 -0.09253855
big -0.04005275 -0.09253855 0.66186326
====== Sequence ======
> 1:5
[1] 1 2 3 4 5
> seq(from=1, to=5, by=2)
[1] 1 3 5
> rep(1, times=5)
[1] 1 1 1 1 1
> seq(from=0, to=20, length.out=5)
[1] 0 5 10 15 20
> seq(from=0, to=100, length.out=5)
[1] 0 25 50 75 100
sequence (''seq'') 는 x축의 구성을 임의적으로 만들 때 유용. 예를 들면, normal distribution graph 등.
# Plot the standard normal density curve on the interval [-4, 4].
# x: 10000 evenly spaced points used as the horizontal axis. The argument
# is written out as `length.out`; the abbreviated `length=` only works
# through partial argument matching.
x <- seq(-4, 4, length.out = 10000)
# y: density of the normal distribution (mean 0, sd 1) at each x.
y <- dnorm(x, mean = 0, sd = 1)
# type = "l" draws a line instead of points; lwd = 1 is the line width.
plot(x, y, type = "l", lwd = 1)
====== Comparing Vectors ======
> a <- 3
> a == pi # Test for equality
[1] FALSE
> a != pi # Test for inequality
[1] TRUE
> a < pi
[1] TRUE
> a > pi
[1] FALSE
> a <= pi
[1] TRUE
> a >= pi
[1] FALSE
> a <- var(dframe)
> b <- cov(dframe)
> a == b
small medium big
small TRUE TRUE TRUE
medium TRUE TRUE TRUE
big TRUE TRUE TRUE
>
> v <- c( 3, pi, 4)
> w <- c(pi, pi, pi)
> v == w # Compare two 3-element vectors
[1] FALSE TRUE FALSE # Result is a 3-element vector
> v != w
[1] TRUE FALSE TRUE
> v < w
[1] TRUE FALSE FALSE
> v <= w
[1] TRUE TRUE FALSE
> v > w
[1] FALSE FALSE TRUE
> v >= w
[1] FALSE TRUE TRUE
> v <- c(3, pi, 4)
> v == pi # Compare a 3-element vector against one number
[1] FALSE TRUE FALSE
> v != pi
[1] TRUE FALSE TRUE
. . .
> v <- c(3, pi, 4)
> any(v == pi) # Return TRUE if any element of v equals pi
[1] TRUE
> all(v == 0) # Return TRUE if all elements of v are zero
[1] FALSE
====== Selecting Vector Elements ======
> fib <- c(0,1,1,2,3,5,8,13,21,34)
> fib
[1] 0 1 1 2 3 5 8 13 21 34
> fib[1]
[1] 0
> fib[2]
[1] 1
> fib[3]
[1] 1
> fib[4]
[1] 2
> fib[5]
[1] 3
> fib[1:3] # Select elements 1 through 3
[1] 0 1 1
> fib[4:9] # Select elements 4 through 9
[1] 2 3 5 8 13 21
> fib[c(1,2,4,8)]
[1] 0 1 2 13
> fib[-1] # Ignore first element
[1] 1 1 2 3 5 8 13 21 34
> fib[1:3] # As before
[1] 0 1 1
> fib[-(1:3)] # Invert sign of index to exclude instead of select
[1] 2 3 5 8 13 21 34
> fib < 10 # This vector is TRUE wherever fib is less than 10
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
> fib[fib < 10] # Use that vector to select elements less than 10
[1] 0 1 1 2 3 5 8
> fib %% 2 == 0 # This vector is TRUE wherever fib is even
[1] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE
> fib[fib %% 2 == 0] # Use that vector to select the even elements
[1] 0 2 8 34
v[ v > median(v) ]
Select all elements in the lower and upper 5%
v[ (v < quantile(v,0.05)) | (v > quantile(v,0.95)) ]
Select all elements that exceed ±2 standard deviations from the mean
v[ abs(v-mean(v)) > 2*sd(v) ]
Select all elements that are neither NA nor NULL
v[ !is.na(v) & !is.null(v) ]
> years <- c(1960, 1964, 1976, 1994)
> names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton")
> years
Kennedy Johnson Carter Clinton
1960 1964 1976 1994
> years["Carter"]
Carter
1976
> years["Clinton"]
Clinton
1994
> years[c("Carter","Clinton")]
Carter Clinton
1976 1994
====== Performing Vector Arithmetic ======
see [[:social network analysis]]
> v <- c(11,12,13,14,15)
> w <- c(1,2,3,4,5)
> v + w
[1] 12 14 16 18 20
> v - w
[1] 10 10 10 10 10
> v * w
[1] 11 24 39 56 75
> v / w
[1] 11.000000 6.000000 4.333333 3.500000 3.000000
> w ^ v
[1] 1 4096 1594323 268435456 30517578125
> w
[1] 1 2 3 4 5
> mean(w)
[1] 3
> w - mean(w)
[1] -2 -1 0 1 2
> w
[1] 1 2 3 4 5
> sd(w)
[1] 1.581139
> (w - mean(w)) / sd(w)
[1] -1.2649111 -0.6324555 0.0000000 0.6324555 1.2649111
Exercise: using vector arithmetic as above, compute the variance of v without using the var() function.
> w
[1] 1 2 3 4 5
> sqrt(w)
[1] 1.000000 1.414214 1.732051 2.000000 2.236068
> log(w)
[1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379
> sin(w)
[1] 0.8414710 0.9092974 0.1411200 -0.7568025 -0.9589243
^ Operator ^ Meaning ^ See also ^
| [ [[ | Indexing | Recipe 2.9 |
| :: ::: | Access variables in a name space | |
| $ @ | Component extraction, slot extraction | |
| ''^'' | Exponentiation (right to left) | |
| - + | Unary minus and plus | |
| : | Sequence creation | Recipe 2.7, Recipe 7.14 |
| %any% | Special operators | Discussion |
| * / | Multiplication, division | Discussion |
| + - | Addition, subtraction | |
| == != < > <= >= | Comparison | Recipe 2.8 |
| ! | Logical negation | |
| & && | Logical "and", short-circuit "and" | |
| ''|'' ''||'' | Logical "or", short-circuit "or" | |
| ~ | Formula | Recipe 11.1 |
| -> ->> | Rightward assignment | Recipe 2.2 |
| = | Assignment (right to left) | Recipe 2.2 |
| <- <<- | Assignment (right to left) | Recipe 2.2 |
| ? | Help | Recipe 1.7 |
%%
Modulo operator
%/%
Integer division
%*%
Matrix multiplication
%in%
Returns TRUE if the left operand occurs in its right operand; FALSE otherwise
# Build a student-by-class incidence matrix and derive the two co-occurrence
# matrices from it with matrix multiplication (%*%).

# 8 rows x 10 columns, initially all zeros.
classtaken <- matrix(0, 8, 10)

# Each row of edge.list is one (row index, column index) pair of classtaken.
edge.list <- matrix(
  c(1, 1, 1, 2, 1, 3, 1, 4, 1, 9,
    2, 2, 2, 5, 2, 7, 2, 8,
    3, 1, 3, 5, 3, 6, 3, 7, 3, 8,
    4, 2, 4, 6, 4, 9, 4, 10,
    5, 1, 5, 2, 5, 5, 5, 7, 5, 8,
    6, 2, 6, 3, 6, 4, 6, 7,
    7, 3, 7, 4, 7, 7, 7, 8,
    8, 1, 8, 2, 8, 6, 8, 9, 8, 10),
  byrow = TRUE, nrow = 36, ncol = 2
)

# Indexing with a two-column matrix addresses one cell per row of edge.list,
# so every listed (row, column) cell is set to 1.
classtaken[edge.list] <- 1

rownames(classtaken) <- c("a", "b", "c", "d", "e", "f", "g", "h")
colnames(classtaken) <- c("writer", "comtheo", "pr", "adv",
                          "broadc", "internet", "camshoot", "edit",
                          "newmedia", "cmc")
classtaken

# NOTE(review): the names `c` and `class` shadow base R functions. Calls such
# as c(...) still resolve to the function, but distinct names would be clearer.
c <- classtaken
tc <- t(classtaken)

# stu: 8 x 8; entry [i, j] counts the columns where rows i and j both have 1.
stu <- c %*% tc
# class: 10 x 10; entry [i, j] counts the rows where columns i and j both have 1.
class <- tc %*% c

stu
class
Replace the values less than 3 in the stu matrix with zeros:
# Logical-matrix indexing: every entry of stu that is below 3 is overwritten with 0.
stu[stu < 3] <- 0
====== Defining a Function ======
function(param1, ..., paramN) {
expr1
.
.
.
exprM
}
> cv <- function(x) sd(x)/mean(x)
> cv(1:10)
[1] 0.5504819
> cv <- function(x) sd(x)/mean(x)
> lapply(lst, cv)
> gcd <- function(a,b) {
+ if (b == 0) return(a)
+ else return(gcd(b, a %% b))
+ }
====== Using a script (file) ======
{{tag> statistics r "r basics"}}