====== Print ======

> pi 
[1] 3.141593
> sqrt(2)
[1] 1.414214

When you enter expressions like that, R evaluates the expression and then implicitly calls the print function. So the previous example is identical to this:

> print(pi)
[1] 3.141593
> print(sqrt(2))
[1] 1.414214

The print function has __a significant limitation__, however: it prints __only one object at a time__. Trying to print multiple items gives this mind-numbing error message:

> print("The zero occurs at", 2*pi, "radians.")
Error in print.default("The zero occurs at", 2 * pi, "radians.") : 
        unimplemented type 'character' in 'asLogical'

Instead, use **cat**

> cat("The zero occurs at", 2*pi, "radians.", "\n")
The zero occurs at 6.283185 radians.

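Note: cat separates the items with a space by default but does not add a line feed, so you must supply "\n" yourself. cat can also print a simple vector: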

> fib <- c(0,1,1,2,3,5,8,13,21,34)
> cat("The first few Fibonacci numbers are:", fib, "...\n")
The first few Fibonacci numbers are: 0 1 1 2 3 5 8 13 21 34 ...

A serious limitation of cat, however, is that it __cannot print compound data structures such as matrices and lists__.

====== Variables ======

> variable_name <- 3

Variables need no declaration, and the same name can later hold a value of a different type:

> x <- 3
> print(x)
[1] 3
> x <- c("fee", "fie", "foe", "fum")
> print(x)
[1] "fee" "fie" "foe" "fum"

====== List ======

> ls()
character(0)

> x <- 10
> y <- 50
> z <- c("three", "blind", "mice")
> f <- function(n,p) sqrt(p*(1-p)/n)
> ls()
[1] "f" "x" "y" "z"

> ls.str()
f : function (n, p)  
x :  num 10
y :  num 50
z :  chr [1:3] "three" "blind" "mice"

Variables whose names begin with a dot (".") are hidden; ls() lists them only with all.names=TRUE:

> .hidvar <- 10
> ls()
[1] "f" "x" "y" "z"
> ls(all.names=TRUE)
[1] ".hidvar" "f"       "x"       "y"       "z"

====== Deleting Variable ======

> x <- 2*pi
> x
[1] 6.283185
> rm(x)
> x
Error: object "x" not found

Note: there is no "undo" for rm.

To wipe out the variables in a session:

> ls()
[1] "f" "x" "y" "z"
> rm(list=ls())
> ls()
character(0)

====== Vector ======

> c(1,1,2,3,5,8,13,21)
[1]  1  1  2  3  5  8 13 21
> c(1*pi, 2*pi, 3*pi, 4*pi)
[1]  3.141593  6.283185  9.424778 12.566371
> c("Everyone", "loves", "stats.")
[1] "Everyone" "loves"    "stats."
> c(TRUE,TRUE,FALSE,TRUE)
[1]  TRUE  TRUE FALSE  TRUE

If the arguments to c(...) are themselves vectors, it __flattens them and combines them into one single vector__:

> v1 <- c(1,2,3)
> v2 <- c(4,5,6)
> c(v1,v2)
[1] 1 2 3 4 5 6

> v1 <- c(1,2,3)
> v3 <- c("A","B","C")
> c(v1,v3)
[1] "1" "2" "3" "A" "B" "C"

> c(3.1415, "foo")
[1] "3.1415" "foo"
> mode(c(3.1415, "foo"))
[1] "character"

====== Basic (descriptive) Statistics ======

Mean, median, standard deviation, variance, correlation, and covariance:


mean(x)
median(x)
sd(x)
var(x)
cor(x, y)
cov(x, y)

The variables x and y must be numeric (see [[:level of measurement]]).

> x <- c(0,1,1,2,3,5,8,13,21,34)
> mean(x)
[1] 8.8
> median(x)
[1] 4
> sd(x)
[1] 11.03328
> var(x)
[1] 121.7333

> x <- c(0,1,1,2,3,5,8,13,21,34)
> y <- log(x+1)
> cor(x,y)
[1] 0.9068053
> cov(x,y)
[1] 11.49988

$$ r = \frac {\text{covariance (x, y)}} {sd(x) * sd(y)} $$


> x <- c(0,1,1,2,3,5,8,13,21,34)
> y <- log(x+1)
> cor(x,y)
[1] 0.9068053
> cov(x,y)/(sd(x)*sd(y))
[1] 0.9068053
> cov(x,y)/sqrt(var(x)*var(y))
[1] 0.9068053

> x <- c(0,1,1,2,3,NA)
> mean(x)
[1] NA
> sd(x)
[1] NA

> x <- c(0,1,1,2,3,NA)
> mean(x, na.rm=TRUE)
[1] 1.4
> sd(x, na.rm=TRUE)
[1] 1.140175

data

# Three samples of ten values each, on different scales
# (their means are roughly 1, 10 and 100).
small <- c(
  0.6739635, 1.5524619, 0.3250562, 1.2143595, 1.3107692,
  2.1739663, 1.6187899, 0.8872657, 1.9170283, 0.7767406
)
medium <- c(
  10.526448, 9.205156, 11.427756, 8.53318, 9.763317,
  9.806662, 9.150245, 10.058465, 9.18233, 7.949692
)
big <- c(
  99.83624, 100.70852, 99.73202, 98.53608, 100.74444,
  98.58961, 100.46707, 99.88068, 100.46724, 100.49814
)

# Bind the samples column-wise into one data frame, one column per sample.
dframe <- data.frame(small = small, medium = medium, big = big)


> dlist <- list(small,medium,big)
> dlist
[[1]]
 [1] 0.6739635 1.5524619 0.3250562 1.2143595 1.3107692 2.1739663
 [7] 1.6187899 0.8872657 1.9170283 0.7767406

[[2]]
 [1] 10.526448  9.205156 11.427756  8.533180  9.763317  9.806662
 [7]  9.150245 10.058465  9.182330  7.949692

[[3]]
 [1]  99.83624 100.70852  99.73202  98.53608 100.74444  98.58961
 [7] 100.46707  99.88068 100.46724 100.49814
> lapply (dlist,mean)
[[1]]
[1] 1.24504

[[2]]
[1] 9.560325

[[3]]
[1] 99.946
> sapply(dlist, sd)
[1] 0.5844025 0.9920282 0.8135503


> print(dframe)
       small    medium       big
1  0.6739635 10.526448  99.83624
2  1.5524619  9.205156 100.70852
3  0.3250562 11.427756  99.73202
4  1.2143595  8.533180  98.53608
5  1.3107692  9.763317 100.74444
6  2.1739663  9.806662  98.58961
7  1.6187899  9.150245 100.46707
8  0.8872657 10.058465  99.88068
9  1.9170283  9.182330 100.46724
10 0.7767406  7.949692 100.49814
> mean(dframe)       # This does not work.
> colMeans(dframe)   # This works. Note the function name: col+Means.
    small    medium       big 
 1.245040  9.560325 99.946003 
> sd(dframe)         # Not work.
> sd(dframe$small)   # Instead, do separately.
> sd(dframe$medium)
> sd(dframe$big)
> # OR . . . . 
> sapply(dframe, sd)
    small    medium       big 
0.5844025 0.9920282 0.8135503 
# then . . . 
> sapply(dframe, mean)
    small    medium       big 
 1.245040  9.560325 99.946004

> var(dframe)
             small      medium         big
small   0.34152627 -0.21516416 -0.04005275
medium -0.21516416  0.98411974 -0.09253855
big    -0.04005275 -0.09253855  0.66186326

> cor(dframe)
             small     medium         big
small   1.00000000 -0.3711367 -0.08424345
medium -0.37113670  1.0000000 -0.11466070
big    -0.08424345 -0.1146607  1.00000000

> cov(dframe)
             small      medium         big
small   0.34152627 -0.21516416 -0.04005275
medium -0.21516416  0.98411974 -0.09253855
big    -0.04005275 -0.09253855  0.66186326

====== Sequence ======

> 1:5
[1] 1 2 3 4 5

> seq(from=1, to=5, by=2)
[1] 1 3 5

> rep(1, times=5)
[1] 1 1 1 1 1

> seq(from=0, to=20, length.out=5)
[1]  0  5 10 15 20
> seq(from=0, to=100, length.out=5)
[1]   0  25  50  75 100

sequence (''seq'') 는 x축의 구성을 임의적으로 만들 때 유용. 예를 들면, normal distribution graph 등.


# Plot the standard normal density curve.
# seq() builds the x-axis: 10000 evenly spaced points from -4 to 4.
# The argument is spelled out as length.out; a bare "length" only works
# through partial argument matching.
x <- seq(-4, 4, length.out = 10000)
# dnorm() gives the normal density (mean 0, sd 1) at every point of x.
y <- dnorm(x, mean = 0, sd = 1)
# type = "l" draws a connected line rather than individual points.
plot(x, y, type = "l", lwd = 1)

====== Comparing Vectors ======

> a <- 3
> a == pi     # Test for equality
[1] FALSE
> a != pi     # Test for inequality
[1] TRUE
> a < pi
[1] TRUE
> a > pi
[1] FALSE
> a <= pi
[1] TRUE
> a >= pi
[1] FALSE


> a <- var(dframe)
> b <- cov(dframe)
> a == b
       small medium  big
small   TRUE   TRUE TRUE
medium  TRUE   TRUE TRUE
big     TRUE   TRUE TRUE
>

> v <- c( 3, pi,  4)
> w <- c(pi, pi, pi)
> v == w                  # Compare two 3-element vectors
[1] FALSE  TRUE FALSE     # Result is a 3-element vector
> v != w
[1]  TRUE FALSE  TRUE
> v < w
[1]  TRUE FALSE FALSE
> v <= w
[1]  TRUE  TRUE FALSE
> v > w
[1] FALSE FALSE  TRUE
> v >= w
[1] FALSE  TRUE  TRUE


> v <- c(3, pi, 4)
> v == pi                 # Compare a 3-element vector against one number
[1] FALSE  TRUE FALSE
> v != pi
[1]  TRUE FALSE  TRUE
. . .

> v <- c(3, pi, 4)
> any(v == pi)         # Return TRUE if any element of v equals pi
[1] TRUE
> all(v == 0)          # Return TRUE if all elements of v are zero
[1] FALSE

====== Selecting Vector Elements ======


> fib <- c(0,1,1,2,3,5,8,13,21,34)
> fib
 [1]  0  1  1  2  3  5  8 13 21 34
> fib[1]
[1] 0
> fib[2]
[1] 1
> fib[3]
[1] 1
> fib[4]
[1] 2
> fib[5]
[1] 3

> fib[1:3]        # Select elements 1 through 3
[1] 0 1 1
> fib[4:9]        # Select elements 4 through 9
[1]  2  3  5  8 13 21

> fib[c(1,2,4,8)]
[1]  0  1  2 13

> fib[-1]          # Ignore first element
[1]  1  1  2  3  5  8 13 21 34

> fib[1:3]         # As before
[1] 0 1 1
> fib[-(1:3)]      # Invert sign of index to exclude instead of select
[1]  2  3  5  8 13 21 34

> fib < 10               # This vector is TRUE wherever fib is less than 10
 [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
> fib[fib < 10]          # Use that vector to select elements less than 10
[1] 0 1 1 2 3 5 8

> fib %% 2 == 0          # This vector is TRUE wherever fib is even
 [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE
> fib[fib %% 2 == 0]     # Use that vector to select the even elements
[1]  0  2  8 34


Select all elements greater than the median
v[ v > median(v) ]
Select all elements in the lower and upper 5%
v[ (v < quantile(v,0.05)) | (v > quantile(v,0.95)) ]
Select all elements that exceed ±2 standard deviations from the mean
v[ abs(v-mean(v)) > 2*sd(v) ]
Select all elements that are neither NA nor NULL
v[ !is.na(v) & !is.null(v) ]

> years <- c(1960, 1964, 1976, 1994)
> names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton")
> years
Kennedy Johnson  Carter Clinton 
   1960    1964    1976    1994

> years["Carter"]
Carter 
  1976 
> years["Clinton"]
Clinton 
   1994

> years[c("Carter","Clinton")]
 Carter Clinton 
   1976    1994

====== Performing Vector Arithmetic ======

See [[:social network analysis]].

> v <- c(11,12,13,14,15)
> w <- c(1,2,3,4,5)
> v + w
[1] 12 14 16 18 20
> v - w
[1] 10 10 10 10 10
> v * w
[1] 11 24 39 56 75
> v / w
[1] 11.000000  6.000000  4.333333  3.500000  3.000000
> w ^ v
[1]           1        4096     1594323   268435456 30517578125

> w
[1] 1 2 3 4 5
> mean(w)
[1] 3
> w - mean(w)
[1] -2 -1  0  1  2

> w
[1] 1 2 3 4 5
> sd(w)
[1] 1.581139
> (w - mean(w)) / sd(w)
[1] -1.2649111 -0.6324555  0.0000000  0.6324555  1.2649111

Get the variance of v without using the var() function.

> w
[1] 1 2 3 4 5
> sqrt(w)
[1] 1.000000 1.414214 1.732051 2.000000 2.236068
> log(w)
[1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379
> sin(w)
[1]  0.8414710  0.9092974  0.1411200 -0.7568025 -0.9589243

^ Operator ^ Meaning ^ See also ^
| [ [[ | Indexing | Recipe 2.9 |
| :: ::: | Access variables in a name space | |
| $ @ | Component extraction, slot extraction | |
| ''^'' | Exponentiation (right to left) | |
| - + | Unary minus and plus | |
| : | Sequence creation | Recipe 2.7, Recipe 7.14 |
| %any% | Special operators | Discussion |
| * / | Multiplication, division | Discussion |
| + - | Addition, subtraction | |
| == != < > <= >= | Comparison | Recipe 2.8 |
| ! | Logical negation | |
| & && | Logical "and", short-circuit "and" | |
| ''|'' ''||'' | Logical "or", short-circuit "or" | |
| ~ | Formula | Recipe 11.1 |
| -> ->> | Rightward assignment | Recipe 2.2 |
| = | Assignment (right to left) | Recipe 2.2 |
| <- <<- | Assignment (right to left) | Recipe 2.2 |
| ? | Help | Recipe 1.7 |

%%
Modulo operator
%/%
Integer division
%*%
Matrix multiplication
%in%
Returns TRUE if the left operand occurs in its right operand; FALSE otherwise

# Build an 8 x 10 membership matrix (rows a-h, columns are the ten class
# names) and derive the two co-membership matrices from it.

# Start from an all-zero 8 x 10 matrix.
classtaken <- matrix(0, 8, 10)

# Each row of edge.list is one (row, column) pair to switch on; 36 pairs.
edge.list <- matrix(
  c(1, 1,  1, 2,  1, 3,  1, 4,  1, 9,
    2, 2,  2, 5,  2, 7,  2, 8,
    3, 1,  3, 5,  3, 6,  3, 7,  3, 8,
    4, 2,  4, 6,  4, 9,  4, 10,
    5, 1,  5, 2,  5, 5,  5, 7,  5, 8,
    6, 2,  6, 3,  6, 4,  6, 7,
    7, 3,  7, 4,  7, 7,  7, 8,
    8, 1,  8, 2,  8, 6,  8, 9,  8, 10),
  byrow = TRUE, nrow = 36, ncol = 2
)

# Indexing with a two-column matrix sets exactly the listed (row, column) cells.
classtaken[edge.list] <- 1
rownames(classtaken) <- c("a", "b", "c", "d", "e", "f", "g", "h")
colnames(classtaken) <- c("writer", "comtheo", "pr", "adv",
                          "broadc", "internet", "camshoot", "edit",
                          "newmedia", "cmc")
classtaken

# Note: a variable named c does not break calls such as c(1, 2), because R
# looks up functions separately, but the name is easy to misread.
c <- classtaken
tc <- t(classtaken)

# stu[i, j]   = number of columns in which rows i and j both have a 1 (8 x 8).
# class[i, j] = number of rows in which columns i and j both have a 1 (10 x 10).
stu <- c %*% tc
class <- tc %*% c

stu
class

Fill values less than 3 (i.e. 2 or lower) with zeros in the stu matrix:

stu[stu < 3] <- 0

====== Defining a Function ======

function(param1, ..., paramN) {
    expr1
    .
    .
    .
    exprM
}

> cv <- function(x) sd(x)/mean(x)
> cv(1:10)
[1] 0.5504819

> cv <- function(x) sd(x)/mean(x)
> lapply(lst, cv)

> gcd <- function(a,b) {
+     if (b == 0) return(a)
+     else return(gcd(b, a %% b))
+ }

====== Using a script (file) ======

{{tag> statistics r "r basics"}}