> v <- c(40,2,83,28,58)
> f <- factor(c("A","C","C","B","C"))
====== Splitting a Vector into Groups ======
> library(MASS)
Warning message:
패키지 ‘MASS’는 R 버전 3.2.5에서 작성되었습니다
> split(Cars93$MPG.city, Cars93$Origin) # split MPG.city by Origin
$USA
[1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28
$`non-USA`
[1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20
Comparison between (among) groups
> g <- split(Cars93$MPG.city, Cars93$Origin)
> g
$USA
[1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28
$`non-USA`
[1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20
> mean(g$USA)
[1] 20.95833
> mean(g$`non-USA`)
[1] 23.86667
>
# or
> sapply(g, mean)
USA non-USA
20.95833 23.86667
# or retain list format
> lapply(g, mean)
$USA
[1] 20.95833
$`non-USA`
[1] 23.86667
====== Applying a Function to Each List Element ======
S1 <- c(89, 85, 85, 86, 88, 89, 86, 82, 96, 85, 93, 91,
98, 87, 94, 77, 87, 98, 85, 89, 95, 85, 93, 93,
97, 71, 97, 93, 75, 68, 98, 95, 79, 94, 98, 95)
S2 <- c(60, 98, 94, 95, 99, 97, 100, 73, 93, 91, 98,
86, 66, 83, 77, 97, 91, 93, 71, 91, 95, 100,
72, 96, 91, 76, 100, 97, 99, 95, 97, 77, 94,
99, 88, 100, 94, 93, 86)
S3 <- c(95, 86, 90, 90, 75, 83, 96, 85, 83, 84, 81, 98,
77, 94, 84, 89, 93, 99, 91, 77, 95, 90, 91, 87,
85, 76, 99, 99, 97, 97, 97, 77, 93, 96, 90, 87,
97, 88)
S4 <- c(67, 93, 63, 83, 87, 97, 96, 92, 93, 96, 87, 90,
94, 90, 82, 91, 85, 93, 83, 90, 87, 99, 94, 88,
90, 72, 81, 93, 93, 94, 97, 89, 96, 95, 82, 97)
scores <- list(S1=S1,S2=S2,S3=S3,S4=S4)
scores
$S1
[1] 89 85 85 86 88 89 86 82 96 85 93 91 98 87 94 77 87 98 85 89
[21] 95 85 93 93 97 71 97 93 75 68 98 95 79 94 98 95
$S2
[1] 60 98 94 95 99 97 100 73 93 91 98 86 66 83 77
[16] 97 91 93 71 91 95 100 72 96 91 76 100 97 99 95
[31] 97 77 94 99 88 100 94 93 86
$S3
[1] 95 86 90 90 75 83 96 85 83 84 81 98 77 94 84 89 93 99 91 77
[21] 95 90 91 87 85 76 99 99 97 97 97 77 93 96 90 87 97 88
$S4
[1] 67 93 63 83 87 97 96 92 93 96 87 90 94 90 82 91 85 93 83 90
[21] 87 99 94 88 90 72 81 93 93 94 97 89 96 95 82 97
**lapply(list_name, function)**
lapply(scores, length)
$S1
[1] 36
$S2
[1] 39
$S3
[1] 38
$S4
[1] 36
**sapply(list_name, function)**
> sapply(scores, length)
S1 S2 S3 S4
36 39 38 36
> sapply(scores, mean)
S1 S2 S3 S4
88.77778 89.79487 89.23684 88.86111
> sapply(scores, sd)
S1 S2 S3 S4
7.720515 10.543592 7.178926 8.208542
If the called function returns a vector, sapply will form the results into **a matrix**. The range function, for example, returns a two-element vector:
> sapply(scores, range)
S1 S2 S3 S4
[1,] 68 60 75 63
[2,] 98 100 99 99
If the called function returns a structured object, such as a list, then you will need to use lapply rather than sapply. Structured objects cannot be put into a vector. Suppose we want to perform a t test on every semester. The t.test function returns a list, so we must use lapply:
> tests <- lapply(scores, t.test)
====== Applying a Function to Every Row ======
> longdata<- c(-1.850152, -1.406571, -1.0104817, -3.7170704,
-0.2804896, 0.9496313, 1.346517, -0.1580926, 1.6272786,
2.4483321, -0.5407272, -1.708678, -0.3480616, -0.2757667,
-1.2177024)
> long <- matrix(longdata, 3,5, byrow=TRUE)
> colnames(long) <- c("trial1","trial2","trial3","trial4","trial5")
> rownames(long) <- c("Moe", "Larry", "Curly")
> long
trial1 trial2 trial3 trial4 trial5
Moe -1.8501520 -1.406571 -1.0104817 -3.7170704 -0.2804896
Larry 0.9496313 1.346517 -0.1580926 1.6272786 2.4483321
Curly -0.5407272 -1.708678 -0.3480616 -0.2757667 -1.2177024
apply(long, 1, mean)
Moe Larry Curly
-1.6529530 1.2427334 -0.8181872
apply(long, 1, range)
Moe Larry Curly
[1,] -3.7170704 -0.1580926 -1.7086779
[2,] -0.2804896 2.4483321 -0.2757667
====== Applying a Function to Every Column ======
apply(matrix, 2, function)
The second argument (MARGIN) selects the direction in which the function is applied:
1 -> row by row
2 -> column by column
====== Applying a Function to Groups of Data ======
> tapply(vector, factor, function)
city county state pop
Chicago Cook IL 2853114
Kenosha Kenosha WI 90352
Aurora Kane IL 171782
Elgin Kane IL 94487
Gary Lake(IN) IN 102746
Joliet Kendall IL 106221
Naperville DuPage IL 147779
Arlington Heights Cook IL 76031
Bolingbrook Will IL 70834
Cicero Cook IL 72616
Evanston Cook IL 74239
Hammond Lake(IN) IN 83048
Palatine Cook IL 67232
Schaumburg Cook IL 75386
Skokie Cook IL 63348
Waukegan Lake(IL) IL 91452
suburbs <- read.csv("suburbs.csv", head=T, sep=" ")
suburbs <- read.csv("http://commres.net/wiki/_export/code/r/data_transformations?codeblock=15", head=T, sep=" ")
> attach(suburbs)
> pop
[1] 2853114 90352 171782 94487 102746 106221 147779 76031 70834
[10] 72616 74239 83048 67232 75386 63348 91452
We can easily compute sums and averages for all the cities:
> sum(pop)
[1] 4240667
> mean(pop)
[1] 265041.7
The county column is a factor with 8 levels:
> county
[1] Cook Kenosha Kane Kane Lake(IN) Kendall
[7] DuPage Cook Will Cook Cook Lake(IN)
[13] Cook Cook Cook Lake(IL)
8 Levels: Cook DuPage Kane Kendall Kenosha Lake(IL) ... Will
> tapply(pop, county, sum)
Cook DuPage Kane Kendall Kenosha Lake(IL) Lake(IN) Will
3281966 147779 266269 106221 90352 91452 185794 70834
> tapply(pop,county,mean)
Cook DuPage Kane Kendall Kenosha Lake(IL) Lake(IN) Will
468852.3 147779.0 133134.5 106221.0 90352.0 91452.0 92897.0 70834.0
The function given to tapply should expect a single argument: a vector containing all the members of one group. A good example is the length function, which takes a vector parameter and returns the vector’s length. Use it to count the number of data items in each group; in this case, the number of cities in each county:
> tapply(pop,county,length)
Cook DuPage Kane Kendall Kenosha Lake(IL) Lake(IN) Will
7 1 2 1 1 1 2 1
====== Applying a Function to Groups of Rows ======
> by(dfrm, fact, fun)
dfrm = the data frame,
fact = grouping factor,
fun = function. The function should expect one argument, a data frame.
library("MASS")
sel <- Cars93[c("Origin", "Manufacturer", "MPG.city", "MPG.highway", "EngineSize")]
> by(sel, sel$Orig, summary)
sel$Orig: USA
Origin Manufacturer MPG.city MPG.highway EngineSize
USA :48 Chevrolet : 8 Min. :15.00 Min. :20.00 Min. :1.300
non-USA: 0 Ford : 8 1st Qu.:18.00 1st Qu.:26.00 1st Qu.:2.200
Dodge : 6 Median :20.00 Median :28.00 Median :3.000
Pontiac : 5 Mean :20.96 Mean :28.15 Mean :3.067
Buick : 4 3rd Qu.:23.00 3rd Qu.:30.00 3rd Qu.:3.800
Oldsmobile: 4 Max. :31.00 Max. :41.00 Max. :5.700
(Other) :13
------------------------------------------------------------------
sel$Orig: non-USA
Origin Manufacturer MPG.city MPG.highway EngineSize
USA : 0 Mazda : 5 Min. :17.00 Min. :21.00 Min. :1.000
non-USA:45 Hyundai : 4 1st Qu.:19.00 1st Qu.:25.00 1st Qu.:1.600
Nissan : 4 Median :22.00 Median :30.00 Median :2.200
Toyota : 4 Mean :23.87 Mean :30.09 Mean :2.242
Volkswagen: 4 3rd Qu.:26.00 3rd Qu.:33.00 3rd Qu.:2.800
Honda : 3 Max. :46.00 Max. :50.00 Max. :4.500
(Other) :21
tapply(suburbs$pop, suburbs$state, summary)
by(suburbs$pop, suburbs$state, summary)