# COMMunicationRESearch.NET

### Site Tools

b:r_cookbook:data_transformations
> v <- c(40,2,83,28,58)
> f <- factor(c("A","C","C","B","C"))

# Splitting a Vector into Groups

> library(MASS)
Warning message:
패키지 ‘MASS’는 R 버전 3.2.5에서 작성되었습니다
> split(Cars93$MPG.city, Cars93$Origin) # Origin별로 MPG.city를 나눠라
$USA
[1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28

$`non-USA`
[1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20

Comparison between (among) groups

> g <- split(Cars93$MPG.city, Cars93$Origin)
> g
$USA
[1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28

$`non-USA`
[1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20

> mean(g$USA)
[1] 20.95833
> mean(g$`non-USA`)
[1] 23.86667
>
# or
> sapply(g, mean)
USA  non-USA
20.95833 23.86667
# or retain list format
> lapply(g, mean)
$USA
[1] 20.95833

$`non-USA`
[1] 23.86667



# Applying a Function to Each List Element

S1 <- c(89, 85, 85, 86, 88, 89, 86, 82, 96, 85, 93, 91,
98, 87, 94, 77, 87, 98, 85, 89, 95, 85, 93, 93,
97, 71, 97, 93, 75, 68, 98, 95, 79, 94, 98, 95)
S2 <- c(60, 98, 94, 95, 99, 97, 100, 73, 93, 91, 98,
86, 66, 83, 77, 97, 91, 93, 71, 91, 95, 100,
72, 96, 91, 76, 100, 97, 99, 95, 97, 77, 94,
99, 88, 100, 94, 93, 86)
S3 <- c(95, 86, 90, 90, 75, 83, 96, 85, 83, 84, 81, 98,
77, 94, 84, 89, 93, 99, 91, 77, 95, 90, 91, 87,
85, 76, 99, 99, 97, 97, 97, 77, 93, 96, 90, 87,
97, 88)
S4 <- c(67, 93, 63, 83, 87, 97, 96, 92, 93, 96, 87, 90,
94, 90, 82, 91, 85, 93, 83, 90, 87, 99, 94, 88,
90, 72, 81, 93, 93, 94, 97, 89, 96, 95, 82, 97)

scores <- list(S1=S1,S2=S2,S3=S3,S4=S4)
scores
$S1
[1] 89 85 85 86 88 89 86 82 96 85 93 91 98 87 94 77 87 98 85 89
[21] 95 85 93 93 97 71 97 93 75 68 98 95 79 94 98 95

$S2
[1]  60  98  94  95  99  97 100  73  93  91  98  86  66  83  77
[16]  97  91  93  71  91  95 100  72  96  91  76 100  97  99  95
[31]  97  77  94  99  88 100  94  93  86

$S3
[1] 95 86 90 90 75 83 96 85 83 84 81 98 77 94 84 89 93 99 91 77
[21] 95 90 91 87 85 76 99 99 97 97 97 77 93 96 90 87 97 88

$S4
[1] 67 93 63 83 87 97 96 92 93 96 87 90 94 90 82 91 85 93 83 90
[21] 87 99 94 88 90 72 81 93 93 94 97 89 96 95 82 97

lapply(list_name, function)

lapply(scores, length)
$S1
[1] 36

$S2
[1] 39

$S3
[1] 38

$S4
[1] 36

sapply(list_name, function)

> sapply(scores, length)
S1 S2 S3 S4
36 39 38 36
> sapply(scores, mean)
S1       S2       S3       S4
88.77778 89.79487 89.23684 88.86111
> sapply(scores, sd)
S1        S2        S3        S4
7.720515 10.543592  7.178926  8.208542

If the called function returns a vector, sapply will form the results into a matrix. The range function, for example, returns a two-element vector:

> sapply(scores, range)
     S1  S2 S3 S4
[1,] 68  60 75 63
[2,] 98 100 99 99

If the called function returns a structured object, such as a list, then you will need to use lapply rather than sapply. Structured objects cannot be put into a vector. Suppose we want to perform a t test on every semester. The t.test function returns a list, so we must use lapply:

> tests <- lapply(scores, t.test)

# Applying a Function to Every Row

> longdata <- c(-1.850152, -1.406571, -1.0104817, -3.7170704,
-0.2804896, 0.9496313, 1.346517, -0.1580926, 1.6272786,
2.4483321, -0.5407272, -1.708678, -0.3480616, -0.2757667,
-1.2177024)

> long <- matrix(longdata, 3, 5, byrow = TRUE)
> colnames(long) <- c("trial1","trial2","trial3","trial4","trial5")
> rownames(long) <- c("Moe", "Larry", "Curly")

> long
          trial1    trial2     trial3     trial4     trial5
Moe   -1.8501520 -1.406571 -1.0104817 -3.7170704 -0.2804896
Larry  0.9496313  1.346517 -0.1580926  1.6272786  2.4483321
Curly -0.5407272 -1.708678 -0.3480616 -0.2757667 -1.2177024
> apply(long, 1, mean)
       Moe      Larry      Curly
-1.6529530  1.2427334 -0.8181872
> apply(long, 1, range)
            Moe      Larry      Curly
[1,] -3.7170704 -0.1580926 -1.7086780
[2,] -0.2804896  2.4483321 -0.2757667

# Applying a Function to Every Column

apply(matrix, 2, function)

1 → row by row
2 → column by column

# Applying a Function to Groups of Data

> tapply(vector, factor, function)
suburbs.csv
city	county	state	pop
Chicago	Cook	IL	2853114
Kenosha	Kenosha	WI	90352
Aurora	Kane	IL	171782
Elgin	Kane	IL	94487
Gary	Lake(IN)	IN	102746
Joliet	Kendall	IL	106221
Naperville	DuPage	IL	147779
Arlington Heights	Cook	IL	76031
Bolingbrook	Will	IL	70834
Cicero	Cook	IL	72616
Evanston	Cook	IL	74239
Hammond	Lake(IN)	IN	83048
Palatine	Cook	IL	67232
Schaumburg	Cook	IL	75386
Skokie	Cook	IL	63348
Waukegan	Lake(IL)	IL	91452
suburbs <- read.csv("suburbs.csv", head=T, sep="	")
suburbs <- read.csv("http://commres.net/wiki/_export/code/r/data_transformations?codeblock=15", head=T, sep="	")
> attach(suburbs)
> pop
[1] 2853114   90352  171782   94487  102746  106221  147779   76031   70834
[10]   72616   74239   83048   67232   75386   63348   91452
We can easily compute sums and averages for all the cities:
> sum(pop)
[1] 4240667
> mean(pop)
[1] 265041.7

The county column is a factor with 8 levels (one per distinct county):

> county
[1] Cook     Kenosha  Kane     Kane     Lake(IN) Kendall
[7] DuPage   Cook     Will     Cook     Cook     Lake(IN)
[13] Cook     Cook     Cook     Lake(IL)
8 Levels: Cook DuPage Kane Kendall Kenosha Lake(IL) ... Will
> tapply(pop, county, sum)
Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will
3281966   147779   266269   106221    90352    91452   185794    70834

> tapply(pop,county,mean)
Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will
468852.3 147779.0 133134.5 106221.0  90352.0  91452.0  92897.0  70834.0 

The function given to tapply should expect a single argument: a vector containing all the members of one group. A good example is the length function, which takes a vector parameter and returns the vector’s length. Use it to count the number of observations in each group; in this case, the number of cities in each county:

> tapply(pop,county,length)
Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will
7        1        2        1        1        1        2        1

# Applying a Function to Groups of Rows

> by(dfrm, fact, fun)

dfrm = the data frame,
fact = grouping factor,
fun = function. The function should expect one argument, a data frame.

library("MASS")
sel <- Cars93[c("Origin", "Manufacturer", "MPG.city", "MPG.highway", "EngineSize")]

> by(sel, sel$Orig, summary)
sel$Orig: USA
     Origin       Manufacturer    MPG.city      MPG.highway      EngineSize
 USA    :48   Chevrolet : 8    Min.   :15.00   Min.   :20.00   Min.   :1.300
 non-USA: 0   Ford      : 8    1st Qu.:18.00   1st Qu.:26.00   1st Qu.:2.200
              Dodge     : 6    Median :20.00   Median :28.00   Median :3.000
              Pontiac   : 5    Mean   :20.96   Mean   :28.15   Mean   :3.067
              Buick     : 4    3rd Qu.:23.00   3rd Qu.:30.00   3rd Qu.:3.800
              Oldsmobile: 4    Max.   :31.00   Max.   :41.00   Max.   :5.700
              (Other)   :13
------------------------------------------------------------------
sel$Orig: non-USA
     Origin       Manufacturer    MPG.city      MPG.highway      EngineSize
 USA    : 0   Mazda     : 5    Min.   :17.00   Min.   :21.00   Min.   :1.000
 non-USA:45   Hyundai   : 4    1st Qu.:19.00   1st Qu.:25.00   1st Qu.:1.600
              Nissan    : 4    Median :22.00   Median :30.00   Median :2.200
              Toyota    : 4    Mean   :23.87   Mean   :30.09   Mean   :2.242
              Volkswagen: 4    3rd Qu.:26.00   3rd Qu.:33.00   3rd Qu.:2.800
              Honda     : 3    Max.   :46.00   Max.   :50.00   Max.   :4.500
              (Other)   :21

tapply(suburbs$pop, suburbs$state, summary)
by(suburbs$pop, suburbs$state, summary)