> v <- c(40,2,83,28,58)
> f <- factor(c("A","C","C","B","C"))
> library(MASS)
Warning message:
패키지 ‘MASS’는 R 버전 3.2.5에서 작성되었습니다 
> split(Cars93$MPG.city, Cars93$Origin) # split MPG.city by Origin
$USA
 [1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28
$`non-USA`
 [1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20
Comparison between (among) groups
> g <- split(Cars93$MPG.city, Cars93$Origin)
> g
$USA
 [1] 22 19 16 19 16 16 25 25 19 21 18 15
[13] 17 17 20 23 20 29 23 22 17 21 18 29
[25] 20 31 23 22 22 24 15 21 18 17 18 23
[37] 19 24 23 18 19 23 31 23 19 19 19 28
$`non-USA`
 [1] 25 18 20 19 22 46 30 24 42 24 29 22
[13] 26 20 17 18 18 29 28 26 18 17 20 19
[25] 29 18 29 24 17 21 20 33 25 23 39 32
[37] 25 22 18 25 17 21 18 21 20
> mean(g$USA)
[1] 20.95833
> mean(g$`non-USA`)
[1] 23.86667
> 
# or
> sapply(g, mean)
     USA  non-USA 
20.95833 23.86667 
# or retain list format
> lapply(g, mean)
$USA
[1] 20.95833
$`non-USA`
[1] 23.86667
# Scores for four semesters (S1 to S4). The groups differ in size
# (36, 39, 38 and 36 observations respectively).
S1 <- c(
  89, 85, 85, 86, 88, 89, 86, 82, 96, 85, 93, 91,
  98, 87, 94, 77, 87, 98, 85, 89, 95, 85, 93, 93,
  97, 71, 97, 93, 75, 68, 98, 95, 79, 94, 98, 95
)
S2 <- c(
  60, 98, 94, 95, 99, 97, 100, 73, 93, 91, 98,
  86, 66, 83, 77, 97, 91, 93, 71, 91, 95, 100,
  72, 96, 91, 76, 100, 97, 99, 95, 97, 77, 94,
  99, 88, 100, 94, 93, 86
)
S3 <- c(
  95, 86, 90, 90, 75, 83, 96, 85, 83, 84, 81, 98,
  77, 94, 84, 89, 93, 99, 91, 77, 95, 90, 91, 87,
  85, 76, 99, 99, 97, 97, 97, 77, 93, 96, 90, 87,
  97, 88
)
S4 <- c(
  67, 93, 63, 83, 87, 97, 96, 92, 93, 96, 87, 90,
  94, 90, 82, 91, 85, 93, 83, 90, 87, 99, 94, 88,
  90, 72, 81, 93, 93, 94, 97, 89, 96, 95, 82, 97
)

# Bundle the vectors into a named list so that one lapply()/sapply() call
# can process every semester at once.
scores <- list(S1 = S1, S2 = S2, S3 = S3, S4 = S4)
> scores
$S1
 [1] 89 85 85 86 88 89 86 82 96 85 93 91 98 87 94 77 87 98 85 89
[21] 95 85 93 93 97 71 97 93 75 68 98 95 79 94 98 95
$S2
 [1]  60  98  94  95  99  97 100  73  93  91  98  86  66  83  77
[16]  97  91  93  71  91  95 100  72  96  91  76 100  97  99  95
[31]  97  77  94  99  88 100  94  93  86
$S3
 [1] 95 86 90 90 75 83 96 85 83 84 81 98 77 94 84 89 93 99 91 77
[21] 95 90 91 87 85 76 99 99 97 97 97 77 93 96 90 87 97 88
$S4
 [1] 67 93 63 83 87 97 96 92 93 96 87 90 94 90 82 91 85 93 83 90
[21] 87 99 94 88 90 72 81 93 93 94 97 89 96 95 82 97
lapply(list_name, function)
> lapply(scores, length)
$S1
[1] 36
$S2
[1] 39
$S3
[1] 38
$S4
[1] 36
sapply(list_name, function)
> sapply(scores, length)
S1 S2 S3 S4 
36 39 38 36 
> sapply(scores, mean)
      S1       S2       S3       S4 
88.77778 89.79487 89.23684 88.86111 
> sapply(scores, sd)
       S1        S2        S3        S4 
 7.720515 10.543592  7.178926  8.208542
If the called function returns a vector, sapply will form the results into a matrix. The range function, for example, returns a two-element vector:
> sapply(scores, range)
     S1  S2 S3 S4
[1,] 68  60 75 63
[2,] 98 100 99 99
If the called function returns a structured object, such as a list, then you will need to use lapply rather than sapply. Structured objects cannot be put into a vector. Suppose we want to perform a t test on every semester. The t.test function returns a list, so we must use lapply:
> tests <- lapply(scores, t.test)
# Build a 3 x 5 matrix of trial results with one row per person.
# The values are laid out one person per line below, so the matrix has to be
# filled row-wise (byrow = TRUE); matrix() fills column-wise by default.
# With this layout the row summaries from apply(long, 1, mean) and
# apply(long, 1, range) are:
#   means : Moe -1.6529530, Larry 1.2427334, Curly -0.8181872
#   ranges: Moe [-3.7170704, -0.2804896], Larry [-0.1580926, 2.4483321],
#           Curly [-1.708678, -0.2757667]
longdata <- c(
  -1.850152, -1.406571, -1.0104817, -3.7170704, -0.2804896,   # Moe
  0.9496313, 1.346517, -0.1580926, 1.6272786, 2.4483321,      # Larry
  -0.5407272, -1.708678, -0.3480616, -0.2757667, -1.2177024   # Curly
)
long <- matrix(longdata, nrow = 3, ncol = 5, byrow = TRUE)
colnames(long) <- c("trial1", "trial2", "trial3", "trial4", "trial5")
rownames(long) <- c("Moe", "Larry", "Curly")
long
#           trial1    trial2     trial3     trial4     trial5
# Moe   -1.8501520 -1.406571 -1.0104817 -3.7170704 -0.2804896
# Larry  0.9496313  1.346517 -0.1580926  1.6272786  2.4483321
# Curly -0.5407272 -1.708678 -0.3480616 -0.2757667 -1.2177024
apply(long, 1, mean)
       Moe      Larry      Curly 
-1.6529530  1.2427334 -0.8181872
apply(long, 1, range)
            Moe      Larry      Curly
[1,] -3.7170704 -0.1580926 -1.7086779
[2,] -0.2804896  2.4483321 -0.2757667
apply(matrix, margin, function)
margin = 1 → row by row
margin = 2 → column by column
> tapply(vector, factor, function)
city	county	state	pop
Chicago	Cook	IL	2853114
Kenosha	Kenosha	WI	90352
Aurora	Kane	IL	171782
Elgin	Kane	IL	94487
Gary	Lake(IN)	IN	102746
Joliet	Kendall	IL	106221
Naperville	DuPage	IL	147779
Arlington Heights	Cook	IL	76031
Bolingbrook	Will	IL	70834
Cicero	Cook	IL	72616
Evanston	Cook	IL	74239
Hammond	Lake(IN)	IN	83048
Palatine	Cook	IL	67232
Schaumburg	Cook	IL	75386
Skokie	Cook	IL	63348
Waukegan	Lake(IL)	IL	91452
# Read the tab-separated suburbs table (columns: city, county, state, pop).
# `header` is spelled out in full rather than relying on partial argument
# matching, TRUE is used instead of the reassignable shorthand T, and the
# tab separator is written as the visible escape "\t".
suburbs <- read.csv("suburbs.csv", header = TRUE, sep = "\t")
# Alternative source: the same read from a URL. Running this line replaces
# the data frame loaded from the local file above.
suburbs <- read.csv("http://commres.net/wiki/_export/code/r/data_transformations?codeblock=15", header = TRUE, sep = "\t")
> attach(suburbs)
> pop
 [1] 2853114   90352  171782   94487  102746  106221  147779   76031   70834
[10]   72616   74239   83048   67232   75386   63348   91452
We can easily compute sums and averages for all the cities:
> sum(pop)
[1] 4240667
> mean(pop)
[1] 265041.7
The county column is a factor with 8 levels (8 distinct counties):
> county
 [1] Cook     Kenosha  Kane     Kane     Lake(IN) Kendall 
 [7] DuPage   Cook     Will     Cook     Cook     Lake(IN)
[13] Cook     Cook     Cook     Lake(IL)
8 Levels: Cook DuPage Kane Kendall Kenosha Lake(IL) ... Will
> tapply(pop, county, sum)
    Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will 
 3281966   147779   266269   106221    90352    91452   185794    70834 
> tapply(pop,county,mean)
    Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will 
468852.3 147779.0 133134.5 106221.0  90352.0  91452.0  92897.0  70834.0 
The function given to tapply should expect a single argument: a vector containing all the members of one group. A good example is the length function, which takes a vector parameter and returns the vector’s length. Use it to count the number of data in each group; in this case, the number of cities in each county:
> tapply(pop,county,length)
    Cook   DuPage     Kane  Kendall  Kenosha Lake(IL) Lake(IN)     Will 
       7        1        2        1        1        1        2        1
> by(dfrm, fact, fun)
dfrm = the data frame, 
fact = grouping factor, 
fun = function. The function should expect one argument, a data frame.
# The Cars93 data frame comes from the MASS package.
library(MASS)
# Keep the grouping column (Origin) and four further columns to summarise
# per group.
sel <- Cars93[
  ,
  c("Origin", "Manufacturer", "MPG.city", "MPG.highway", "EngineSize")
]
> by(sel, sel$Orig, summary)
sel$Orig: USA
     Origin       Manufacturer    MPG.city      MPG.highway      EngineSize   
 USA    :48   Chevrolet : 8    Min.   :15.00   Min.   :20.00   Min.   :1.300  
 non-USA: 0   Ford      : 8    1st Qu.:18.00   1st Qu.:26.00   1st Qu.:2.200  
              Dodge     : 6    Median :20.00   Median :28.00   Median :3.000  
              Pontiac   : 5    Mean   :20.96   Mean   :28.15   Mean   :3.067  
              Buick     : 4    3rd Qu.:23.00   3rd Qu.:30.00   3rd Qu.:3.800  
              Oldsmobile: 4    Max.   :31.00   Max.   :41.00   Max.   :5.700  
              (Other)   :13                                                   
------------------------------------------------------------------ 
sel$Orig: non-USA
     Origin       Manufacturer    MPG.city      MPG.highway      EngineSize   
 USA    : 0   Mazda     : 5    Min.   :17.00   Min.   :21.00   Min.   :1.000  
 non-USA:45   Hyundai   : 4    1st Qu.:19.00   1st Qu.:25.00   1st Qu.:1.600  
              Nissan    : 4    Median :22.00   Median :30.00   Median :2.200  
              Toyota    : 4    Mean   :23.87   Mean   :30.09   Mean   :2.242  
              Volkswagen: 4    3rd Qu.:26.00   3rd Qu.:33.00   3rd Qu.:2.800  
              Honda     : 3    Max.   :46.00   Max.   :50.00   Max.   :4.500  
              (Other)   :21                                                   
# Apply summary() to the pop values of each state group, using tapply().
tapply(suburbs$pop, suburbs$state, summary)
# The same per-state summary() call, this time through by().
by(suburbs$pop, suburbs$state, summary)