Null Hypotheses, Alternative Hypotheses, and p-Values

9.1. Summarizing Your Data

library(MASS)    # to include Cars93 data

> summary(Cars93$Manufacturer)
        Acura          Audi           BMW         Buick 
            2             2             1             4 
     Cadillac     Chevrolet      Chrylser      Chrysler 
            2             8             1             2 
        Dodge         Eagle          Ford           Geo 
            6             2             8             2 
        Honda       Hyundai      Infiniti         Lexus 
            3             4             1             2 
      Lincoln         Mazda Mercedes-Benz       Mercury 
            2             5             2             2 
   Mitsubishi        Nissan    Oldsmobile      Plymouth 
            2             4             4             1 
      Pontiac          Saab        Saturn        Subaru 
            5             1             1             3 
       Suzuki        Toyota    Volkswagen         Volvo 
            1             4             4             2 
> summary(Cars93$MPG.city)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15.00   18.00   21.00   22.37   25.00   46.00 
> summary(Cars93$MPG.highway)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  20.00   26.00   28.00   29.09   31.00   50.00 
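summary() reports the quartiles and the mean of a numeric vector; the individual statistics can also be computed one at a time. A minimal sketch, assuming the MASS package (for Cars93) is already loaded:

mean(Cars93$MPG.city)       # arithmetic mean
median(Cars93$MPG.city)     # 50th percentile
sd(Cars93$MPG.city)         # standard deviation (not shown by summary())
var(Cars93$MPG.city)        # variance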
suburbs.csv
city	county	state	pop
Chicago	Cook	IL	2853114
Kenosha	Kenosha	WI	90352
Aurora	Kane	IL	171782
Elgin	Kane	IL	94487
Gary	Lake(IN)	IN	102746
Joliet	Kendall	IL	106221
Naperville	DuPage	IL	147779
Arlington Heights	Cook	IL	76031
Bolingbrook	Will	IL	70834
Cicero	Cook	IL	72616
Evanston	Cook	IL	74239
Hammond	Lake(IN)	IN	83048
Palatine	Cook	IL	67232
Schaumburg	Cook	IL	75386
Skokie	Cook	IL	63348
Waukegan	Lake(IL)	IL	91452
suburbs <- read.csv("suburbs.csv", head=T, sep="	")
suburbs <- read.csv("http://commres.net/wiki/_export/code/r/general_statistics?codeblock=1", head=T, sep="\t")
> summary(suburbs)
       X                        city         county 
 Min.   : 1.00   Arlington Heights: 1   Cook    :7  
 1st Qu.: 4.75   Aurora           : 1   Kane    :2  
 Median : 8.50   Bolingbrook      : 1   Lake(IN):2  
 Mean   : 8.50   Chicago          : 1   DuPage  :1  
 3rd Qu.:12.25   Cicero           : 1   Kendall :1  
 Max.   :16.00   Elgin            : 1   Kenosha :1  
                 (Other)          :10   (Other) :2  
 state        pop         
 IL:13   Min.   :  63348  
 IN: 2   1st Qu.:  73833  
 WI: 1   Median :  86700  
         Mean   : 265042  
         3rd Qu.: 103615  
         Max.   :2853114  
                        

Calculating Relative Frequencies

> mean(Cars93$MPG.city > 14)  # summary(Cars93$MPG.city) above shows min = 15, so every value exceeds 14 and the proportion is 1 (100%)
[1] 1

x <- Cars93$MPG.city      # city mileage data

mean(abs(x-mean(x)) > 2*sd(x))  # fraction of observations more than two standard deviations from the mean city mileage
[1] 0.03225806
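The same trick, taking the mean of a logical vector, works for any condition. A minimal sketch with two more relative frequencies from the same data:

mean(Cars93$MPG.highway >= 30)    # fraction of cars with highway mileage of at least 30
mean(Cars93$Origin == "non-USA")  # fraction of cars built outside the USA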

Tabulating Factors and Creating Contingency Tables

> table(Cars93$Manufacturer, Cars93$Cylinders)
               
                3 4 5 6 8 rotary
  Acura         0 1 0 1 0      0
  Audi          0 0 0 2 0      0
  BMW           0 1 0 0 0      0
  Buick         0 1 0 3 0      0
  Cadillac      0 0 0 0 2      0
  Chevrolet     0 3 0 3 2      0
  Chrylser      0 0 0 1 0      0
  Chrysler      0 1 0 1 0      0
  Dodge         0 4 0 2 0      0
  Eagle         0 1 0 1 0      0
  Ford          0 5 0 2 1      0
  Geo           1 1 0 0 0      0
  Honda         0 3 0 0 0      0
  Hyundai       0 4 0 0 0      0
  Infiniti      0 0 0 0 1      0
  Lexus         0 0 0 2 0      0
  Lincoln       0 0 0 1 1      0
  Mazda         0 3 0 1 0      1
  Mercedes-Benz 0 1 0 1 0      0
  Mercury       0 1 0 1 0      0
  Mitsubishi    0 1 0 1 0      0
  Nissan        0 2 0 2 0      0
  Oldsmobile    0 2 0 2 0      0
  Plymouth      0 1 0 0 0      0
  Pontiac       0 2 0 3 0      0
  Saab          0 1 0 0 0      0
  Saturn        0 1 0 0 0      0
  Subaru        1 2 0 0 0      0
  Suzuki        1 0 0 0 0      0
  Toyota        0 4 0 0 0      0
  Volkswagen    0 2 1 1 0      0
  Volvo         0 1 1 0 0      0
> 
> attach(suburbs)
> table(city,state)
                   state
city                IL IN WI
  Arlington Heights  1  0  0
  Aurora             1  0  0
  Bolingbrook        1  0  0
  Chicago            1  0  0
  Cicero             1  0  0
  Elgin              1  0  0
  Evanston           1  0  0
  Gary               0  1  0
  Hammond            0  1  0
  Joliet             1  0  0
  Kenosha            0  0  1
  Naperville         1  0  0
  Palatine           1  0  0
  Schaumburg         1  0  0
  Skokie             1  0  0
  Waukegan           1  0  0
> 
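Counts from table() can be turned into proportions with prop.table(). A minimal sketch (the margin argument picks cell, row, or column proportions):

origin.type <- table(Cars93$Origin, Cars93$Type)
prop.table(origin.type)       # cell proportions; the whole table sums to 1
prop.table(origin.type, 1)    # row proportions; each row sums to 1
prop.table(origin.type, 2)    # column proportions; each column sums to 1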

Testing Categorical Variables for Independence

The survey data frame in the MASS package has a Smoke column recording each student's smoking habit and an Exer column recording their exercise level. The allowed values in Smoke are “Heavy”, “Regul” (regularly), “Occas” (occasionally), and “Never”. For Exer, they are “Freq” (frequently), “Some”, and “None”.

We can tally the students' smoking habits against their exercise levels with the table function in R. The result is called the contingency table of the two variables.

> library(MASS)       # load the MASS package 
> tbl = table(survey$Smoke, survey$Exer) 
> tbl                 # the contingency table 
       
        Freq None Some
  Heavy    7    1    3
  Never   87   18   84
  Occas   12    3    4
  Regul    9    1    7
 
> summary(tbl)
Number of cases in table: 236 
Number of factors: 2 
Test for independence of all factors:
	Chisq = 5.489, df = 6, p-value = 0.4828
	Chi-squared approximation may be incorrect

> chisq.test(tbl) 

	Pearson's Chi-squared test

data:  tbl
X-squared = 5.4885, df = 6, p-value = 0.4828
> library(MASS)
> cardata <- data.frame(Cars93$Origin, Cars93$Type)
> cardata
   Cars93.Origin Cars93.Type
1        non-USA       Small
2        non-USA     Midsize
3        non-USA     Compact
4        non-USA     Midsize
5        non-USA     Midsize
6            USA     Midsize
7            USA       Large
8            USA       Large
9            USA     Midsize
10           USA       Large
11           USA     Midsize
12           USA     Compact
13           USA     Compact
14           USA      Sporty
15           USA     Midsize
16           USA         Van
17           USA         Van
18           USA       Large
19           USA      Sporty
20           USA       Large
21           USA     Compact
22           USA       Large
23           USA       Small
24           USA       Small
25           USA     Compact
26           USA         Van
27           USA     Midsize
28           USA      Sporty
29           USA       Small
30           USA       Large
31           USA       Small
32           USA       Small
33           USA     Compact
34           USA      Sporty
35           USA      Sporty
36           USA         Van
37           USA     Midsize
38           USA       Large
39       non-USA       Small
40       non-USA      Sporty
41       non-USA      Sporty
42       non-USA       Small
43       non-USA     Compact
44       non-USA       Small
45       non-USA       Small
46       non-USA      Sporty
47       non-USA     Midsize
48       non-USA     Midsize
49       non-USA     Midsize
50       non-USA     Midsize
51           USA     Midsize
52           USA       Large
53       non-USA       Small
54       non-USA       Small
55       non-USA     Compact
56       non-USA         Van
57       non-USA      Sporty
58       non-USA     Compact
59       non-USA     Midsize
60           USA      Sporty
61           USA     Midsize
62       non-USA       Small
63       non-USA     Midsize
64       non-USA       Small
65       non-USA     Compact
66       non-USA         Van
67       non-USA     Midsize
68           USA     Compact
69           USA     Midsize
70           USA         Van
71           USA       Large
72           USA      Sporty
73           USA       Small
74           USA     Compact
75           USA      Sporty
76           USA     Midsize
77           USA       Large
78       non-USA     Compact
79           USA       Small
80       non-USA       Small
81       non-USA       Small
82       non-USA     Compact
83       non-USA       Small
84       non-USA       Small
85       non-USA      Sporty
86       non-USA     Midsize
87       non-USA         Van
88       non-USA       Small
89       non-USA         Van
90       non-USA     Compact
91       non-USA      Sporty
92       non-USA     Compact
93       non-USA     Midsize
> cartbl <- table(cardata)
> cartbl
             Cars93.Type
Cars93.Origin Compact Large Midsize Small Sporty Van
      USA           7    11      10     7      8   5
      non-USA       9     0      12    14      6   4
> summary(cartbl)
Number of cases in table: 93 
Number of factors: 2 
Test for independence of all factors:
	Chisq = 14.08, df = 5, p-value = 0.01511
	Chi-squared approximation may be incorrect

> chisq.test(cartbl)

	Pearson's Chi-squared test

data:  cartbl
X-squared = 14.08, df = 5, p-value = 0.01511

Warning message:
In chisq.test(cartbl) : Chi-squared approximation may be incorrect
> 
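The warning that the chi-squared approximation may be incorrect usually means some expected cell counts are small (below 5). A minimal sketch for inspecting the expected counts and, as one possible fallback, running Fisher's exact test instead:

chisq.test(cartbl)$expected   # expected counts under independence; look for cells below 5
fisher.test(cartbl)           # exact test that does not rely on the large-sample approximation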

Calculating Quantiles (and Quartiles) of a Dataset

Using the built-in faithful data set (Old Faithful eruption durations):

> duration = faithful$eruptions     # the eruption durations 
> quantile(duration)                # apply the quantile function 
    0%    25%    50%    75%   100% 
1.6000 2.1627 4.0000 4.4543 5.1000

> quantile(faithful$eruptions, c(.025,.975))
    2.5%    97.5% 
1.750000 4.907425 
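quantile() accepts any vector of probabilities, so other percentiles follow the same pattern. A minimal sketch:

quantile(duration, .10)                  # 10th percentile
quantile(duration, c(.05, .95))          # 5th and 95th percentiles
quantile(duration, seq(0, 1, by=0.2))    # quintile boundaries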

Inverting a Quantile

> dur <- faithful$eruptions
> dur > mean(dur)
  [1]  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE
 [15]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
 [29]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE
 [43]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
 [57]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
 [71]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
 [85]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
 [99] FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
[113]  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
[127] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
[141]  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
[155]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
[169] FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
[183]  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
[197]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
[211] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
[225]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE
[239]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
[267]  TRUE  TRUE FALSE  TRUE FALSE  TRUE 
> mean(dur > mean(dur))
[1] 0.6176471
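Any cutoff can be inverted the same way, by averaging a logical comparison. A minimal sketch:

mean(dur < 3)      # fraction of eruptions shorter than 3 minutes
mean(dur >= 4.5)   # fraction of eruptions lasting 4.5 minutes or more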

Converting Data to Z-Scores

> scale(dur)                     # built-in standardization; returns a one-column matrix
> (dur - mean(dur)) / sd(dur)    # the same z-scores computed by hand
  [1]  0.09831763 -1.47873278 -0.13561152 -1.05555759
  [5]  0.91575542 -0.52987412  1.06207065  0.09831763
  [9] -1.34731192  0.75542196 -1.44982019  0.37605373
 [13]  0.62400110 -1.52253974  1.06207065 -1.15718973
 [17] -1.52253974  1.14968457 -1.65396061  0.66780805
 [21] -1.47873278 -1.52253974 -0.03310324 -0.36866452
 [25]  0.91575542  0.09831763 -1.33241755  0.52149282
 [29]  0.31735241  0.82814151  0.71161501  0.85793024
 [33] -0.10582279  0.47768586  0.30245804 -1.28861060
 [37] -1.42003146  1.17859716 -1.44982019  1.13479020
 [41]  0.75542196 -1.40601324  0.94554415 -1.52253974
 [45]  0.91575542 -0.14962974  0.30245804 -1.21589105
 [49]  1.00336933 -1.30350496  1.14968457  1.07608888
 [53] -1.44982019  1.17859716 -1.53743411  1.22240411
 [57]  0.20082590 -1.59525929  0.94554415  0.72650937
 [61] -1.09936455  0.88684283 -1.52253974  1.14968457
 [65] -1.46383842  0.79922892  0.59508851  1.06207065
 [69] -1.24480364  1.06207065  0.47768586 -1.33241755
 [73]  0.88684283  0.44877327 -1.31839933  1.38361371
 [77] -1.28861060  0.94554415  0.34626500  0.09831763
 [81]  0.56529978  0.74052760  0.53638718 -0.74890890
 [85]  0.50747459  1.26621107  0.40496632  0.90173720
 [89] -1.15718973  0.44877327 -1.12827714  0.74052760
 [93] -1.42003146  1.16457893 -1.44982019  0.71161501
 [97]  1.03315806  0.22973849 -1.42003146  1.23729848
[101] -0.88032977  0.77031633 -1.21589105  0.88684283
[105]  0.49258023 -1.42003146  1.06207065 -1.49362715
[109]  1.19349152  0.17103717  1.09098325 -1.04066323
[113]  1.23729848  0.81412328 -1.56634670  1.00336933
[117] -1.02576886  0.97445674 -1.46383842  0.81412328
[121] -0.76292713  0.50747459  0.66780805 -1.33241755
[125]  0.97445674  0.24463286 -1.37622451  0.88684283
[129] -1.06957582  1.01826370 -1.42003146  0.59508851
[133] -0.60259367  0.74052760 -1.44982019  0.78433455
[137] -1.40601324  1.26621107 -1.27459237  0.21484413
[141]  0.65291369 -1.09936455  0.91575542  1.16457893
[145]  0.74052760 -1.31839933  1.00336933 -1.28861060
[149]  1.41252630 -1.47873278  1.35382498  0.44877327
[153] -0.95304931  0.97445674  0.06940504  0.44877327
[157]  0.88684283  0.52149282 -1.47873278  0.41986068
[161] -1.12827714  0.58019414 -1.30350496  0.30245804
[165]  0.01070371  0.95956238 -0.98196191  1.32491239
[169] -1.36220628  0.98935111 -1.37622451 -1.23078541
[173]  0.95956238 -0.13561152  0.59508851  0.74052760
[177]  0.88684283 -0.93815495  0.44877327  0.59508851
[181] -1.40601324  0.95956238  0.66780805  0.24463286
[185] -1.27459237  0.82814151  0.52149282 -1.44982019
[189]  0.81412328 -1.14317150  1.14968457 -1.44982019
[193]  1.14968457  0.53638718  0.41898454  0.65291369
[197]  0.01070371  0.76944019 -1.08447018  1.03315806
[201] -1.21589105  0.75542196  0.56529978 -1.42003146
[205]  0.97445674 -1.49362715  0.77031633  0.31735241
[209] -1.36220628  0.88684283 -0.96794368  1.06207065
[213] -1.42003146  0.30245804 -0.06201583  0.65291369
[217] -0.95304931  1.14968457 -1.30350496  0.58019414
[221] -1.42003146  0.68270242 -1.52253974  0.87194847
[225]  0.44877327  0.55128155  0.52149282  0.68270242
[229]  0.37605373  0.93064979  0.52149282 -0.93815495
[233]  0.60910673 -1.11338277  0.84303588 -1.40601324
[237] -1.43492583  0.69672064  0.40496632 -1.01175064
[241]  0.58019414 -0.99685627  1.26621107 -0.51497976
[245]  0.95956238  0.30245804 -1.23078541  0.77031633
[249] -1.18697846  0.75542196 -1.12827714  0.84303588
[253]  0.06940504  0.88684283  0.58019414  0.28843981
[257]  0.37605373  0.84303588 -1.30350496  0.69672064
[261]  1.12077198  0.91575542 -1.43492583  0.66780805
[265] -1.31839933 -1.08447018  1.10587761  0.55128155
[269] -1.17208409  0.81412328 -1.46383842  0.85793024
> 

> zdur <- (dur - mean(dur)) / sd(dur)
> mean(zdur)
[1] 8.972251e-17
> round(mean(zdur))
[1] 0
> round(sd(zdur))
[1] 1
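scale() and the manual formula give identical z-scores; scale() just returns them as a one-column matrix. A minimal sketch checking the agreement:

zdur2 <- as.numeric(scale(dur))   # drop the matrix structure returned by scale()
all.equal(zdur, zdur2)            # TRUE: both computations give the same z-scores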

Testing the Mean of a Sample (t Test)

> x <- rnorm(50, mean=100, sd=15)
> x
 [1] 131.29017  97.35285  68.55689 119.24865 114.97441
 [6] 110.92271  87.44801 107.84821  96.40073  94.05540
[11] 103.92445 108.29920 103.30896  90.64378 101.93417
[16] 104.98465 104.78447 101.35980 132.19438  93.98066
[21]  66.58195  88.89819  99.72429  67.95182  72.04780
[26]  59.89571 110.21253  93.68151  94.66022 109.50416
[31]  79.13363 120.83159  84.41475  89.10295 112.79365
[36]  97.52189 106.10858  69.67159  99.79406  91.11620
[41] 112.55720  86.77234  75.10422 122.06707  70.24902
[46] 101.42973 106.64096  76.63938  67.97055  93.60273
> t.test(x, mu=95)

	One Sample t-test

data:  x
t = 0.40834, df = 49, p-value = 0.6848
alternative hypothesis: true mean is not equal to 95
95 percent confidence interval:
  91.0636 100.9441
sample estimates:
mean of x 
 96.00386 

> t.test(x, mu=100)

        One Sample t-test

data:  x 
t = 0.3372, df = 49, p-value = 0.7374
alternative hypothesis: true mean is not equal to 100 
95 percent confidence interval:
  97.16167 103.98297 
sample estimates:
mean of x 
 100.5723 

Hypothesis: students taught with my learning method will differ from the population, whose mean is 60.

a = c(65, 78, 88, 55, 48, 95, 66, 57, 79, 81)

> t.test(a, mu=60)

	One Sample t-test

data:  a
t = 2.3079, df = 9, p-value = 0.0464
alternative hypothesis: true mean is not equal to 60
95 percent confidence interval:
 60.22187 82.17813
sample estimates:
mean of x 
     71.2 


qt(0.975, 9)
[1] 2.262157
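The t statistic can be reproduced by hand from t = (mean - mu) / (s / sqrt(n)) and compared against the qt() critical value above. A minimal sketch using the vector a:

t.stat <- (mean(a) - 60) / (sd(a) / sqrt(length(a)))
t.stat                                    # about 2.308, matching t.test(a, mu=60)
abs(t.stat) > qt(0.975, df=length(a)-1)   # TRUE: reject H0 at the 5% level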

Forming a Confidence Interval for a Mean

> set.seed(1024)
> x <- rnorm(50, mean=100, sd=15)
> s <- sd(x)
> m <- mean(x)
> n <- length(x)
> n
[1] 50
> m
[1] 96.00386
> s
[1] 17.38321
> SE <- s / sqrt(n)
> SE
[1] 2.458358
## qt(prob, df): the t quantile function; which score plays the role of the z value of about 2?
> qtv <- qt(.975, df=n-1)
> qtv
[1] 2.009575
## qtv is the critical t value (about 2) for a 95 percent confidence level
## the confidence interval is then
> E <- qtv*SE 
> E
[1] 4.940254
> m + c(-E, E)
[1]  91.0636 100.9441
> 
> t.test(x)

	One Sample t-test

data:  x
t = 39.052, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
  91.0636 100.9441
sample estimates:
mean of x 
 96.00386 
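The same interval can be read off the t.test object without recomputing it. A minimal sketch:

t.test(x)$conf.int                    # the 95% confidence interval as a numeric vector
t.test(x, conf.level=0.99)$conf.int   # a wider, 99% interval from the same data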

Testing for Normality

> shapiro.test(x)

	Shapiro-Wilk normality test

data:  x
W = 0.97415, p-value = 0.3386

The large p-value suggests the underlying population could be normally distributed. The next example reports a small p-value for y, so it is unlikely that this sample came from a normal population:
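The y data itself is not reproduced here; as a stand-in, a minimal sketch with an obviously non-normal (exponential) sample shows the small-p-value case:

set.seed(1)
y <- rexp(50)        # right-skewed sample, clearly not normal (hypothetical stand-in data)
shapiro.test(y)      # expect a small p-value, i.e. evidence against normality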

equal variances assumed → var.equal=TRUE
equal variances not assumed (Welch correction) → var.equal=FALSE

Comparing the Means of Two Samples

> mtcars
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
> 

> mtcars$mpg
 [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
> mtcars$am    # 0 = auto   1 = manual
 [1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1
> 
L = mtcars$am == 0 
mpg.auto = mtcars[L,]$mpg 
mpg.auto                    # automatic transmission mileage 
 [1] 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4 10.4 14.7 21.5 15.5 15.2
[18] 13.3 19.2

mpg.manual = mtcars[!L,]$mpg 
mpg.manual                  # manual transmission mileage 
 [1] 21.0 21.0 22.8 32.4 30.4 33.9 27.3 26.0 30.4 15.8 19.7 15.0 21.4

t.test(mpg.auto, mpg.manual)

	Welch Two Sample t-test

data:  mpg.auto and mpg.manual
t = -3.7671, df = 18.332, p-value = 0.001374
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.280194  -3.209684
sample estimates:
mean of x mean of y 
 17.14737  24.39231 

Or, using the formula interface:

> t.test(mtcars$mpg~mtcars$am)

	Welch Two Sample t-test

data:  mtcars$mpg by mtcars$am
t = -3.7671, df = 18.332, p-value = 0.001374
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.280194  -3.209684
sample estimates:
mean in group 0 mean in group 1 
       17.14737        24.39231 
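The group means reported as sample estimates can be checked directly with tapply(). A minimal sketch:

tapply(mtcars$mpg, mtcars$am, mean)   # mean mpg for am = 0 (automatic) and am = 1 (manual)
tapply(mtcars$mpg, mtcars$am, sd)     # the corresponding group standard deviations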

Another example:

> a = c(175, 168, 168, 190, 156, 181, 182, 175, 174, 179)
> b = c(185, 169, 173, 173, 188, 186, 175, 174, 179, 180)
> t.test(a,b, var.equal=TRUE, paired=FALSE)

Two Sample t-test

data: a and b
t = -0.9474, df = 18, p-value = 0.356
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  -10.93994 4.13994
sample estimates:
mean of x mean of y
    174.8     178.2


> qt(0.975, 18)
[1] 2.100922
> var.test(a,b)

     F test to compare two variances

data: a and b
F = 2.1028, num df = 9, denom df = 9, p-value = 0.2834
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
   0.5223017 8.4657950
sample estimates:
 ratio of variances
2.102784


> qf(0.95, 9, 9)
[1] 3.178893

The tabulated value of F for alpha = 0.05, with 9 numerator and 9 denominator degrees of freedom, is obtained with the function qf(p, df.num, df.den), as shown above.
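The F statistic reported by var.test() is simply the ratio of the two sample variances, which can be compared against the qf() cutoff by hand. A minimal sketch:

F.stat <- var(a) / var(b)    # ratio of sample variances, about 2.1028
F.stat > qf(0.95, 9, 9)      # FALSE: do not reject equal variances at the 5% level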

Another example uses the built-in sleep data set:

> sleep
   extra group ID
1    0.7     1  1
2   -1.6     1  2
3   -0.2     1  3
4   -1.2     1  4
5   -0.1     1  5
6    3.4     1  6
7    3.7     1  7
8    0.8     1  8
9    0.0     1  9
10   2.0     1 10
11   1.9     2  1
12   0.8     2  2
13   1.1     2  3
14   0.1     2  4
15  -0.1     2  5
16   4.4     2  6
17   5.5     2  7
18   1.6     2  8
19   4.6     2  9
20   3.4     2 10
> sleep_wide <- data.frame(
    ID=1:10,
    group1=sleep$extra[1:10],
    group2=sleep$extra[11:20]
)
sleep_wide
   ID group1 group2
1   1    0.7    1.9
2   2   -1.6    0.8
3   3   -0.2    1.1
4   4   -1.2    0.1
5   5   -0.1   -0.1
6   6    3.4    4.4
7   7    3.7    5.5
8   8    0.8    1.6
9   9    0.0    4.6
10 10    2.0    3.4

Ignore the ID variable for convenience.

# Welch t-test
t.test(extra ~ group, sleep)

	Welch Two Sample t-test

data:  extra by group
t = -1.8608, df = 17.776, p-value = 0.07939
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.3654832  0.2054832
sample estimates:
mean in group 1 mean in group 2 
           0.75            2.33

# Same for wide data (two separate vectors)
> t.test(sleep_wide$group1, sleep_wide$group2)

By default, t.test does not assume equal variances, so it performs Welch's two-sample t-test; here df = 17.776. To use Student's t-test, which assumes equal variances in the two groups, set var.equal=TRUE; then df = 18 (n1 + n2 - 2).

# Student t-test
> t.test(extra ~ group, sleep, var.equal=TRUE)

	Two Sample t-test

data:  extra by group
t = -1.8608, df = 18, p-value = 0.07919
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.363874  0.203874
sample estimates:
mean in group 1 mean in group 2 
            0.75            2.33
#  Same for wide data (two separate vectors)
> t.test(sleep_wide$group1, sleep_wide$group2, var.equal=TRUE)

Paired-sample t-test

You can also compare paired data using a paired-sample t-test. You might have observations before and after a treatment, or measurements on two matched subjects who received different treatments.

# Sort by group then ID
> sleep <- sleep[order(sleep$group, sleep$ID), ]

# Paired t-test
> t.test(extra ~ group, sleep, paired=TRUE)
 
 	Paired t-test
 
 data:  extra by group
 t = -4.0621, df = 9, p-value = 0.002833
 alternative hypothesis: true difference in means is not equal to 0
 95 percent confidence interval:
  -2.4598858 -0.7001142
 sample estimates:
 mean of the differences 
                   -1.58
# Same for wide data (two separate vectors)
> t.test(sleep_wide$group1, sleep_wide$group2, paired=TRUE)

	Paired t-test

data:  sleep_wide$group1 and sleep_wide$group2
t = -4.0621, df = 9, p-value = 0.002833
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -2.4598858 -0.7001142
sample estimates:
mean of the differences 
                  -1.58 

The paired t-test is equivalent to testing whether the difference between each pair of observations has a population mean of 0. (See below for comparing a single group to a population mean.)

> t.test(sleep_wide$group1 - sleep_wide$group2, mu=0)

	One Sample t-test

data:  sleep_wide$group1 - sleep_wide$group2
t = -4.0621, df = 9, p-value = 0.002833
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 -2.4598858 -0.7001142
sample estimates:
mean of x 
    -1.58 

Comparing a group against an expected population mean: one-sample t-test

Suppose that you want to test whether the data in column extra is drawn from a population whose true mean is 0. In this case, the group and ID columns are ignored.

t.test(sleep$extra, mu=0)

	One Sample t-test

data:  sleep$extra
t = 3.413, df = 19, p-value = 0.002918
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 0.5955845 2.4844155
sample estimates:
mean of x 
     1.54
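A one-sided alternative can be requested with the alternative argument. A minimal sketch testing whether the mean of extra is greater than 0:

t.test(sleep$extra, mu=0, alternative="greater")   # H1: true mean > 0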

Paired t-test

This is the repeated-measures case: the same units are measured twice.

> library(MASS)         # load the MASS package 
> head(immer) 
  Loc Var    Y1    Y2
1  UF   M  81.0  80.7
2  UF   S 105.4  82.3
3  UF   V 119.7  80.4
4  UF   T 109.7  87.2
5  UF   P  98.3  84.2
6   W   M 146.6 100.4

> t.test(immer$Y1, immer$Y2, paired=TRUE) 

	Paired t-test

data:  immer$Y1 and immer$Y2
t = 3.324, df = 29, p-value = 0.002413
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  6.121954 25.704713
sample estimates:
mean of the differences 
               15.91333 
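As with the sleep data, the paired test is equivalent to a one-sample t-test on the within-pair differences. A minimal sketch:

t.test(immer$Y1 - immer$Y2, mu=0)   # same t, df, and p-value as the paired test above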
