b:head_first_statistics:visualization
Differences
This shows you the differences between two versions of the page.
| Both sides previous revision · Previous revision · Next revision | Previous revision | ||
| b:head_first_statistics:visualization [2023/09/11 08:11] – [Scatter plot] hkimscil | b:head_first_statistics:visualization [2025/09/08 08:22] (current) – [Histogram Modality] hkimscil | ||
|---|---|---|---|
| Line 79: | Line 79: | ||
| | 999 | 2 | | | 999 | 2 | | ||
| + | {{: | ||
| in R . . . . | in R . . . . | ||
| Line 90: | Line 90: | ||
| hist(dat, breaks=5) | hist(dat, breaks=5) | ||
| </ | </ | ||
| + | {{: | ||
| + | < | ||
| + | dat.iq <- rnorm(1000, 100, 15) | ||
| + | head(dat.iq) | ||
| + | tail(dat.iq) | ||
| + | head(dat.iq, | ||
| + | tail(dat.iq, | ||
| + | |||
| + | mean(dat.iq) | ||
| + | sd(dat.iq) | ||
| + | |||
| + | hist(dat.iq) | ||
| + | hist(dat.iq, | ||
| + | |||
| + | set.seed(101) | ||
| + | dat.iq <- rnorm(1000, 100, 15) | ||
| + | head(dat.iq) | ||
| + | tail(dat.iq) | ||
| + | head(dat.iq, | ||
| + | tail(dat.iq, | ||
| + | |||
| + | mean(dat.iq) | ||
| + | sd(dat.iq) | ||
| + | |||
| + | hist(dat.iq) | ||
| + | hist(dat.iq, | ||
| + | </ | ||
| ====== Scatter plot ====== | ====== Scatter plot ====== | ||
| < | < | ||
| Line 138: | Line 165: | ||
| | | ||
| - | {{:c:ps1-1:2019:pasted:20190909-075028.png}} | + | {{:b:head_first_statistics:pasted:20240904-083016.png}} |
| explanatory (설명) variable at x axis | explanatory (설명) variable at x axis | ||
| Line 146: | Line 173: | ||
| Drawing a line among the data. | Drawing a line among the data. | ||
| + | |||
| < | < | ||
| abline(lm(mpg~wt), | abline(lm(mpg~wt), | ||
| - | lines(lowess(wt, | + | </ |
| - | {{:c:ps1-1:2019:pasted:20190909-075639.png}} | + | {{:b:head_first_statistics:pasted:20240904-083157.png}} |
| + | Outlier에 대한 주의 | ||
| + | [{{: | ||
| + | <WRAP clear /> | ||
| - | A bit more fancy line | ||
| - | < | ||
| - | # by Number of Car Cylinders | ||
| - | library(car) | ||
| - | scatterplot(mpg ~ wt | cyl, data=mtcars, | ||
| - | | ||
| - | | ||
| - | | ||
| - | {{: | ||
| - | Line can be: | + | ====== Presentation ====== |
| + | For a very good example, see | ||
| + | https:// | ||
| + | * Life expectancy data: {{: | ||
| - | **__관계의 방향 (direction)__** | + | <WRAP clear/> |
| - | ^ 관계의 방향 | + | ====== Histogram skewness ====== |
| - | | {{:r.positive.png}} | {{: | + | <WRAP column half> |
| + | < | ||
| + | #### | ||
| + | # left-skewed distribution | ||
| + | # 1. | ||
| + | set.seed(1) | ||
| + | data <- rbeta(500, shape1 = 10, shape2 = 2) | ||
| + | hist(data, probability = TRUE, | ||
| + | main = " | ||
| + | xlab = " | ||
| + | col = " | ||
| + | # 2. | ||
| + | # install.packages(" | ||
| + | library(fitdistrplus) | ||
| - | **__관계의 모양 | + | fit <- fitdist(data, " |
| - | ^ 관계의 모양 | + | alpha_est <- fit$estimate[" |
| - | | {{: | + | beta_est <- fit$estimate[" |
| - | **__관계의 정도 (힘)__** | + | # 3. |
| - | ^ 관계의 정도 (힘) ^^ | + | curve(dbeta(x, shape1 = alpha_est, shape2 = beta_est), |
| - | | [{{: | + | add = TRUE, col = " |
| - | | [{{: | + | </ |
| - | <WRAP clear /> | + | </WRAP> |
| - | Pearson' | + | |
| - | __Relations, | + | |
| - | [{{: | + | |
| - | <WRAP clear /> | + | |
| - | __Interpretation with limited range__ | + | <WRAP column half> |
| - | [{{:r_eg.15.71.png? | + | {{:b:head_first_statistics: |
| - | [{{:r_eg.15.7b1.png?250 |Figure_7._Correlation_And_Range}}] | + | </ |
| - | 데이터의 [[Range]]에 대한 판단에 신중해야 한다. 왜냐 하면, 데이터의 어느 곳을 자르느냐에 따라서 r 값이 심하게 변하기 때문이다. | + | <WRAP clear/> |
| - | <WRAP clear /> | + | <WRAP column half> |
| - | __Outliers__ | + | < |
| - | [{{: | + | set.seed(1) |
| - | [{{:r_eg.15.8b.png? | + | data <- rbeta(500, shape1 = 10, shape2 = 10) |
| - | 위의 설명과 관련하여, 만약에 아주 심한 Outlier가 존재한다면 두 변인 간의 상관관계에 심한 영향을 준다. | + | hist(data, probability = TRUE, |
| - | [{{: | + | main = " |
| + | xlab = " | ||
| + | col = " | ||
| - | make it sure that there is __no data entry error__. | + | # 2. |
| - | {{:r.crime.scatterplot.for.single.by.state.jpg}} | + | # install.packages(" |
| + | library(fitdistrplus) | ||
| + | fit <- fitdist(data, | ||
| + | alpha_est <- fit$estimate[" | ||
| + | beta_est <- fit$estimate[" | ||
| - | <WRAP clear /> | + | # 3. |
| + | curve(dbeta(x, | ||
| + | add = TRUE, col = " | ||
| + | </code> | ||
| + | </WRAP> | ||
| - | see | + | <WRAP column half> |
| - | https:// | + | {{:b: |
| - | * Life expectancy data: {{:life.exp.csv}} | + | </ |
| + | <WRAP clear/> | ||
| + | <WRAP column half> | ||
| < | < | ||
| - | le <- as.data.frame(read.csv("http:// | + | ## |
| - | colnames(le)[1] <- "c.code" | + | # right-skewed distribution |
| - | lea <- le$X2017 | + | # 1. |
| - | leb <- lea[complete.cases(lea)] | + | set.seed(1) |
| - | hist(leb, color="grey") | + | data <- rbeta(500, shape1 = 2, shape2 = 10) |
| + | hist(data, probability = TRUE, | ||
| + | main = "Histogram with Right-skewed Distribution", | ||
| + | | ||
| + | col = " | ||
| + | |||
| + | # install.packages(" | ||
| + | library(fitdistrplus) | ||
| + | |||
| + | fit <- fitdist(data, | ||
| + | alpha_est | ||
| + | beta_est | ||
| + | |||
| + | # | ||
| + | curve(dbeta(x, shape1 = alpha_est, shape2 = beta_est), | ||
| + | add = TRUE, col = "red", lwd = 2) | ||
| </ | </ | ||
| + | </ | ||
| + | <WRAP column half> | ||
| + | {{: | ||
| + | </ | ||
| + | <WRAP clear/> | ||
| - | [{{:c:ps1-1:2019:pasted:20190909-110252.png|Life expectancy in 2017}}] | + | ====== Histogram Modality ====== |
| - | <WRAP clear/> | + | <WRAP column half> |
| - | [{{:c:ps1-1:2019:pasted:20190909-104759.png|Distribution of temperature}}] | + | Unimodal distribution |
| - | <WRAP clear/> | + | < |
| - | [{{:c:ps1-1: | + | ### unimodal data |
| - | <WRAP clear/>. | + | set.seed(1) |
| - | [{{:c:ps1-1:2019:pasted:20190909-111001.png|modality}}] | + | d.1 <- rnorm(500, 10, 2) |
| - | <WRAP clear/>. | + | hist(d.1, breaks = 30, probability = T, |
| - | box plot | + | main = "Hist with Unimodal distrib", |
| + | xlab = " | ||
| + | col = " | ||
| + | lines(density(d.1), | ||
| + | col = " | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | <WRAP column half> | ||
| + | {{:b:head_first_statistics:pasted:20250903-083409.png}} | ||
| + | </ | ||
| + | |||
| + | <WRAP clear/> | ||
| + | |||
| + | Bimodal distribution | ||
| + | <WRAP column half> | ||
| + | < | ||
| + | ### bimodal data | ||
| + | set.seed(1) | ||
| + | d.1 <- rnorm(500, 10, 2) | ||
| + | d.2 <- rnorm(500, 20, 2) | ||
| + | d.all <- c(d.1, d.2) | ||
| + | hist(d.all, breaks = 30, probability = T, | ||
| + | main = "Hist with bimodal distrib", | ||
| + | xlab = " | ||
| + | col = " | ||
| + | lines(density(d.all), | ||
| + | col = " | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | <WRAP column half> | ||
| + | {{:b:head_first_statistics:pasted:20250903-083524.png}} | ||
| + | </ | ||
| + | <WRAP clear/> | ||
| + | |||
| + | <WRAP column half> | ||
| + | < | ||
| + | ### multi-modal data | ||
| + | # Parameters for the first normal distribution (Mode 1) | ||
| + | m.1 <- 50 | ||
| + | sd.1 <- 5 | ||
| + | |||
| + | # Parameters for the second normal distribution (Mode 2) | ||
| + | m.2 <- 100 | ||
| + | sd.2 <- 15 | ||
| + | |||
| + | m.3 <- 160 | ||
| + | sd.3 <- 6 | ||
| + | |||
| + | # Cumulative cutoff for Mode 1 (share 0.3) | ||
| + | prop.1 <- 0.3 | ||
| + | # Cumulative cutoff for Mode 2 (share 0.6 - 0.3 = 0.3) | ||
| + | prop.2 <- 0.6 # compared against runif(1) after the prop.1 check | ||
| + | # Cumulative cutoff for Mode 3 (remaining share, 1.0 - 0.6 = 0.4) | ||
| + | prop.3 <- 1.0 # upper bound of runif(1) | ||
| + | |||
| + | # Number of samples to generate | ||
| + | n.sam <- 1000 | ||
| + | |||
| + | # Create an empty vector to store the combined samples | ||
| + | |||
| + | mm.dist <- numeric(n.sam) | ||
| + | set.seed(1) | ||
| + | for (i in 1: | ||
| + | # Randomly choose which distribution to sample from | ||
| + | tmp <- runif(1) | ||
| + | if (tmp < prop.1) | ||
| + | mm.dist[i] <- rnorm(1, mean = m.1, sd = sd.1) | ||
| + | } else if (tmp < prop.2) { | ||
| + | mm.dist[i] <- rnorm(1, mean = m.2, sd = sd.2) | ||
| + | | ||
| + | mm.dist[i] <- rnorm(1, mean = m.3, sd = sd.3) | ||
| + | } | ||
| + | |||
| + | } | ||
| + | |||
| + | hist(mm.dist, | ||
| + | main = " | ||
| + | xlab = " | ||
| + | freq = FALSE, probability = T, | ||
| + | col = " | ||
| + | lines(density(mm.dist), | ||
| + | col = " | ||
| + | |||
| + | </code> | ||
| + | </ | ||
| + | <WRAP column half> | ||
| + | {{:b:head_first_statistics:pasted:20250908-082219.png}} | ||
| + | </ | ||
| + | <WRAP clear/> | ||
| + | |||
| + | |||
| + | ====== | ||
| + | <WRAP column half> | ||
| < | < | ||
| # Boxplot of MPG by Car Cylinders | # Boxplot of MPG by Car Cylinders | ||
| Line 228: | Line 388: | ||
| ylab=" | ylab=" | ||
| </ | </ | ||
| - | {{: | + | </ |
| + | <WRAP column half> | ||
| + | {{: | ||
| + | </ | ||
| + | <WRAP clear/> | ||
| + | ====== see also ====== | ||
| + | https:// | ||
b/head_first_statistics/visualization.1694387515.txt.gz · Last modified: by hkimscil
