---
title: "Sampling Distributions (Try 2)"
author: "Evan L. Ray"
date: "October 30, 2017"
output: ioslides_presentation
---
```{r setup, include=FALSE}
# Chunk defaults for the whole deck: hide the code and cache chunk results.
knitr::opts_chunk$set(echo = FALSE, cache = TRUE)
# Attach packages with library() rather than require(): require() only warns
# and returns FALSE when a package is missing, which would let the render
# continue and fail later with a less helpful error.
library(ggplot2)
library(scales)
library(dplyr)
library(tidyr)
library(readr)
library(mosaic)
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, message = FALSE, warning = FALSE}
# Read the babies CSV from the course site and keep only the rows that have a
# recorded (non-missing) gestation value.
babies <- read_csv("https://mhc-stat140-2017.github.io/data/misc/babies1998/babies_dec_1998.csv") %>%
  filter(!is.na(gestation))
# Seed the random number generator before any sampling is done.
set.seed(1)
```
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Draw one random sample of 10 babies and start the running `sample_means`
# collection with the mean of this first sample.
babies_sample <- sample_n(babies, size = 10)
sample_means <- data.frame(sample_mean = mean(babies_sample$gestation))
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Take a fresh random sample of 10 babies and append its mean to the running
# `sample_means` collection.
babies_sample <- sample_n(babies, size = 10)
new_sample_mean <- data.frame(sample_mean = mean(babies_sample$gestation))
sample_means <- bind_rows(sample_means, new_sample_mean)
# Density-scaled histogram of this sample, with its mean marked in red.
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies_sample$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  # x-axis limits come from the population, not from the sample.
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height = 1.25}
# Density-scaled histogram of the sample means collected so far, drawn on the
# x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 1
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution of Sample Mean
* The **sampling distribution** is the distribution of values of the sample mean, across all different samples of a certain size $n$.
```{r, echo = FALSE, fig.height=1.25}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.25}
# Repeat 10,000 times: draw a sample of 10 babies and record its mean. Only
# the `sample_mean` column of the repeated results is kept.
simulated_means <- {
  do(10000) * {
    babies_sample <- babies %>% sample_n(size = 10)
    data.frame(sample_mean = mean(babies_sample$gestation))
  }
} %>%
  select(sample_mean)
# Append the simulated means to the means already collected one at a time.
sample_means <- bind_rows(sample_means, simulated_means)
```
```{r, echo = FALSE, warning = FALSE, fig.height=1.25}
# Placeholder for the sample panel: the sample histogram is drawn fully
# transparent (alpha = 0) and a text label is drawn on the same axes.
placeholder_label <- data.frame(
  x = 32,
  y = 0.15,
  label = "( ...draw 10,000 more samples of size n... )"
)
ggplot() +
  geom_histogram(
    data = babies_sample,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1,
    alpha = 0
  ) +
  geom_text(
    data = placeholder_label,
    mapping = aes(x = x, y = y, label = label),
    size = 8
  ) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, warning = FALSE, fig.height = 1.25}
# Density-scaled histogram of all collected n = 10 sample means, using
# narrower bins (0.2) and the x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 0.2
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sampling Distribution Depends on $n$
* Always centered at the population mean, but as $n$ increases:
    * standard deviation gets smaller
    * distribution looks more normal
```{r, echo = FALSE, fig.height=1.05}
# Population panel: density-scaled histogram of every gestation time in
# `babies`, with a red vertical line at the population mean.
ggplot() +
  geom_histogram(
    data = babies,
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(babies$gestation)),
    color = "red"
  ) +
  xlab("Gestation Time (weeks) -- Population") +
  # x-axis limits are the observed range of the population values.
  xlim(range(babies$gestation)) +
  # Zero outer margin on all four sides of the plot.
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, warning = FALSE, fig.height = 1.05}
# Density-scaled histogram of all collected n = 10 sample means, using
# narrower bins (0.2) and the x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 0.2
  ) +
  xlab("Sample Means, n = 10") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
```{r, echo = FALSE, fig.height=1.05}
# Repeat 10,000 times: draw a sample of 1000 babies and record its mean. Only
# the `sample_mean` column of the repeated results is kept.
#
# The result is deliberately NOT combined with `sample_means`: that object
# holds the means of samples of size 10, and binding it in here would make the
# "n = 1000" histogram show a mixture of n = 10 and n = 1000 sample means.
sample_means_1000 <- {
  do(10000) * {
    babies_sample <- babies %>% sample_n(size = 1000)
    data.frame(
      sample_mean = mean(babies_sample$gestation)
    )
  }
} %>%
  select(sample_mean)
```
```{r, echo = FALSE, warning = FALSE, fig.height = 1.05}
# Density-scaled histogram of the values in `sample_means_1000`, using bins of
# width 0.2 and the x-axis range of the population values.
ggplot() +
  geom_histogram(
    data = sample_means_1000,
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = 0.2
  ) +
  xlab("Sample Means, n = 1000") +
  xlim(range(babies$gestation)) +
  theme(plot.margin = unit(rep(0, 4), units = "cm"))
```
## Sample Mean: Central Limit Theorem
* $Y_1, Y_2, \ldots, Y_n$ are independent observations of a quantitative variable
* Population has mean $\mu$ and standard deviation $\sigma$
* Compute the sample mean: $\bar{Y} = \frac{1}{n}\sum_{i=1}^n Y_i$
* The sampling distribution of $\bar{Y}$:
    * has mean $\mu$
    * has standard deviation $\sigma/\sqrt{n}$
    * for large enough $n$, it is approximately normal
* putting this together: the sampling distribution of $\bar{Y}$ is approximately Normal($\mu$, $\sigma/\sqrt{n}$) for large enough $n$.
## More on $n$
* The sample size required for the sample mean to be approximately normally distributed depends on the population distribution