---
title: "Sampling Distributions (Try 2)"
author: "Evan L. Ray"
date: "October 30, 2017"
output: ioslides_presentation
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, cache = TRUE)

# library() stops with an error when a package is missing; require() would
# only return FALSE and let the document fail later with a confusing message.
library(ggplot2)
library(scales)
library(dplyr)
library(tidyr)
library(readr)
library(mosaic)

# Helpers for the repeated three-panel slides below.
# NOTE: chunks are cached (cache = TRUE above); clear the knitr cache after
# editing these helpers so the cached figures are regenerated.

# Density histogram of the `gestation` column of `data` (bins of width 1)
# with a red vertical line at the mean of `data$gestation`.
#
# data:       data frame with a numeric `gestation` column to plot.
# population: data frame whose `gestation` range fixes the x axis limits, so
#             that panels stacked on a slide share the same horizontal scale.
# x_label:    x axis label.
# Returns the ggplot object (auto-printed when called at chunk top level).
plot_gestation <- function(data, population, x_label) {
  ggplot() +
    geom_histogram(
      mapping = aes(x = gestation, y = ..density..),
      binwidth = 1,
      data = data
    ) +
    geom_vline(xintercept = mean(data$gestation), color = "red") +
    xlim(range(population$gestation)) +
    xlab(x_label) +
    theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
}

# Density histogram of the `sample_mean` column of `means` (bins of width 1),
# drawn on the x axis range of `population$gestation`.
#
# means:      data frame with a numeric `sample_mean` column.
# population: data frame whose `gestation` range fixes the x axis limits.
# Returns the ggplot object.
plot_sample_means <- function(means, population) {
  ggplot() +
    geom_histogram(
      mapping = aes(x = sample_mean, y = ..density..),
      binwidth = 1,
      data = means
    ) +
    xlim(range(population$gestation)) +
    xlab("Sample Means, n = 10") +
    theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
}
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, message = FALSE, warning = FALSE}
# Load the data, drop rows with a missing gestation time, and fix the seed
# used by the sample_n() calls on the following slides.
babies <- read_csv("https://mhc-stat140-2017.github.io/data/misc/babies1998/babies_dec_1998.csv")
babies <- filter(babies, !is.na(gestation))
set.seed(1)
```

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# First sample of size 10; its mean starts the collection of sample means.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- data.frame(
  sample_mean = mean(babies_sample$gestation)
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples

```{r, echo = FALSE, fig.height=1.25}
plot_gestation(babies, babies, "Gestation Time (weeks) -- Population")
```

```{r, echo = FALSE, fig.height=1.25}
# Draw another sample of size 10 and append its mean to the collection.
babies_sample <- babies %>%
  sample_n(size = 10)
sample_means <- bind_rows(
  sample_means,
  data.frame(sample_mean = mean(babies_sample$gestation))
)
plot_gestation(babies_sample, babies, "Gestation Time (weeks) -- Sample, n = 10")
```

```{r, echo = FALSE, fig.height = 1.25}
plot_sample_means(sample_means, babies)
```

## Sampling Distribution of Sample Mean

* The **sampling distribution** is the distribution of values of the sample mean, across all different samples of a certain size $n$.

```{r, echo = FALSE, fig.height=1.25}
# Population histogram with the population mean marked in red.
ggplot() +
  geom_histogram(
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1,
    data = babies
  ) +
  geom_vline(xintercept = mean(babies$gestation), color = "red") +
  xlim(range(babies$gestation)) +
  xlab("Gestation Time (weeks) -- Population") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, fig.height=1.25}
# Append the means of 10,000 further samples of size 10 to the means
# collected so far. The outer braces are required because %>% binds more
# tightly than `*`; select() keeps only the sample_mean column of do()'s
# result so the rows line up with the existing data frame.
sample_means <- bind_rows(
  sample_means,
  {
    do(10000) * {
      babies_sample <- babies %>%
        sample_n(size = 10)
      data.frame(sample_mean = mean(babies_sample$gestation))
    }
  } %>%
    select(sample_mean)
)
```

```{r, echo = FALSE, warning = FALSE, fig.height=1.25}
# Placeholder panel: the histogram is fully transparent (alpha = 0), so only
# the text label is visible in the slot where a sample was shown before.
ggplot() +
  geom_histogram(
    mapping = aes(x = gestation, y = ..density..),
    alpha = 0,
    binwidth = 1,
    data = babies_sample
  ) +
  geom_text(
    mapping = aes(x = x, y = y, label = label),
    size = 8,
    data = data.frame(
      x = 32,
      y = 0.15,
      label = "( ...draw 10,000 more samples of size n... )"
    )
  ) +
  xlim(range(babies$gestation)) +
  xlab("Gestation Time (weeks) -- Sample, n = 10") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.25}
# Histogram of all sample means collected so far (n = 10), with narrower
# bins than on the earlier slides.
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means
  ) +
  xlim(range(babies$gestation)) +
  xlab("Sample Means, n = 10") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

## Sampling Distribution Depends on $n$

* Always centered at population mean, but as $n$ increases:
    * standard deviation gets smaller
    * distribution looks more normal

```{r, echo = FALSE, fig.height=1.05}
ggplot() +
  geom_histogram(
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1,
    data = babies
  ) +
  geom_vline(xintercept = mean(babies$gestation), color = "red") +
  xlim(range(babies$gestation)) +
  xlab("Gestation Time (weeks) -- Population") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.05}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means
  ) +
  xlim(range(babies$gestation)) +
  xlab("Sample Means, n = 10") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, fig.height=1.05}
# Means of 10,000 samples of size 1000. These are kept separate from
# `sample_means` (which holds n = 10 means): binding the two together would
# make the "n = 1000" panel below show a mixture of both sample sizes.
sample_means_1000 <- {
  do(10000) * {
    babies_sample <- babies %>%
      sample_n(size = 1000)
    data.frame(sample_mean = mean(babies_sample$gestation))
  }
} %>%
  select(sample_mean)
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.05}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means_1000
  ) +
  xlim(range(babies$gestation)) +
  xlab("Sample Means, n = 1000") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

## Sample Mean: Central Limit Theorem

* $Y_1, Y_2, \ldots, Y_n$ are independent observations of a quantitative variable
* Population has mean $\mu$ and standard deviation $\sigma$
* Compute the sample mean: $\bar{Y} = \frac{1}{n}\sum_{i=1}^n Y_i$
* The sampling distribution of $\bar{Y}$:
    * has mean $\mu$
    * has standard deviation $\sigma/\sqrt{n}$
    * for large enough $n$, it is approximately normal
    * putting this together: the sampling distribution of $\bar{Y}$ is approximately Normal($\mu$, $\sigma/\sqrt{n}$) for large enough $n$.

## More on $n$

* The sample size required for the sample mean to be normally distributed depends on the population distribution

```{r, echo = FALSE, fig.height=1.15, fig.width = 3.5}
# Left column: gestation times. Population histogram with its mean in red.
ggplot() +
  geom_histogram(
    mapping = aes(x = gestation, y = ..density..),
    binwidth = 1,
    data = babies
  ) +
  geom_vline(xintercept = mean(babies$gestation), color = "red") +
  xlim(range(babies$gestation)) +
  xlab("Gestation Time (weeks) -- Population") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.15, fig.width = 3.5}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means
  ) +
  xlim(range(babies$gestation)) +
  xlab("Sample Means, n = 10") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE}
# Means of 10,000 gestation samples of size 30. Kept separate from
# `sample_means` (n = 10) so the "n = 30" panel is not a mixture of two
# sample sizes, and given its own name so it is not confused with the CEO
# sample means computed further down.
babies_sample_means_30 <- {
  do(10000) * {
    babies_sample <- babies %>%
      sample_n(size = 30)
    data.frame(sample_mean = mean(babies_sample$gestation))
  }
} %>%
  select(sample_mean)
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.15, fig.width = 3.5}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = babies_sample_means_30
  ) +
  xlim(range(babies$gestation)) +
  xlab("Sample Means, n = 30") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, message = FALSE, warning = FALSE}
# Right column: CEO pay. Copy the pay column to a syntactic name and keep
# only rows with positive pay.
ceo_salaries <- read_csv("https://mhc-stat140-2017.github.io/data/sdm4/CEO_Salary_2012.csv")
ceo_salaries <- ceo_salaries %>%
  mutate(pay = `1-Year Pay ($mil)`) %>%
  filter(pay > 0)
```

```{r, echo = FALSE}
# Means of 10,000 CEO pay samples of size 10.
sample_means_10 <- do(10000) * {
  ceo_salaries_sample <- ceo_salaries %>%
    sample_n(size = 10)
  data.frame(sample_mean = mean(ceo_salaries_sample$pay))
}
```

```{r, echo = FALSE}
# Means of 10,000 CEO pay samples of size 30. The variable name says 100 but
# the sample size is 30; the name is left unchanged in case later chunks
# refer to it.
sample_means_100 <- do(10000) * {
  ceo_salaries_sample <- ceo_salaries %>%
    sample_n(size = 30)
  data.frame(sample_mean = mean(ceo_salaries_sample$pay))
}
```

```{r, echo = FALSE, fig.height=1.15, fig.width = 3.5, warning = FALSE}
ggplot() +
  geom_histogram(
    mapping = aes(x = pay, y = ..density..),
    binwidth = 1,
    data = ceo_salaries
  ) +
  geom_vline(xintercept = mean(ceo_salaries$pay), color = "red") +
  xlim(range(ceo_salaries$pay)) +
  xlab("CEO Pay (Millions of $) -- Population") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.15, fig.width = 3.5}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means_10
  ) +
  xlim(range(ceo_salaries$pay)) +
  xlab("Sample Means, n = 10") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

```{r, echo = FALSE, warning = FALSE, fig.height = 1.15, fig.width = 3.5}
ggplot() +
  geom_histogram(
    mapping = aes(x = sample_mean, y = ..density..),
    binwidth = .2,
    data = sample_means_100
  ) +
  xlim(range(ceo_salaries$pay)) +
  xlab("Sample Means, n = 30") +
  theme(plot.margin = unit(x = c(0, 0, 0, 0), units = "cm"))
```

## Estimating the Success Probability

* Suppose we want to estimate the proportion $p$ of US households that own the home they live in.
* We take a sample of size $n$ and count the number of households in our sample that own their home: $$X \sim \text{Binomial}(n, p)$$
* How can we estimate $p$ using $X$?

## Sampling distribution of $\hat{p}$

* We will estimate the probability of success using $$\hat{p} = \frac{X}{n}$$
* Remember that we can write $X$ as a sum of independent Bernoulli random variables: $X = X_1 + X_2 + \cdots + X_n$
* So $\hat{p} = \frac{X}{n} = \frac{1}{n} \sum_i X_i$ is a sample mean of independent Bernoulli random variables
* Since $\hat{p} = \frac{1}{n} \sum_i X_i$, the Central Limit Theorem tells us the approximate sampling distribution of $\hat{p}$, for large enough $n$.
* https://istats.shinyapps.io/SampDist_Prop/

## Sampling distribution of $\hat{p}$

* For a single Bernoulli random variable,
    * $E(X_i) = p$
    * $SD(X_i) = \sqrt{p(1 - p)}$
* The CLT says that for large enough $n$, the sampling distribution of $\hat{p}$ is approximately $$\hat{p} \sim \text{Normal}(p, \sqrt{p(1 - p)/n})$$
* For estimating a proportion/probability $p$, we say $n$ is large enough if the **success/failure** condition is satisfied:
    * $np \geq 10$ and $n(1 - p) \geq 10$