Central Limit Theorem

If \(Y_1, Y_2, \ldots, Y_n\) are independent observations from a population having mean \(\mu\) and finite standard deviation \(\sigma\), then the sampling distribution of \(\bar{Y} = \frac{1}{n}\sum_{i=1}^n Y_i\) is approximately Normal(\(\mu\), \(\sigma/\sqrt{n}\)) for large enough \(n\).
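
For a quick numerical illustration (this is a standalone sketch, not part of the lab's data sets), the code below draws 1000 samples of size 50 from a skewed Exponential(1) population, whose mean and standard deviation are both 1, and checks that the sample means behave roughly as the theorem predicts:

# Standalone sketch: sample means from a skewed Exponential(1) population
sample_means <- replicate(1000, mean(rexp(50, rate = 1)))
mean(sample_means)  # should be close to the population mean, 1
sd(sample_means)    # should be close to 1 / sqrt(50), about 0.14
hist(sample_means)  # roughly bell-shaped even though the population is skewed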

As a follow-up: suppose \(X \sim \text{Binomial}(n, p)\), and we estimate the probability of success by the proportion of successes in our sample, \(\hat{p} = X / n\). Then \(\hat{p}\) approximately follows a Normal(\(p\), \(\sqrt{p(1 - p)/n}\)) distribution for large enough \(n\).
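
As a small illustration of this normal approximation (the values of \(n\) and \(p\) below are made up for the example), we can compare an exact binomial probability with the corresponding normal probability; the two should be reasonably close:

# Made-up example: n = 100 trials with success probability p = 0.3
n <- 100
p <- 0.3
pbinom(35, size = n, prob = p)                     # exact: P(p-hat <= 0.35) = P(X <= 35)
pnorm(0.35, mean = p, sd = sqrt(p * (1 - p) / n))  # normal approximation for p-hat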

Means of Quantitative Variables

Let’s explore the Central Limit Theorem with two different data sets containing measures of quantitative variables.

Gestation Times

The following data set contains a record of every baby born in the United States in December 1998, from the Census Bureau and National Center for Health Statistics. For each baby, we have several variables, including the mother’s age and highest educational attainment, marital status, the baby’s gestation time in weeks (how many weeks pregnant the mother was when she gave birth), birth weight, and so on. In the original data, there were some records where the gestation time was missing; I have removed those records from the data set, since we’re going to analyze the gestation times in this lab.

# Packages used throughout this lab: the tidyverse (read_csv, filter, sample_n, ggplot) and mosaic (do)
library(tidyverse)
library(mosaic)

babies <- read_csv("https://mhc-stat140-2017.github.io/data/misc/babies1998/babies_dec_1998.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   momage = col_integer(),
##   momeduc = col_integer(),
##   mommarital = col_integer(),
##   numlive = col_integer(),
##   prenatalstart = col_integer(),
##   fage = col_integer(),
##   dobmm = col_integer(),
##   gestation = col_integer(),
##   sex = col_character(),
##   birthweight = col_integer(),
##   apgar5 = col_integer(),
##   smoked = col_integer(),
##   numcigarette = col_integer(),
##   alcohol = col_character(),
##   numdrinks = col_integer()
## )
babies <- filter(babies, !is.na(gestation))
head(babies)
## # A tibble: 6 x 16
##      X1 momage momeduc mommarital numlive prenatalstart  fage dobmm
##   <int>  <int>   <int>      <int>   <int>         <int> <int> <int>
## 1 54965     23      12          2       1             0    24    12
## 2 54966     32       8          1       2             3    44    12
## 3 55094     25      10          2       1             4    21    12
## 4 55156     32      14          1       3             2    35    12
## 5 55189     18      12          2       0             3    19    12
## 6 55193     19      NA          2       1             3    NA    12
## # ... with 8 more variables: gestation <int>, sex <chr>,
## #   birthweight <int>, apgar5 <int>, smoked <int>, numcigarette <int>,
## #   alcohol <chr>, numdrinks <int>
str(babies)
## Classes 'tbl_df', 'tbl' and 'data.frame':    330717 obs. of  16 variables:
##  $ X1           : int  54965 54966 55094 55156 55189 55193 55194 55310 55313 55320 ...
##  $ momage       : int  23 32 25 32 18 19 22 19 23 20 ...
##  $ momeduc      : int  12 8 10 14 12 NA 10 NA 11 11 ...
##  $ mommarital   : int  2 1 2 1 2 2 2 2 1 1 ...
##  $ numlive      : int  1 2 1 3 0 1 1 0 0 0 ...
##  $ prenatalstart: int  0 3 4 2 3 3 2 1 1 2 ...
##  $ fage         : int  24 44 21 35 19 NA 31 NA 27 23 ...
##  $ dobmm        : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ gestation    : int  41 47 37 35 37 35 41 40 38 38 ...
##  $ sex          : chr  "male" "male" "male" "male" ...
##  $ birthweight  : int  3310 3095 3430 3742 3657 2575 2740 3410 3095 3204 ...
##  $ apgar5       : int  9 6 9 9 10 9 9 9 9 9 ...
##  $ smoked       : int  2 2 1 2 2 2 2 2 2 2 ...
##  $ numcigarette : int  0 0 10 0 0 0 0 0 0 0 ...
##  $ alcohol      : chr  "no" "no" "no" "no" ...
##  $ numdrinks    : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 16
##   .. ..$ X1           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ momage       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ momeduc      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ mommarital   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ numlive      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ prenatalstart: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ fage         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ dobmm        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ gestation    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ sex          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ birthweight  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ apgar5       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ smoked       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ numcigarette : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ alcohol      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numdrinks    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Descriptive Analysis and Population Parameters

With this data set, we’re in the unusual position of having data for an entire population: every baby born in December 1998. Make a histogram or density plot of the gestation time for each baby (gestation), and calculate the mean, median, and standard deviation of the gestation times. Note that the mean, median, and standard deviation you calculate here are Population Parameters, since they are numbers that describe the population. You should see that the distribution of gestation times is skewed to the left, but isn’t too far off from following a normal model.

SOLUTION:

ggplot() +
  geom_histogram(mapping = aes(x = gestation), data = babies)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean(babies$gestation)
## [1] 38.8
median(babies$gestation)
## [1] 39
sd(babies$gestation)
## [1] 2.61

Sample Means

The following code takes a random sample of 10 babies from the population of all babies born in December 1998, and calculates the mean gestation time for the babies in the sample. Since this mean describes a sample, it is a sample statistic. Go ahead and run this chunk a few times. You will see that for every sample you take, you get a different sample mean.

babies_sample <- sample_n(babies, size = 10)
mean(babies_sample$gestation)
## [1] 37.8

How do the sample means you got from samples of size 10 compare to the population mean you calculated above?

SOLUTION:

The sample means are fairly close to the population mean of 38.8 weeks.

Sampling Distribution of Sample Mean for \(n = 10\)

The R code in the chunk below repeats this process of drawing samples of size 10 and computing the resulting sample mean 1000 times, and saves the results in a data frame called simulation_results. It then plots a density curve of these sample means (overlaid with a normal distribution curve in red, approximating the distribution of sample means). This density curve is an approximation to the sampling distribution of the sample mean (recall that the sampling distribution is the distribution of the sample statistic across all possible samples of a given size – here, we’re just looking at the results from 1000 possible samples).

Go ahead and run all of the code in the chunk below (you don’t need to modify the code at all).

n <- 10

simulation_results <- do(1000) * {
  babies_sample <- sample_n(babies, size = n)
  
  data.frame(
    sample_mean = mean(babies_sample$gestation)
  )
}

ggplot() +
  geom_density(mapping = aes(x = sample_mean), data = simulation_results) +
  stat_function(mapping = aes(x = sample_mean),
    fun = dnorm,
    colour = "red",
    args = list(mean = mean(simulation_results$sample_mean), sd = sd(simulation_results$sample_mean)),
    data = simulation_results)

mean(simulation_results$sample_mean)
## [1] 38.8
sd(simulation_results$sample_mean)
## [1] 0.821

Comment on the sampling distribution you got using sample sizes of \(n = 10\).

Here are some questions to think about:

  • Shape: Is the sampling distribution still skewed, or would a normal model be appropriate now? Is the sampling distribution “closer to” following a normal distribution than the original distribution of gestation times?
  • Center: Is the sampling distribution centered near the population mean?
  • Spread: How does the standard deviation of the sampling distribution compare to the standard deviation of values in the original population? Recall that the Central Limit Theorem says that the standard deviation of the sample mean should be about equal to the standard deviation of the population divided by the square root of the sample size. Does that seem to be about right in this case?

SOLUTION:

The sampling distribution is a little bit skewed to the left, but not as much as the distribution of values in the population.

The average of the sample means across different samples is close to the population mean of 38.8 weeks.

The standard deviation of the sample means (0.82) is smaller than the standard deviation of values in the population (2.61). The Central Limit Theorem predicts \(\frac{2.61}{\sqrt{10}} \approx 0.83\), which is very close to the standard deviation of the sample means in this simulation.
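
As a quick check of this prediction in R (using the babies data frame loaded above):

# CLT-predicted standard deviation of the sample mean when n = 10
sd(babies$gestation) / sqrt(10)  # about 0.83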

Sampling Distribution of Sample Mean for Other Values of \(n\)

Let’s try increasing the sample size and see what happens. The R chunk below is exactly the same as the one you used above. Try changing just the first line to set n equal to larger values. I recommend starting by increasing n 10 at a time. As you increase n, what happens to the shape of the sampling distribution? Where is it centered? And how does the standard deviation change?

n <- 30

simulation_results <- do(1000) * {
  babies_sample <- sample_n(babies, size = n)
  
  data.frame(
    sample_mean = mean(babies_sample$gestation)
  )
}

ggplot() +
  geom_density(mapping = aes(x = sample_mean), data = simulation_results) +
  stat_function(mapping = aes(x = sample_mean),
    fun = dnorm,
    colour = "red",
    args = list(mean = mean(simulation_results$sample_mean), sd = sd(simulation_results$sample_mean)),
    data = simulation_results)

mean(simulation_results$sample_mean)
## [1] 38.8
sd(simulation_results$sample_mean)
## [1] 0.471

As you increase n, what happens to the shape of the sampling distribution? Where is it centered? And how does the standard deviation change? How big does the sample size have to be before you feel comfortable using a normal distribution to approximate the sampling distribution of the sample mean?

SOLUTION:

With samples of size \(n = 30\), the sampling distribution of the sample mean is more symmetric, and matches a normal distribution fairly well. I thought a sample size of \(n = 20\) didn’t match the normal distribution as well.

The average of the sample means is still about equal to the population mean of 38.8.

The standard deviation of the sample means is about 0.47, which is smaller than both the standard deviation of the population (2.61) and the standard deviation of the sample means when the sample size was 10 (0.82).
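
To see how quickly the standard deviation of the sample mean shrinks as \(n\) grows, here is a small sketch (again using the babies data frame from above) that tabulates the CLT's prediction \(\sigma/\sqrt{n}\) for several sample sizes:

# CLT-predicted standard deviation of the sample mean for several sample sizes
n_values <- c(10, 20, 30, 50, 100)
data.frame(n = n_values, predicted_sd = sd(babies$gestation) / sqrt(n_values))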

CEO Salaries

The following data set contains a record of the pay for the top 500 CEOs of American corporations as of 2012, according to Forbes. Let’s look at the pay variable, which records each CEO’s one-year pay in millions of dollars (note that because of how some CEOs are compensated, the pay for some CEOs was listed as $0 – I’ve filtered those records out of the data set here).

ceo_salaries <- read_csv("https://mhc-stat140-2017.github.io/data/sdm4/CEO_Salary_2012.csv")
## Parsed with column specification:
## cols(
##   Rank = col_integer(),
##   Name = col_character(),
##   Company = col_character(),
##   `1-Year Pay ($mil)` = col_double(),
##   `5 Year Pay ($mil)` = col_double(),
##   `Shares Owned ($mil)` = col_double(),
##   Age = col_integer(),
##   Efficiency = col_integer(),
##   `Log Pay` = col_double()
## )
ceo_salaries <- ceo_salaries %>%
  mutate(pay = `1-Year Pay ($mil)`) %>%
  filter(pay > 0)

Descriptive Analysis

Make a histogram or density plot of the pay for each CEO (pay), and calculate the mean and median of the CEO pay. You should see that the distribution of CEO pay is skewed to the right, quite severely, and would not be well represented by a normal model. Is it appropriate to summarize the center of this distribution using the sample mean?

SOLUTION:

ggplot() +
  geom_histogram(mapping = aes(x = pay), bins = 30, data = ceo_salaries)

mean(ceo_salaries$pay)
## [1] 10.5
median(ceo_salaries$pay)
## [1] 6.97

Sampling Distribution of Sample Mean

Let’s repeat the same process you did with baby gestation times above, with the CEO salaries. Try various sample sizes in the first line of code and see what happens to the sampling distribution for the sample mean as the sample size increases. As you increase n, note what happens to the shape of the sampling distribution, where it is centered, and how the standard deviation changes. How large does the sample size have to be for you to be comfortable using a normal approximation to the sampling distribution?

n <- 70

simulation_results <- do(1000) * {
  ceo_salaries_sample <- sample_n(ceo_salaries, size = n)
  
  data.frame(
    sample_mean = mean(ceo_salaries_sample$pay)
  )
}

ggplot() +
  geom_density(mapping = aes(x = sample_mean), data = simulation_results) +
  stat_function(mapping = aes(x = sample_mean),
    fun = dnorm,
    colour = "red",
    args = list(mean = mean(simulation_results$sample_mean), sd = sd(simulation_results$sample_mean)),
    data = simulation_results)

mean(simulation_results$sample_mean)
## [1] 10.6
sd(simulation_results$sample_mean)
## [1] 1.24

As you increase n, note what happens to the shape of the sampling distribution, where it is centered, and how the standard deviation changes. How large does the sample size have to be for you to be comfortable using a normal approximation to the sampling distribution?

SOLUTION:

As before, the distribution of sample means is centered at the population mean, has a standard deviation that decreases with the sample size, and becomes closer to a normal distribution as the sample size increases. A normal model approximation to the sampling distribution of the sample mean seems reasonable with sample sizes around 70 or so.
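
As a rough sanity check (using the ceo_salaries data frame from above), we can compare the CLT’s prediction at \(n = 70\) to the simulated standard deviation of about 1.24; because we are sampling without replacement from a population of only about 500 CEOs, the simulated value comes out a bit smaller than this prediction:

# CLT-predicted standard deviation of the sample mean for CEO pay with n = 70
sd(ceo_salaries$pay) / sqrt(70)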

Proportions of Categorical Variables (with 2 levels)

We’ve said that for a categorical variable with two levels (“success” and “failure”), the sampling distribution of the proportion of successes in the sample is approximately Normal(\(p\), \(\sqrt{p(1 - p)/n}\)) if n is large enough. Our book says that n is “large enough” if the success/failure condition is satisfied: \(np \geq 10\) and \(n(1 - p) \geq 10\). (If you’re looking at the R Markdown version of this lab, the symbol “geq” stands for greater than or equal to.) That is, the expected number of successes in our sample has to be at least 10 and the expected number of failures has to be at least 10. Let’s explore that here, to see if you agree with the book’s cutoffs.
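
As a concrete version of this check, here is a small sketch with a hypothetical helper (success_failure_ok is not part of the lab code) that tests the book’s condition for a few combinations of n and p:

# Hypothetical helper: does the success/failure condition hold for this n and p?
success_failure_ok <- function(n, p) {
  (n * p >= 10) & (n * (1 - p) >= 10)
}

success_failure_ok(n = 20, p = 0.5)     # TRUE:  np = 10 and n(1 - p) = 10
success_failure_ok(n = 100, p = 0.05)   # FALSE: np = 5 is below 10
success_failure_ok(n = 1000, p = 0.01)  # TRUE:  np = 10 and n(1 - p) = 990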

Run the R chunk below. You should see a histogram-style plot showing the probabilities for a binomial distribution, overlaid with a red normal distribution curve. You should also see sliders that you can use to adjust the values of n and p in the binomial distribution (if you don’t, try clicking the gear icon in the top left of the plot). Try out different values of n and p, and see which combinations lead to a reasonable normal approximation to the binomial distribution.

Don’t worry about the code to make this plot, it’s a little involved…

library(manipulate)

manipulate(
  {
    phat_sampling_dist <- data.frame(
      phat = seq(from = 0, to = n) / n,
      probability = dbinom(x = seq(from = 0, to = n), size = n, prob = p))

    density_df <- data.frame(
      x = seq(from = -0.2, to = 1.2, by = 0.1)
    )
    
    ggplot() +
      geom_bar(mapping = aes(x = phat, y = probability),
        stat = "identity",
        data = phat_sampling_dist) +
      stat_function(mapping = aes(x = x),
        fun = function(x) {
          max(phat_sampling_dist$probability) *
            dnorm(x, mean = p, sd = sqrt(p * (1 - p) / n)) / 
            dnorm(p, mean = p, sd = sqrt(p * (1 - p) / n))
        },
        colour = "red",
        data = density_df)
  },
  n = slider(10, 1000, step = 10, initial = 10, label = "n"),
  p = slider(0, 1, step = 0.01, initial = 0.5, label = "p")
)

Does the success/failure condition seem reasonable to you as a check for whether the sampling distribution of \(\hat{p}\) is approximately normal?

SOLUTION:

This rule seems to work pretty well. Here are a couple of example plots where the rule’s conditions are satisfied and a normal distribution seems appropriate. In the last one, I’ve zoomed in to the relevant part of the x axis to see what’s going on.

p <- 0.5
n <- 20

phat_sampling_dist <- data.frame(
  phat = seq(from = 0, to = n) / n,
  probability = dbinom(x = seq(from = 0, to = n), size = n, prob = p)
)

density_df <- data.frame(
  x = seq(from = -0.2, to = 1.2, by = 0.1)
)

ggplot() +
  geom_bar(mapping = aes(x = phat, y = probability),
    stat = "identity",
    data = phat_sampling_dist) +
  stat_function(mapping = aes(x = x),
    fun = function(x) {
      max(phat_sampling_dist$probability) *
        dnorm(x, mean = p, sd = sqrt(p * (1 - p) / n)) / 
        dnorm(p, mean = p, sd = sqrt(p * (1 - p) / n))
    },
    colour = "red",
    data = density_df)

p <- 0.1
n <- 100

phat_sampling_dist <- data.frame(
  phat = seq(from = 0, to = n) / n,
  probability = dbinom(x = seq(from = 0, to = n), size = n, prob = p)
)

density_df <- data.frame(
  x = seq(from = -0.2, to = 1.2, by = 0.1)
)

ggplot() +
  geom_bar(mapping = aes(x = phat, y = probability),
    stat = "identity",
    data = phat_sampling_dist) +
  stat_function(mapping = aes(x = x),
    fun = function(x) {
      max(phat_sampling_dist$probability) *
        dnorm(x, mean = p, sd = sqrt(p * (1 - p) / n)) / 
        dnorm(p, mean = p, sd = sqrt(p * (1 - p) / n))
    },
    colour = "red",
    data = density_df)

p <- 0.01
n <- 1000

phat_sampling_dist <- data.frame(
  phat = seq(from = 0, to = n) / n,
  probability = dbinom(x = seq(from = 0, to = n), size = n, prob = p)
)

density_df <- data.frame(
  x = seq(from = -0.2, to = 1.2, by = 0.1)
)

ggplot() +
  geom_bar(mapping = aes(x = phat, y = probability),
    stat = "identity",
    data = phat_sampling_dist) +
  stat_function(mapping = aes(x = x),
    fun = function(x) {
      max(phat_sampling_dist$probability) *
        dnorm(x, mean = p, sd = sqrt(p * (1 - p) / n)) / 
        dnorm(p, mean = p, sd = sqrt(p * (1 - p) / n))
    },
    colour = "red",
    data = density_df) +
  xlim(c(0, 0.1))
## Warning: Removed 900 rows containing missing values (position_stack).