September 15, 2017

The HELPrct data

head(HELPrct)
##   age homeless substance dep_score mental_score
## 1  37   housed   cocaine        49    25.111990
## 2  37 homeless   alcohol        30    26.670307
## 3  26   housed    heroin        39     6.762923
## 4  39   housed    heroin        15    43.967880
## 5  32 homeless   cocaine        39    21.675755
## 6  47   housed   cocaine         6    55.508991

The depression score is a number between 0 and 60, with high scores indicating greater depressive symptoms.

The mental component score is a number between 0 and 100, with 0 indicating disability and 100 indicating no disability

mutate: add a new variable

Let's create a combined measure of mental health, equal to (60 - depression score) + mental component score:

HELPrct <- mutate(HELPrct,
  mental_health = (60 - dep_score) + mental_score)
head(HELPrct)
##   age homeless substance dep_score mental_score mental_health
## 1  37   housed   cocaine        49    25.111990      36.11199
## 2  37 homeless   alcohol        30    26.670307      56.67031
## 3  26   housed    heroin        39     6.762923      27.76292
## 4  39   housed    heroin        15    43.967880      88.96788
## 5  32 homeless   cocaine        39    21.675755      42.67575
## 6  47   housed   cocaine         6    55.508991     109.50899

arrange: sort according to a variable

HELPrct_sorted <- arrange(HELPrct, mental_health)
head(HELPrct_sorted)
##   age homeless substance dep_score mental_score mental_health
## 1  47   housed   alcohol        52     7.226597      15.22660
## 2  35   housed    heroin        54     9.406377      15.40638
## 3  41   housed   alcohol        56    11.499865      15.49986
## 4  22   housed    heroin        51     7.035307      16.03531
## 5  40 homeless   cocaine        60    16.786348      16.78635
## 6  32   housed   cocaine        51     7.938221      16.93822
HELPrct_sorted <- arrange(HELPrct, desc(mental_health))
head(HELPrct_sorted)
##   age homeless substance dep_score mental_score mental_health
## 1  49 homeless   cocaine         1     59.45393      118.4539
## 2  36 homeless   cocaine         3     59.93001      116.9300
## 3  27   housed   cocaine         3     57.83459      114.8346
## 4  34   housed   cocaine         5     59.45409      114.4541
## 5  43 homeless    heroin         7     60.54208      113.5421
## 6  32   housed   alcohol         4     57.26089      113.2609

Statistics for numeric variables

mean(HELPrct$mental_health)
## [1] 58.829
sd(HELPrct$mental_health)
## [1] 23.2508
min(HELPrct$mental_health)
## [1] 15.2266
max(HELPrct$mental_health)
## [1] 118.4539

Statistics for numeric variables, cont.

median(HELPrct$mental_health)
## [1] 55.99989
quantile(HELPrct$mental_health, p = 0.5)
##      50% 
## 55.99989
quantile(HELPrct$mental_health, p = c(0.25, 0.75))
##      25%      75% 
## 41.44986 75.00374

Statistics for numeric variables, cont.

quantile(HELPrct$mental_health, p = 0.75) -
  quantile(HELPrct$mental_health, p = 0.25)
##      75% 
## 33.55388
IQR(HELPrct$mental_health)
## [1] 33.55388

summarize: statistics in a data frame

quantile(HELPrct$mental_health, p = 0.25)
##      25% 
## 41.44986
median(HELPrct$mental_health)
## [1] 55.99989
HELPrct_summary <- summarize(HELPrct,
  Q1_mh = quantile(mental_health, p = 0.25),
  median_mh = median(mental_health),
  Q3_mh = quantile(mental_health, p = 0.75))
HELPrct_summary
##      Q1_mh median_mh    Q3_mh
## 1 41.44986  55.99989 75.00374

Tangent: Putting the ideas together

ggplot() +
  geom_histogram(mapping = aes(x = mental_health, y = ..density..),
    data = HELPrct) +
  geom_vline(mapping = aes(xintercept = Q1_mh), data = HELPrct_summary) +
  geom_vline(mapping = aes(xintercept = median_mh), data = HELPrct_summary) +
  geom_vline(mapping = aes(xintercept = Q3_mh), data = HELPrct_summary)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

group_by: set up a grouping

Nothing visible happens – it's all behind the scenes!

HELPrct_grouped <- group_by(HELPrct, substance)
head(HELPrct_grouped)
## # A tibble: 6 x 6
## # Groups:   substance [3]
##     age homeless substance dep_score mental_score mental_health
##   <int>   <fctr>    <fctr>     <int>        <dbl>         <dbl>
## 1    37   housed   cocaine        49    25.111990      36.11199
## 2    37 homeless   alcohol        30    26.670307      56.67031
## 3    26   housed    heroin        39     6.762923      27.76292
## 4    39   housed    heroin        15    43.967880      88.96788
## 5    32 homeless   cocaine        39    21.675755      42.67575
## 6    47   housed   cocaine         6    55.508991     109.50899

group_by continued…

…but now summaries are calculated separately for each group:

HELPrct_grouped_summary <- summarize(HELPrct_grouped,
  mean_mh = mean(mental_health),
  sd_mh = sd(mental_health),
  iqr_mh = IQR(mental_health))
HELPrct_grouped_summary
## # A tibble: 3 x 4
##   substance  mean_mh    sd_mh   iqr_mh
##      <fctr>    <dbl>    <dbl>    <dbl>
## 1   alcohol 57.48247 22.59207 33.46739
## 2   cocaine 64.72928 24.62013 33.09359
## 3    heroin 53.51844 20.93001 27.19790

%>% – the pipe operator

Combine multiple steps that happen one after the other.

HELPrct_grouped_summary2 <- HELPrct %>%
  group_by(substance) %>%
  summarize(
    mean_mh = mean(mental_health),
    sd_mh = sd(mental_health))

HELPrct_grouped_summary2
## # A tibble: 3 x 3
##   substance  mean_mh    sd_mh
##      <fctr>    <dbl>    <dbl>
## 1   alcohol 57.48247 22.59207
## 2   cocaine 64.72928 24.62013
## 3    heroin 53.51844 20.93001

Read as "… and then …"

%>% – how does it work?

The result of the calculation on the left hand side of %>% is used as the first argument to the function on the right hand side.

HELPrct %>%
  group_by(substance)

is the same as

group_by(HELPrct, substance)

Back to statistics…

  • If the distribution is unimodal and symmetric
    • Measure center with the mean: \(\bar{y} = \frac{1}{n} \sum_{i = 1}^n y_i\)
    • Measure spread with the standard deviation – think "average distance from the mean" (though this is not formally quite right): \(s = \sqrt{\frac{\sum_{i = 1}^n (y_i - \bar{y})^2}{n - 1}}\)
  • If the distribution has multiple modes, or is skewed, or has outliers:
    • Measure center with the median – half of the data are less than the median and half of the data are greater than the median.
    • Measure spread with the Interquartile Range (IQR) – the width of the interval that contains the middle half of the data.

Why? Recall Bechdel data

movie_stats <- summarize(movies, mean_intgross = mean(intgross_2013), median_intgross = median(intgross_2013))
ggplot() +
  geom_histogram(aes(x = intgross_2013), data = movies) + 
  geom_vline(aes(xintercept = mean_intgross), color = "orange", data = movie_stats) +
  geom_vline(aes(xintercept = median_intgross), color = "blue", data = movie_stats)

Outlier effects on summary statistics

summary_all_obs <- movies %>%
  summarize(mean_intgross = mean(intgross_2013), median_intgross = median(intgross_2013), sd_mpg = sd(intgross_2013), iqr_intgross = IQR(intgross_2013))
summary_all_obs
## # A tibble: 1 x 4
##   mean_intgross median_intgross    sd_mpg iqr_intgross
##           <dbl>           <dbl>     <dbl>        <dbl>
## 1     197837985        96239640 283507948    208246367
summary_drop_outliers <- movies %>%
  filter(intgross_2013 < 3 * 10^9) %>%
  summarize(mean_intgross = mean(intgross_2013), median_intgross = median(intgross_2013), sd_intgross = sd(intgross_2013), iqr_intgross = IQR(intgross_2013))
summary_drop_outliers
## # A tibble: 1 x 4
##   mean_intgross median_intgross sd_intgross iqr_intgross
##           <dbl>           <dbl>       <dbl>        <dbl>
## 1     192968630        96010975   257693116    207732880