ggplot stuff

(I didn’t actually work through this section myself; to follow along, see Ben’s repo: https://bbest.github.io/ds-repo/)

Exploring gapminder data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.3.4          ✔ dplyr   0.7.4     
## ✔ tidyr   0.7.2          ✔ stringr 1.2.0     
## ✔ readr   1.1.1          ✔ forcats 0.2.0
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
gapminder <- readr::read_csv("https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   year = col_integer(),
##   pop = col_double(),
##   continent = col_character(),
##   lifeExp = col_double(),
##   gdpPercap = col_double()
## )
gapminder
## # A tibble: 1,704 x 6
##        country  year      pop continent lifeExp gdpPercap
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414
## # ... with 1,694 more rows
## useful functions to get to know your data:
head(gapminder) # show the first 6 rows of the data
## # A tibble: 6 x 6
##       country  year      pop continent lifeExp gdpPercap
##         <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan  1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan  1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan  1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan  1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan  1977 14880372      Asia  38.438  786.1134
tail(gapminder) # show the last 6 rows of data
## # A tibble: 6 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Zimbabwe  1982  7636524    Africa  60.363  788.8550
## 2 Zimbabwe  1987  9216418    Africa  62.351  706.1573
## 3 Zimbabwe  1992 10704340    Africa  60.377  693.4208
## 4 Zimbabwe  1997 11404948    Africa  46.809  792.4500
## 5 Zimbabwe  2002 11926563    Africa  39.989  672.0386
## 6 Zimbabwe  2007 12311143    Africa  43.487  469.7093
head(gapminder, 10) # say how many!
## # A tibble: 10 x 6
##        country  year      pop continent lifeExp gdpPercap
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414
tail(gapminder, 9)
## # A tibble: 9 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Zimbabwe  1967  4995432    Africa  53.995  569.7951
## 2 Zimbabwe  1972  5861135    Africa  55.635  799.3622
## 3 Zimbabwe  1977  6642107    Africa  57.674  685.5877
## 4 Zimbabwe  1982  7636524    Africa  60.363  788.8550
## 5 Zimbabwe  1987  9216418    Africa  62.351  706.1573
## 6 Zimbabwe  1992 10704340    Africa  60.377  693.4208
## 7 Zimbabwe  1997 11404948    Africa  46.809  792.4500
## 8 Zimbabwe  2002 11926563    Africa  39.989  672.0386
## 9 Zimbabwe  2007 12311143    Africa  43.487  469.7093
## check out the structure of your data
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 6
##   .. ..$ country  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ year     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ pop      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ continent: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ lifeExp  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ gdpPercap: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
## column names 
names(gapminder)
## [1] "country"   "year"      "pop"       "continent" "lifeExp"   "gdpPercap"
## dimensions of the data
dim(gapminder)
## [1] 1704    6
ncol(gapminder)
## [1] 6
nrow(gapminder)
## [1] 1704
## combine with c()
c(nrow(gapminder), ncol(gapminder))
## [1] 1704    6
## summary statistics
summary(gapminder)
##    country               year           pop             continent        
##  Length:1704        Min.   :1952   Min.   :6.001e+04   Length:1704       
##  Class :character   1st Qu.:1966   1st Qu.:2.794e+06   Class :character  
##  Mode  :character   Median :1980   Median :7.024e+06   Mode  :character  
##                     Mean   :1980   Mean   :2.960e+07                     
##                     3rd Qu.:1993   3rd Qu.:1.959e+07                     
##                     Max.   :2007   Max.   :1.319e+09                     
##     lifeExp        gdpPercap       
##  Min.   :23.60   Min.   :   241.2  
##  1st Qu.:48.20   1st Qu.:  1202.1  
##  Median :60.71   Median :  3531.8  
##  Mean   :59.47   Mean   :  7215.3  
##  3rd Qu.:70.85   3rd Qu.:  9325.5  
##  Max.   :82.60   Max.   :113523.1
## everything above operated on the whole gapminder dataset; use `$` to pull out a single column as a vector.

head(gapminder$lifeExp)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438

`read_csv()` comes from the readr package (part of the tidyverse) and returns a tibble. It is not the same function as `read.csv()`, which is part of base R.

Data wrangling with dplyr

filter() by rows

filter(gapminder, lifeExp < 29)
## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##         <chr> <int>   <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801  779.4453
## 2      Rwanda  1992 7290203    Africa  23.599  737.0686
filter(gapminder, country == "Mexico")
## # A tibble: 12 x 6
##    country  year       pop continent lifeExp gdpPercap
##      <chr> <int>     <dbl>     <chr>   <dbl>     <dbl>
##  1  Mexico  1952  30144317  Americas  50.789  3478.126
##  2  Mexico  1957  35015548  Americas  55.190  4131.547
##  3  Mexico  1962  41121485  Americas  58.299  4581.609
##  4  Mexico  1967  47995559  Americas  60.110  5754.734
##  5  Mexico  1972  55984294  Americas  62.361  6809.407
##  6  Mexico  1977  63759976  Americas  65.032  7674.929
##  7  Mexico  1982  71640904  Americas  67.405  9611.148
##  8  Mexico  1987  80122492  Americas  69.498  8688.156
##  9  Mexico  1992  88111030  Americas  71.455  9472.384
## 10  Mexico  1997  95895146  Americas  73.670  9767.298
## 11  Mexico  2002 102479927  Americas  74.902 10742.441
## 12  Mexico  2007 108700891  Americas  76.195 11977.575
sweden <- filter(gapminder, country == "Sweden")
mean(sweden$lifeExp)
## [1] 76.177
mean(filter(gapminder, country == "Sweden")$lifeExp)
## [1] 76.177
## pipe operator `%>%` 

## same output! `x %>% f(y)` is just another way of writing `f(x, y)`
gapminder %>% head(3)
## # A tibble: 3 x 6
##       country  year      pop continent lifeExp gdpPercap
##         <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan  1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan  1962 10267083      Asia  31.997  853.1007
head(gapminder, 3)
## # A tibble: 3 x 6
##       country  year      pop continent lifeExp gdpPercap
##         <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan  1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan  1962 10267083      Asia  31.997  853.1007
## let's use the %>% with our Sweden example
## without the pipe: store the filtered rows, then average one column.
## mean() on a column returns a plain number
sweden <- filter(gapminder, country == "Sweden")
x <- mean(sweden$lifeExp)

## getting ahead of ourselves
## with the pipe: the same calculation in one chain, no intermediate object.
## summarize() returns a tibble, so y holds the mean in a column named
## mean_lifeExp rather than as a bare number like x
y <- gapminder %>% 
  filter(country == "Sweden") %>% 
  summarize(mean_lifeExp = mean(lifeExp))

select() by columns

## select 2 columns
select(gapminder, year, lifeExp)
## # A tibble: 1,704 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  28.801
##  2  1957  30.332
##  3  1962  31.997
##  4  1967  34.020
##  5  1972  36.088
##  6  1977  38.438
##  7  1982  39.854
##  8  1987  40.822
##  9  1992  41.674
## 10  1997  41.763
## # ... with 1,694 more rows
## with the pipe operator
gapminder %>%
  select(year, lifeExp) %>%
  tail()
## # A tibble: 6 x 2
##    year lifeExp
##   <int>   <dbl>
## 1  1982  60.363
## 2  1987  62.351
## 3  1992  60.377
## 4  1997  46.809
## 5  2002  39.989
## 6  2007  43.487
## combine what we've learned with the pipe

gapminder %>%
  filter(country == "Cambodia") %>%
  select(country, year, pop, gdpPercap)
## # A tibble: 12 x 4
##     country  year      pop gdpPercap
##       <chr> <int>    <dbl>     <dbl>
##  1 Cambodia  1952  4693836  368.4693
##  2 Cambodia  1957  5322536  434.0383
##  3 Cambodia  1962  6083619  496.9136
##  4 Cambodia  1967  6960067  523.4323
##  5 Cambodia  1972  7450606  421.6240
##  6 Cambodia  1977  6978607  524.9722
##  7 Cambodia  1982  7272485  624.4755
##  8 Cambodia  1987  8371791  683.8956
##  9 Cambodia  1992 10150094  682.3032
## 10 Cambodia  1997 11782962  734.2852
## 11 Cambodia  2002 12926707  896.2260
## 12 Cambodia  2007 14131858 1713.7787
## same as above, but dropping the unwanted columns with `-` instead of listing the ones to keep
gapminder %>%
  filter(country == "Cambodia") %>%
  select(-continent, -lifeExp)
## # A tibble: 12 x 4
##     country  year      pop gdpPercap
##       <chr> <int>    <dbl>     <dbl>
##  1 Cambodia  1952  4693836  368.4693
##  2 Cambodia  1957  5322536  434.0383
##  3 Cambodia  1962  6083619  496.9136
##  4 Cambodia  1967  6960067  523.4323
##  5 Cambodia  1972  7450606  421.6240
##  6 Cambodia  1977  6978607  524.9722
##  7 Cambodia  1982  7272485  624.4755
##  8 Cambodia  1987  8371791  683.8956
##  9 Cambodia  1992 10150094  682.3032
## 10 Cambodia  1997 11782962  734.2852
## 11 Cambodia  2002 12926707  896.2260
## 12 Cambodia  2007 14131858 1713.7787
## a few more things with `filter()`: conditions separated by commas must all be true
gapminder %>%
  filter(country == "Mexico",
         year == 2002)
## # A tibble: 1 x 6
##   country  year       pop continent lifeExp gdpPercap
##     <chr> <int>     <dbl>     <chr>   <dbl>     <dbl>
## 1  Mexico  2002 102479927  Americas  74.902  10742.44
## `mutate()` to add columns

gapminder %>%
  mutate(gdp = pop * gdpPercap)
## # A tibble: 1,704 x 7
##        country  year      pop continent lifeExp gdpPercap         gdp
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>       <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453  6567086330
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530  7585448670
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007  8758855797
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971  9648014150
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811  9678553274
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134 11697659231
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114 12598563401
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959 11820990309
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414 10595901589
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414 14121995875
## # ... with 1,694 more rows
## let's add an index
## row_number() numbers the rows of the data flowing through the pipe, so it
## stays correct even if rows are filtered out upstream (1:nrow(gapminder)
## would refer back to the original, unfiltered table and fail on a length
## mismatch).
## tail() keeps only the last 6 rows, so `test` holds indices 1699-1704
test <- gapminder %>%
  mutate(gdp = pop * gdpPercap,
         index = row_number()) %>%
  tail()

## not the right way, shouldn't work (only returns half)
## `==` compares element by element and recycles c("Egypt", "Vietnam") down
## the column: odd rows are tested against "Egypt", even rows against
## "Vietnam", so every other year of each country is dropped
gapminder %>%
  filter(country == c("Egypt", "Vietnam") )
## # A tibble: 12 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1   Egypt  1952 22223309    Africa  41.893 1418.8224
##  2   Egypt  1962 28173309    Africa  46.992 1693.3359
##  3   Egypt  1972 34807417    Africa  51.137 2024.0081
##  4   Egypt  1982 45681811    Africa  56.006 3503.7296
##  5   Egypt  1992 59402198    Africa  63.674 3794.7552
##  6   Egypt  2002 73312559    Africa  69.806 4754.6044
##  7 Vietnam  1957 28998543      Asia  42.887  676.2854
##  8 Vietnam  1967 39463910      Asia  47.838  637.1233
##  9 Vietnam  1977 50533506      Asia  55.764  713.5371
## 10 Vietnam  1987 62826491      Asia  62.820  820.7994
## 11 Vietnam  1997 76048996      Asia  70.672 1385.8968
## 12 Vietnam  2007 85262356      Asia  74.249 2441.5764
## `%in%` operator lets you filter multiple things within a `c()`
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam") )
## # A tibble: 24 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1   Egypt  1952 22223309    Africa  41.893  1418.822
##  2   Egypt  1957 25009741    Africa  44.444  1458.915
##  3   Egypt  1962 28173309    Africa  46.992  1693.336
##  4   Egypt  1967 31681188    Africa  49.293  1814.881
##  5   Egypt  1972 34807417    Africa  51.137  2024.008
##  6   Egypt  1977 38783863    Africa  53.319  2785.494
##  7   Egypt  1982 45681811    Africa  56.006  3503.730
##  8   Egypt  1987 52799062    Africa  59.797  3885.461
##  9   Egypt  1992 59402198    Africa  63.674  3794.755
## 10   Egypt  1997 66134291    Africa  67.217  4173.182
## # ... with 14 more rows
## find the maximum gdpPercap of Egypt and Vietnam, in a new column.
## no grouping here, so max() is taken over all 24 filtered rows together:
## every row, for both countries, gets the same overall maximum
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam") ) %>%
  mutate(max_gdpPercap = max(gdpPercap))
## # A tibble: 24 x 7
##    country  year      pop continent lifeExp gdpPercap max_gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>         <dbl>
##  1   Egypt  1952 22223309    Africa  41.893  1418.822      5581.181
##  2   Egypt  1957 25009741    Africa  44.444  1458.915      5581.181
##  3   Egypt  1962 28173309    Africa  46.992  1693.336      5581.181
##  4   Egypt  1967 31681188    Africa  49.293  1814.881      5581.181
##  5   Egypt  1972 34807417    Africa  51.137  2024.008      5581.181
##  6   Egypt  1977 38783863    Africa  53.319  2785.494      5581.181
##  7   Egypt  1982 45681811    Africa  56.006  3503.730      5581.181
##  8   Egypt  1987 52799062    Africa  59.797  3885.461      5581.181
##  9   Egypt  1992 59402198    Africa  63.674  3794.755      5581.181
## 10   Egypt  1997 66134291    Africa  67.217  4173.182      5581.181
## # ... with 14 more rows
## group_by() so that we can get 2 maxes
## summarize() collapses each group to a single row (one max per country);
## using the commented-out mutate() in its place would instead keep all 24
## rows and repeat each country's max alongside them
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam") ) %>%
  group_by(country) %>%
  # mutate(max_gdpPercap = max(gdpPercap))
  summarize(max_gdpPercap = max(gdpPercap))
## # A tibble: 2 x 2
##   country max_gdpPercap
##     <chr>         <dbl>
## 1   Egypt      5581.181
## 2 Vietnam      2441.576

group_by() and summarize()

## maximum gdpPercap for every country in the dataset:
## one row per country, taken across all of its years
gapminder %>%
  group_by(country) %>%
  summarize(max_gdpPercap = max(gdpPercap))

## let's keep the year associated with that max_gdpPercap
## mutate() (not summarize()) keeps every row and all columns, so filter()
## can then pick the row(s) where a country's gdpPercap equals its own
## maximum. ungroup() drops the grouping once it is no longer needed, so it
## doesn't leak into anything done with the result afterwards.
gapminder %>%
  group_by(country) %>%
  mutate(max_gdpPercap = max(gdpPercap)) %>%
  filter(max_gdpPercap == gdpPercap) %>%
  ungroup() %>%
  arrange(max_gdpPercap) # descending: arrange(desc(max_gdpPercap))