ggplot stuff
(I actually didn’t do this, but follow along in Ben’s repo: https://bbest.github.io/ds-repo/)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## read the gapminder table straight from the OHI-Science training repository
gapminder <- readr::read_csv(
  file = "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv"
)
## Parsed with column specification:
## cols(
## country = col_character(),
## year = col_integer(),
## pop = col_double(),
## continent = col_character(),
## lifeExp = col_double(),
## gdpPercap = col_double()
## )
gapminder
## # A tibble: 1,704 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414
## # ... with 1,694 more rows
## useful functions to get to know your data:
head(gapminder) # show the first 6 rows of the data
## # A tibble: 6 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
tail(gapminder) # show the last 6 rows of data
## # A tibble: 6 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Zimbabwe 1982 7636524 Africa 60.363 788.8550
## 2 Zimbabwe 1987 9216418 Africa 62.351 706.1573
## 3 Zimbabwe 1992 10704340 Africa 60.377 693.4208
## 4 Zimbabwe 1997 11404948 Africa 46.809 792.4500
## 5 Zimbabwe 2002 11926563 Africa 39.989 672.0386
## 6 Zimbabwe 2007 12311143 Africa 43.487 469.7093
head(gapminder, 10) # say how many!
## # A tibble: 10 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414
tail(gapminder, 9)
## # A tibble: 9 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Zimbabwe 1967 4995432 Africa 53.995 569.7951
## 2 Zimbabwe 1972 5861135 Africa 55.635 799.3622
## 3 Zimbabwe 1977 6642107 Africa 57.674 685.5877
## 4 Zimbabwe 1982 7636524 Africa 60.363 788.8550
## 5 Zimbabwe 1987 9216418 Africa 62.351 706.1573
## 6 Zimbabwe 1992 10704340 Africa 60.377 693.4208
## 7 Zimbabwe 1997 11404948 Africa 46.809 792.4500
## 8 Zimbabwe 2002 11926563 Africa 39.989 672.0386
## 9 Zimbabwe 2007 12311143 Africa 43.487 469.7093
## check out the structure of your data
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 6
## .. ..$ country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ pop : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ continent: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ lifeExp : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ gdpPercap: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
## column names
names(gapminder)
## [1] "country" "year" "pop" "continent" "lifeExp" "gdpPercap"
## dimensions of the data
dim(gapminder)
## [1] 1704 6
ncol(gapminder)
## [1] 6
nrow(gapminder)
## [1] 1704
## combine with c()
c(nrow(gapminder), ncol(gapminder))
## [1] 1704 6
## summary statistics
summary(gapminder)
## country year pop continent
## Length:1704 Min. :1952 Min. :6.001e+04 Length:1704
## Class :character 1st Qu.:1966 1st Qu.:2.794e+06 Class :character
## Mode :character Median :1980 Median :7.024e+06 Mode :character
## Mean :1980 Mean :2.960e+07
## 3rd Qu.:1993 3rd Qu.:1.959e+07
## Max. :2007 Max. :1.319e+09
## lifeExp gdpPercap
## Min. :23.60 Min. : 241.2
## 1st Qu.:48.20 1st Qu.: 1202.1
## Median :60.71 Median : 3531.8
## Mean :59.47 Mean : 7215.3
## 3rd Qu.:70.85 3rd Qu.: 9325.5
## Max. :82.60 Max. :113523.1
## for everything above, we were operating on the whole gapminder dataset.
head(gapminder$lifeExp)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438
`read_csv` is from the `readr` package (part of the tidyverse). It is not `read.csv`, which is part of base R.
filter() by rows
filter(gapminder, lifeExp < 29)
## # A tibble: 2 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Rwanda 1992 7290203 Africa 23.599 737.0686
filter(gapminder, country == "Mexico")
## # A tibble: 12 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Mexico 1952 30144317 Americas 50.789 3478.126
## 2 Mexico 1957 35015548 Americas 55.190 4131.547
## 3 Mexico 1962 41121485 Americas 58.299 4581.609
## 4 Mexico 1967 47995559 Americas 60.110 5754.734
## 5 Mexico 1972 55984294 Americas 62.361 6809.407
## 6 Mexico 1977 63759976 Americas 65.032 7674.929
## 7 Mexico 1982 71640904 Americas 67.405 9611.148
## 8 Mexico 1987 80122492 Americas 69.498 8688.156
## 9 Mexico 1992 88111030 Americas 71.455 9472.384
## 10 Mexico 1997 95895146 Americas 73.670 9767.298
## 11 Mexico 2002 102479927 Americas 74.902 10742.441
## 12 Mexico 2007 108700891 Americas 76.195 11977.575
## save Sweden's rows as their own object, then average its life expectancy
sweden <- dplyr::filter(gapminder, country == "Sweden")
mean(sweden[["lifeExp"]])
## [1] 76.177
## same calculation with the filter nested inside mean(), no saved object
mean(dplyr::filter(gapminder, country == "Sweden")[["lifeExp"]])
## [1] 76.177
## pipe operator `%>%`
## same output!
gapminder %>% head(3)
## # A tibble: 3 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
head(gapminder, 3)
## # A tibble: 3 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## the Sweden example two ways: a saved intermediate object vs. one piped chain
sweden <- gapminder %>% filter(country == "Sweden")
x <- mean(sweden[["lifeExp"]])
## piped version, getting ahead of ourselves by using summarize() already
y <- gapminder %>%
  filter(country == "Sweden") %>%
  summarise(mean_lifeExp = mean(lifeExp))
select() by columns
## select 2 columns
select(gapminder, year, lifeExp)
## # A tibble: 1,704 x 2
## year lifeExp
## <int> <dbl>
## 1 1952 28.801
## 2 1957 30.332
## 3 1962 31.997
## 4 1967 34.020
## 5 1972 36.088
## 6 1977 38.438
## 7 1982 39.854
## 8 1987 40.822
## 9 1992 41.674
## 10 1997 41.763
## # ... with 1,694 more rows
## same selection with the pipe, then show only the last six rows
gapminder %>%
  select(year, lifeExp) %>%
  tail(n = 6)
## # A tibble: 6 x 2
## year lifeExp
## <int> <dbl>
## 1 1982 60.363
## 2 1987 62.351
## 3 1992 60.377
## 4 1997 46.809
## 5 2002 39.989
## 6 2007 43.487
## chain both verbs: keep Cambodia's rows, then keep four named columns
gapminder %>%
  filter(country == "Cambodia") %>%
  select(
    country,
    year,
    pop,
    gdpPercap
  )
## # A tibble: 12 x 4
## country year pop gdpPercap
## <chr> <int> <dbl> <dbl>
## 1 Cambodia 1952 4693836 368.4693
## 2 Cambodia 1957 5322536 434.0383
## 3 Cambodia 1962 6083619 496.9136
## 4 Cambodia 1967 6960067 523.4323
## 5 Cambodia 1972 7450606 421.6240
## 6 Cambodia 1977 6978607 524.9722
## 7 Cambodia 1982 7272485 624.4755
## 8 Cambodia 1987 8371791 683.8956
## 9 Cambodia 1992 10150094 682.3032
## 10 Cambodia 1997 11782962 734.2852
## 11 Cambodia 2002 12926707 896.2260
## 12 Cambodia 2007 14131858 1713.7787
## same result as above, written by dropping the two unwanted columns instead
gapminder %>%
  filter(country == "Cambodia") %>%
  select(-c(continent, lifeExp))
## # A tibble: 12 x 4
## country year pop gdpPercap
## <chr> <int> <dbl> <dbl>
## 1 Cambodia 1952 4693836 368.4693
## 2 Cambodia 1957 5322536 434.0383
## 3 Cambodia 1962 6083619 496.9136
## 4 Cambodia 1967 6960067 523.4323
## 5 Cambodia 1972 7450606 421.6240
## 6 Cambodia 1977 6978607 524.9722
## 7 Cambodia 1982 7272485 624.4755
## 8 Cambodia 1987 8371791 683.8956
## 9 Cambodia 1992 10150094 682.3032
## 10 Cambodia 1997 11782962 734.2852
## 11 Cambodia 2002 12926707 896.2260
## 12 Cambodia 2007 14131858 1713.7787
## a few more things with `filter()`: several conditions must all hold at once
gapminder %>%
  filter(country == "Mexico" & year == 2002)
## # A tibble: 1 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Mexico 2002 102479927 Americas 74.902 10742.44
## `mutate()` appends a column; here total GDP = GDP per capita x population
gapminder %>%
  mutate(gdp = gdpPercap * pop)
## # A tibble: 1,704 x 7
## country year pop continent lifeExp gdpPercap gdp
## <chr> <int> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453 6567086330
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530 7585448670
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007 8758855797
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971 9648014150
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811 9678553274
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134 11697659231
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114 12598563401
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959 11820990309
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414 10595901589
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414 14121995875
## # ... with 1,694 more rows
## let's add an index
## row_number() numbers the rows of the data flowing through the pipe, so the
## index cannot fall out of step with it; 1:nrow(gapminder) instead reads the
## global object and would count c(1, 0) on a zero-row table
test <- gapminder %>%
  mutate(
    gdp = pop * gdpPercap,
    index = row_number()
  ) %>%
  tail()
## not the right way: `==` recycles c("Egypt", "Vietnam") down the rows, so each
## row is compared with only ONE of the two names. No error is raised, but only
## every other year of each country comes back (12 rows instead of 24).
gapminder %>%
filter(country == c("Egypt", "Vietnam") )
## # A tibble: 12 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Egypt 1952 22223309 Africa 41.893 1418.8224
## 2 Egypt 1962 28173309 Africa 46.992 1693.3359
## 3 Egypt 1972 34807417 Africa 51.137 2024.0081
## 4 Egypt 1982 45681811 Africa 56.006 3503.7296
## 5 Egypt 1992 59402198 Africa 63.674 3794.7552
## 6 Egypt 2002 73312559 Africa 69.806 4754.6044
## 7 Vietnam 1957 28998543 Asia 42.887 676.2854
## 8 Vietnam 1967 39463910 Asia 47.838 637.1233
## 9 Vietnam 1977 50533506 Asia 55.764 713.5371
## 10 Vietnam 1987 62826491 Asia 62.820 820.7994
## 11 Vietnam 1997 76048996 Asia 70.672 1385.8968
## 12 Vietnam 2007 85262356 Asia 74.249 2441.5764
## `%in%` tests each country against every value inside the `c()`,
## so all rows for both countries are kept
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam"))
## # A tibble: 24 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Egypt 1952 22223309 Africa 41.893 1418.822
## 2 Egypt 1957 25009741 Africa 44.444 1458.915
## 3 Egypt 1962 28173309 Africa 46.992 1693.336
## 4 Egypt 1967 31681188 Africa 49.293 1814.881
## 5 Egypt 1972 34807417 Africa 51.137 2024.008
## 6 Egypt 1977 38783863 Africa 53.319 2785.494
## 7 Egypt 1982 45681811 Africa 56.006 3503.730
## 8 Egypt 1987 52799062 Africa 59.797 3885.461
## 9 Egypt 1992 59402198 Africa 63.674 3794.755
## 10 Egypt 1997 66134291 Africa 67.217 4173.182
## # ... with 14 more rows
## put the maximum gdpPercap of Egypt and Vietnam in a new column; without
## grouping, this is one maximum over both countries, repeated on every row
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  mutate(max_gdpPercap = max(gdpPercap))
## # A tibble: 24 x 7
## country year pop continent lifeExp gdpPercap max_gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Egypt 1952 22223309 Africa 41.893 1418.822 5581.181
## 2 Egypt 1957 25009741 Africa 44.444 1458.915 5581.181
## 3 Egypt 1962 28173309 Africa 46.992 1693.336 5581.181
## 4 Egypt 1967 31681188 Africa 49.293 1814.881 5581.181
## 5 Egypt 1972 34807417 Africa 51.137 2024.008 5581.181
## 6 Egypt 1977 38783863 Africa 53.319 2785.494 5581.181
## 7 Egypt 1982 45681811 Africa 56.006 3503.730 5581.181
## 8 Egypt 1987 52799062 Africa 59.797 3885.461 5581.181
## 9 Egypt 1992 59402198 Africa 63.674 3794.755 5581.181
## 10 Egypt 1997 66134291 Africa 67.217 4173.182 5581.181
## # ... with 14 more rows
## group_by() first, so that max() is computed once per country (2 maxes)
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  group_by(country) %>%
  ## alternative tried first: mutate(max_gdpPercap = max(gdpPercap))
  summarise(max_gdpPercap = max(gdpPercap))
## # A tibble: 2 x 2
## country max_gdpPercap
## <chr> <dbl>
## 1 Egypt 5581.181
## 2 Vietnam 2441.576
group_by() and summarize()
## highest gdpPercap on record for every country, one row per country
gapminder %>%
  group_by(country) %>%
  summarise(max_gdpPercap = max(gdpPercap))
## let's keep the year associated with that max_gdpPercap
gapminder %>%
  group_by(country) %>%
  ## mutate() (not summarize()) keeps every row and column, including year
  mutate(max_gdpPercap = max(gdpPercap)) %>%
  ## keep only the row where each country reached its own maximum
  filter(max_gdpPercap == gdpPercap) %>%
  ## drop the grouping so anything chained on later is not computed per country
  ungroup() %>%
  arrange(max_gdpPercap) # if you wanted descending order, arrange(desc(max_gdpPercap))