Introduction

From the article on Mistakes, we’ve drawn a few -

“At The Economist, we take data visualisation seriously. Every week we publish around 40 charts across print, the website and our apps. With every single one, we try our best to visualise the numbers accurately and in a way that best supports the story. But sometimes we get it wrong. We can do better in future if we learn from our mistakes — and other people may be able to learn from them, too.”

Here I will try and draw the improved plots as suggested by the article on economist or make a version I think is best. All this done towards the weekly social data project Tidy Tuesday.

Analysis

Load libraries

rm(list = ls())
library(tidyverse)
library(lubridate)
library(ggplot2)
library(gridExtra)
library(scales)
theme_set(theme_light())

Analyzing Britain’s Political Left

corbyn <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/corbyn.csv")

corbyn
## # A tibble: 6 x 2
##   political_group avg_facebook_likes
##   <chr>                        <dbl>
## 1 Jeremy Corbyn                 5210
## 2 Labour Party                   845
## 3 Momentum                       229
## 4 Owen Smith                     127
## 5 Andy Burnham                   105
## 6 Saving Labour                   56
corbyn %>% 
  mutate(pct_likes = avg_facebook_likes/sum(avg_facebook_likes)) %>% 
  ggplot(aes(political_group, pct_likes, fill = "red")) + 
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = percent_format()) + 
  coord_flip() + 
  labs(y = "% of likes over the political groups",
       x = "Political Group",
       title = "Percentage of Average Facebook likes for different political groups",
       caption = "Based on data from The Economist about political left in Britain")

Analyzing decline of dog weights

dogs <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/dogs.csv")

dogs
## # A tibble: 10 x 3
##     year avg_weight avg_neck
##    <dbl>      <dbl>    <dbl>
##  1  2006       20.5     44.3
##  2  2007       20.0     43.8
##  3  2008       19.4     43.4
##  4  2009       19.2     43.2
##  5  2010       19.1     43.2
##  6  2011       19.0     43.1
##  7  2012       18.6     42.8
##  8  2013       18.5     42.8
##  9  2014       18.4     42.7
## 10  2015       18.1     42.5
dogs %>% 
  mutate(year = as.factor(year),
         weight_to_neck = avg_weight/avg_neck) %>% 
  ggplot(aes(x = year, y = weight_to_neck)) + 
  geom_line(aes(group = 1)) +
  geom_point() +
  labs(x = "Year",
       y = "Average Weight to Average Neck Ratio",
       title = "Average Weight to Neck Ratio over Years",
       caption = "Based on data from The Economist")

Analyzing Brexit data

brexit <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/brexit.csv")

brexit
## # A tibble: 85 x 3
##    date     percent_responding_right percent_responding_wrong
##    <chr>                       <dbl>                    <dbl>
##  1 02/08/16                       46                       42
##  2 09/08/16                       45                       44
##  3 17/08/16                       46                       43
##  4 23/08/16                       45                       43
##  5 31/08/16                       47                       44
##  6 14/09/16                       46                       43
##  7 12/10/16                       45                       44
##  8 20/10/16                       45                       44
##  9 15/11/16                       46                       43
## 10 29/11/16                       44                       45
## # … with 75 more rows
brexit %>% 
  mutate(date = dmy(date)) %>% 
  ggplot(aes(x = date)) + 
  geom_smooth(aes(y = percent_responding_right, colour = "percent_responding_right"), se = FALSE) + 
  geom_point(aes(y = percent_responding_right, colour = "percent_responding_right")) +
  geom_smooth(aes(y = percent_responding_wrong, colour = "percent_responding_wrong"), se = FALSE) +
  geom_point(aes(y = percent_responding_wrong, colour = "percent_responding_wrong")) + 
  scale_color_manual(labels = c("Right", "Wrong"), values = c("blue", "red")) + 
  labs(x = "Date",
       y = "Response Percentage",
       title = "Response behaviour of people about Brexit over time on the question",
       subtitle = "In hindsight, do you think Britain was right or wrong to vote to leave the EU?",
       caption = "Based on data from The Economist",
       color = "Response")

Analyzing US trade deficit

trade <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/trade.csv")

trade
## # A tibble: 22 x 3
##     year trade_deficit manufacture_employment
##    <dbl>         <dbl>                  <dbl>
##  1  1995      -3.38e10              17244583.
##  2  1996      -3.95e10              17236750 
##  3  1997      -4.97e10              17417833.
##  4  1998      -5.69e10              17560000 
##  5  1999      -6.87e10              17322667.
##  6  2000      -8.38e10              17265250 
##  7  2001      -8.31e10              16440583.
##  8  2002      -1.03e11              15256833.
##  9  2003      -1.24e11              14508500 
## 10  2004      -1.62e11              14314750 
## # … with 12 more rows
trade %>% 
  mutate(year = as.factor(year),
         trade_deficit = trade_deficit/1000000000,
         manufacture_employment = manufacture_employment/1000000) -> trade

trade %>% 
  ggplot(aes(x = year, y = trade_deficit, fill = "trade_deficit")) + 
  geom_col(show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "Year",
       y = "Trade Deficit in billions",
       title = "US Trade Deficit over years",
       caption = "Based on data from The Economist") -> p1

trade %>% 
  ggplot(aes(x = year, y = manufacture_employment, color = "manufacture_employment")) + 
  geom_line(aes(group = 1), show.legend = FALSE) + 
  geom_point(show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "Year",
       y = "Manufacturing Employment in millions",
       title = "Manufacturing Employment over years",
       caption = "Based on data from The Economist") -> p2

grid.arrange(p1, p2, nrow = 1)

Analyzing pension benefits

pensions <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/pensions.csv")

pensions
## # A tibble: 35 x 3
##    country        pop_65_percent gov_spend_percent_gdp
##    <chr>                   <dbl>                 <dbl>
##  1 Australia               15.0                   5.2 
##  2 Austria                 18.8                  13.9 
##  3 Belgium                 18.2                  10.4 
##  4 Brazil                   7.84                 12   
##  5 Canada                  16.1                   4.31
##  6 Chile                   11                     3.25
##  7 Czech Republic          18.1                   9.09
##  8 Denmark                 19.0                   8.45
##  9 Estonia                 18.8                   6.99
## 10 Finland                 20.5                  11.4 
## # … with 25 more rows
pensions %>% 
  mutate(spend_per_head = gov_spend_percent_gdp/pop_65_percent) %>% 
  ggplot(aes(x = pop_65_percent, y = gov_spend_percent_gdp, color = country, size = spend_per_head)) + 
  geom_point(show.legend = FALSE) + 
  geom_text(aes(label = country), hjust = -0.15, vjust = 0, show.legend = FALSE) + 
  labs(x = "Percent of population aged 65 or older",
       y = "Percent of government spending on pension benefits as percent of GDP",
       title = "Government Spend vs. Population over 65",
       subtitle = "Size of point represents the spend per head",
       caption = "Based on data from The Economist")

Analyzing EU balance

eu_balance <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/eu_balance.csv")

eu_balance
## # A tibble: 266 x 4
##    country account_type  year   value
##    <chr>   <chr>        <dbl>   <dbl>
##  1 Belgium current       2009  -3755 
##  2 Germany current       2009 141234 
##  3 Estonia current       2009    360 
##  4 Ireland current       2009  -7912.
##  5 Greece  current       2009 -29323 
##  6 Spain   current       2009 -46191 
##  7 France  current       2009 -10652 
##  8 Italy   current       2009 -29717 
##  9 Cyprus  current       2009  -1431 
## 10 Latvia  current       2009   1463 
## # … with 256 more rows
eu_balance %>% 
  mutate(year = as.factor(year),
         account_type = as.factor(account_type),
         country = as.factor(country)) %>% 
  group_by(year, account_type) %>% 
  mutate(perc = value/sum(value)) %>% 
  top_n(5, perc) %>% 
  ungroup() %>% 
  ggplot(aes(x = year, y = value, fill = country)) +
  geom_col() + 
  facet_wrap(~account_type) +
  labs(x = "Year",
       y = "Value in billions of euros",
       title = "Top 5 countries per year and account type",
       subtitle = "Based on percentage of balances",
       caption = "Based on data from The Economist")

Analyzing papers published

women_research <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-16/women_research.csv")

women_research
## # A tibble: 60 x 3
##    country        field           percent_women
##    <chr>          <chr>                   <dbl>
##  1 Japan          Health sciences          0.24
##  2 Chile          Health sciences          0.43
##  3 United Kingdom Health sciences          0.45
##  4 United States  Health sciences          0.46
##  5 Mexico         Health sciences          0.46
##  6 Denmark        Health sciences          0.47
##  7 EU28           Health sciences          0.48
##  8 France         Health sciences          0.48
##  9 Canada         Health sciences          0.49
## 10 Australia      Health sciences          0.5 
## # … with 50 more rows
women_research %>% 
  ggplot(aes(x = field, y = percent_women, color = country, size = percent_women)) + 
  geom_point() + 
  scale_size(guide = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  labs(x = "Field of study",
       y = "Percentage of Women",
       title = "Women among researchers with papers published 2011-15 as % of total by field of study for 12 countries",
       subtitle = "Size of point also represents percentage of women",
       color = "Country",
       caption = "Based on data from The Economist")