Tidy Tuesday Exercise

Add Necessary Packages

library(tidytuesdayR)
library(tidyverse) 
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.2.1     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(lubridate) #change data type to data
Loading required package: timechange

Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(skimr) #skim dataframes
library(plotly) #interactive plots 

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(ggstatsplot) #stats plots 
You can cite this package as:
     Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
     Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(gt) #create tables 
library(scales) #build unique color pallets

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(knitr) #format table output
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows

Add Data

age_gaps <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv')
Rows: 1155 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
dbl  (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
date (2): actor_1_birthdate, actor_2_birthdate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean and Explore the data

glimpse(age_gaps)
Rows: 1,155
Columns: 13
$ movie_name         <chr> "Harold and Maude", "Venus", "The Quiet American", …
$ release_year       <dbl> 1971, 2006, 2002, 1998, 2010, 1992, 2009, 1999, 199…
$ director           <chr> "Hal Ashby", "Roger Michell", "Phillip Noyce", "Joe…
$ age_difference     <dbl> 52, 50, 49, 45, 43, 42, 40, 39, 38, 38, 36, 36, 35,…
$ couple_number      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ actor_1_name       <chr> "Ruth Gordon", "Peter O'Toole", "Michael Caine", "D…
$ actor_2_name       <chr> "Bud Cort", "Jodie Whittaker", "Do Thi Hai Yen", "T…
$ character_1_gender <chr> "woman", "man", "man", "man", "man", "man", "man", …
$ character_2_gender <chr> "man", "woman", "woman", "woman", "man", "woman", "…
$ actor_1_birthdate  <date> 1896-10-30, 1932-08-02, 1933-03-14, 1930-09-17, 19…
$ actor_2_birthdate  <date> 1948-03-29, 1982-06-03, 1982-10-01, 1975-11-08, 19…
$ actor_1_age        <dbl> 75, 74, 69, 68, 81, 59, 62, 69, 57, 77, 59, 56, 65,…
$ actor_2_age        <dbl> 23, 24, 20, 23, 38, 17, 22, 30, 19, 39, 23, 20, 30,…
#Change the column order 
age_gaps = age_gaps %>%
  relocate(actor_1_birthdate,actor_1_age,character_1_gender, .after = actor_1_name) %>%
  relocate(actor_2_birthdate,actor_2_age,character_2_gender, .after = actor_2_name)%>%
  relocate(couple_number, .before = age_difference)

Table displaying the data

head(age_gaps) %>% 
  gt() %>%
  tab_header(
    title = "Tidy Tuesday - Hollywood Age Gaps",
    subtitle = "Reformated"
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))
Tidy Tuesday - Hollywood Age Gaps
Reformated
movie_name release_year director couple_number age_difference actor_1_name actor_1_birthdate actor_1_age character_1_gender actor_2_name actor_2_birthdate actor_2_age character_2_gender
Harold and Maude 1971 Hal Ashby 1 52 Ruth Gordon 1896-10-30 75 woman Bud Cort 1948-03-29 23 man
Venus 2006 Roger Michell 1 50 Peter O'Toole 1932-08-02 74 man Jodie Whittaker 1982-06-03 24 woman
The Quiet American 2002 Phillip Noyce 1 49 Michael Caine 1933-03-14 69 man Do Thi Hai Yen 1982-10-01 20 woman
The Big Lebowski 1998 Joel Coen 1 45 David Huddleston 1930-09-17 68 man Tara Reid 1975-11-08 23 woman
Beginners 2010 Mike Mills 1 43 Christopher Plummer 1929-12-13 81 man Goran Visnjic 1972-09-09 38 man
Poison Ivy 1992 Katt Shea 1 42 Tom Skerritt 1933-08-25 59 man Drew Barrymore 1975-02-22 17 woman

Numeric Data Summary

#numeric data summary
numeric_summary=skim(age_gaps,where(is.numeric))
numeric_summary = data.frame(numeric_summary)

#Display as a table 
(numeric_summary) %>% 
  gt() %>%
  tab_header(
    title = "Numeric Data Summary"
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))
Numeric Data Summary
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric release_year 0 1 2000.799134 16.3658191 1935 1997 2004 2012 2022 ▁▁▁▆▇
numeric couple_number 0 1 1.398268 0.7544188 1 1 1 2 7 ▇▁▁▁▁
numeric age_difference 0 1 10.424242 8.5110857 0 4 8 15 52 ▇▃▂▁▁
numeric actor_1_age 0 1 40.635498 10.4241730 18 33 39 47 81 ▂▇▅▂▁
numeric actor_2_age 0 1 30.211255 7.4959523 17 25 29 34 68 ▇▇▂▁▁

Determine the age range difference and the count of occurences as a table

#find the frequency of unique age gap values
ageRange = as.data.frame(table(age_gaps$age_difference)) 
#rename Var1 to Age_Gap
ageRange = rename(ageRange, Age_Gap = Var1)
#ageRange = ageRange %>% arrange(desc(Freq))

#create table based on the frequence
ageRange_Table = ageRange%>% 
  gt()%>% 
cols_label(
   Age_Gap = md("**Age Gap**"),
   Freq = md("**Freq**")
  )%>% 
  tab_header(
    title = "Age Gap Frequency",
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))

#heat map of the frequencies
minfreq = min(ageRange$Freq)
maxfreq = max(ageRange$Freq)
ageGap_Pallet = col_numeric(c("#FEF0D9", "#990000"), domain = c(minfreq, maxfreq), alpha = 0.75)

(ageRange_Table = ageRange_Table %>% 
    data_color(columns = c(Freq),
               colors = ageGap_Pallet))
Age Gap Frequency
Age Gap Freq
0 30
1 77
2 85
3 85
4 66
5 71
6 50
7 71
8 59
9 52
10 45
11 51
12 36
13 37
14 31
15 22
16 33
17 30
18 30
19 21
20 20
21 25
22 11
23 13
24 15
25 18
26 10
27 6
28 10
29 8
30 8
31 2
32 6
33 3
34 4
35 2
36 2
38 2
39 1
40 1
42 1
43 1
45 1
49 1
50 1
52 1

We can see from this table that smaller age gaps are more frequent than larger age gaps. Lets visualize this data as a bar plot

age_bar = ggplot(ageRange, aes(x = Age_Gap, y = Freq)) +
  geom_col(width = 0.3,color="blue") + theme(axis.text.x = element_text(face="bold", 
                           size=8, angle=45)) + xlab("Age Gap (Years)") + ylab("Frequency") +ggtitle("Age Gap Frequency") +theme(
  plot.title = element_text(color="Black", size=14, face="bold"))
ggplotly(age_bar)

As the age increases, the frequency decreases.

##Sex distribution of the the older and younger actors

#find the frequency of unique age gap values
actor1Freq = as.data.frame(table(age_gaps$character_1_gender)) 
#rename Var1 to Age_Gap
actor1Freq = rename(actor1Freq, Sex = Var1)

#create table based on the frequence
actor1_Table = actor1Freq%>% 
  gt()%>% 
cols_label(
   Sex = md("**Sex**"),
   Freq = md("**Freq**")
  )%>% 
  tab_header(
    title = "Actor 1 Frequency",
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))

#find the frequency of unique age gap values
actor2Freq = as.data.frame(table(age_gaps$character_2_gender)) 
#rename Var1 to Age_Gap
actor2Freq = rename(actor2Freq, Sex = Var1)

#create table based on the frequency
actor2_Table = actor2Freq%>% 
  gt()%>% 
cols_label(
   Sex = md("**Sex**"),
   Freq = md("**Freq**")
  )%>% 
  tab_header(
    title = "Actor 2 Frequency",
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))
actor1Freq %>%
  kable("html", align = 'clc') %>%
    kable_styling(full_width = F, position = "float_left")
actor2Freq %>%
  kable("html", align = 'clc') %>%
    kable_styling(full_width = F, position = "right")

Character Sex Frequency Tables

Actor 1
Sex Freq
man 941
woman 214
Actor 2
Sex Freq
man 215
woman 940

There are more male older actors and than female. How does the age gap differ?

ggplot(age_gaps, aes(x=character_1_gender, y=age_difference,color=character_1_gender)) +geom_point() + 
  geom_line() + geom_boxplot()+ggtitle("Actor 1") +theme(
  plot.title = element_text(color="Black", size=14, face="bold"))

**There is a larger age gap for male actors who are older than their partner than female actors who are older.

###There are more female younger actors and than male How does the age gap differ?

ggplot(age_gaps, aes(x=character_2_gender, y=age_difference,color=character_2_gender)) +geom_point() + 
  geom_line() + geom_boxplot()+ geom_boxplot()+ggtitle("Actor 2") +theme(
  plot.title = element_text(color="Black", size=14, face="bold"))

###Max Age difference by year

#plot the max age difference each year over the years 
Max = age_gaps %>%
  group_by(release_year) %>%
  summarise(max = max(age_difference))

ggplot(Max, aes(x = release_year, y = max)) +
  geom_line()+geom_smooth(method = "loess", color = "red", fill = "red")+ggtitle("Max Age difference by year") +theme(
  plot.title = element_text(color="Black", size=14, face="bold"))
`geom_smooth()` using formula = 'y ~ x'

Average age difference by year

#plot the average age difference over the years 

AverageTable = age_gaps %>%
  group_by(release_year) %>%
  summarise(average = mean(age_difference))

ggplot(AverageTable, aes(x = release_year, y = average)) +
  geom_line()+geom_smooth(method = "loess", color = "red", fill = "red")+ggtitle("Average Age difference by year") +theme(
  plot.title = element_text(color="Black", size=14, face="bold"))
`geom_smooth()` using formula = 'y ~ x'

There were alot more movies created in the later years so this may not be a great representation of the data set and age gaps over time.

Do certain directors cast actors with larger age gaps?

Directors who cast with smaller age gaps

d_small=age_gaps%>%
  select(director,age_difference)%>%
  filter(age_difference<25)%>%
  count(director)%>%
  arrange(desc(n))%>%
  filter(n>1)
nrow(d_small)
[1] 245
#create table
d_small_Table = head(d_small)%>% 
  gt()%>% 
cols_label(
   director = md("**Director**"),
   n = md("**Freq**")
  )%>% 
  tab_header(
    title = "Directors who cast small age gaps",
    subtitle = "This table lists the head of the table of directors who cast actors in a movie with an age gap smaller than to 25 years at least 3 times"
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     )) %>%
  tab_source_note(
    source_note = "There are a total of 245 directors who cast at least 2 movies with age gaps below 25 years of age"
  )

d_small_Table
Directors who cast small age gaps
This table lists the head of the table of directors who cast actors in a movie with an age gap smaller than to 25 years at least 3 times
Director Freq
Woody Allen 12
John Glen 11
Martin Scorsese 11
Mike Newell 10
Dennis Dugan 9
Guy Hamilton 9
There are a total of 245 directors who cast at least 2 movies with age gaps below 25 years of age
dSmall_concat=inner_join(age_gaps,d_small,by="director")

Directors who cast with larger age gaps (only ones with more than 1)

d_large=age_gaps%>%
  select(director,age_difference)%>%
  filter(age_difference>=25)%>%
  count(director)%>%
  arrange(desc(n))%>%
  filter(n>1)
d_large
# A tibble: 12 × 2
   director                 n
   <chr>                <int>
 1 Woody Allen              8
 2 John Glen                4
 3 John Huston              3
 4 Adrian Lyne              2
 5 Alfred Hitchcock         2
 6 Howard Hawks             2
 7 Joel Coen                2
 8 Martin Scorsese          2
 9 Paul Thomas Anderson     2
10 Roger Michell            2
11 Scott Cooper             2
12 Stanley Donen            2
dLarge_concat=inner_join(age_gaps,d_large,by="director")

#create table
d_large_Table = d_large%>% 
  gt()%>% 
cols_label(
   director = md("**Director**"),
   n = md("**Freq**")
  )%>% 
  tab_header(
    title = "Directors with large age gaps",
    subtitle = "This table lists the directors who cast actors in a movie with an aga gap greater than or equal to 25 at least 2 times"
  )%>% 
 tab_style(
     locations = cells_title(groups = "title"),
     style     = list(
       cell_text(weight = "bold", size = 24)
     ))

d_large_Table
Directors with large age gaps
This table lists the directors who cast actors in a movie with an aga gap greater than or equal to 25 at least 2 times
Director Freq
Woody Allen 8
John Glen 4
John Huston 3
Adrian Lyne 2
Alfred Hitchcock 2
Howard Hawks 2
Joel Coen 2
Martin Scorsese 2
Paul Thomas Anderson 2
Roger Michell 2
Scott Cooper 2
Stanley Donen 2
p = plot_ly(dLarge_concat, x = ~director, y = ~age_difference,
             type = "scatter", mode = 'markers', marker = list(color = "blue"), 
             text =~paste('</br> Director: ', director,
                      '</br> Movie: ', movie_name,
                      '</br> Age Gap: ', age_difference),
             hoverinfo = "text")
p = p  %>% layout (title = "Directors with age gaps over 25 years and their movies",xaxis = list(title = "Director"), yaxis = list(title = "Age Gap"))
p