Superman Data: Actors, Box Office, and Reviews

Scraping and combining data from Rotten Tomatoes, Letterboxd, and Box Office Mojo to analyze Superman movies and TV shows.

This post documents the process of collecting and combining data about Superman actors, movie reviews, and box office performance from multiple sources.

library(dplyr)
library(haven)
library(labelled)
library(tinytable)
library(readr)
library(here)
library(stringr)
library(lubridate)
library(readxl)
library(ggplot2)
library(scales)
library(xml2)
library(rvest)

Part 1: Superman Actor Data

First, we compile data about the actors who have played Superman and Lois Lane across different films and TV shows.

Load and Process Actor Data

# Read the raw actor sheet; the birth and release dates are parsed below.
superman_df <- read_excel("superman_raw.xlsx", sheet = "superman")

# Compute each actor's age (fractional years) at the release date, then drop
# the date columns, which are only needed for that calculation.
superman_actors <- superman_df |>
  mutate(
    clark_birth = ymd(clark_birth),
    lois_birth = ymd(lois_birth),
    release_date = ymd(release_date),
    clark_age = time_length(interval(clark_birth, release_date), "years"),
    lois_age = time_length(interval(lois_birth, release_date), "years")
  ) |>
  select(-release_date, -clark_birth, -lois_birth)

# Positions of the numeric columns, used to limit digit formatting to them.
# vapply() always yields a logical vector (sapply() returns a list for a
# zero-column table, which which() cannot handle).
numeric_cols <- which(vapply(superman_actors, is.numeric, logical(1)))

superman_actors |>
  tt(caption = "Superman and Lois Lane Actors") |>
  format_tt(j = numeric_cols, digits = 2) |>
  style_tt(bootstrap_class = "table table-striped table-hover")
Superman and Lois Lane Actors
type title year clark_actor clark_height lois_actor lois_height clark_age lois_age
Film Superman 2025 David Corenswet 1.9 Rachel Brosnahan 1.6 32 35
Film Superman: The Movie 1978 Christopher Reeve 1.9 Margot Kidder 1.7 26 30
TV Show Smallville 2001 Tom Welling 1.9 Erica Durance 1.7 24 23
Film Superman Returns 2006 Brandon Routh 1.9 Kate Bosworth 1.6 27 23
Film Superman & the Mole Men 1951 George Reeves 1.9 Phyllis Coates 1.6 38 25
Film Man of Steel 2013 Henry Cavill 1.9 Amy Adams 1.6 30 39
Serial Superman 1948 Kirk Alyn 1.9 Noel Neill 1.6 37 27
TV Show Superman & Lois 2021 Tyler Hoechlin 1.8 Elizabeth Tulloch 1.7 33 40
TV Show Lois & Clark: The New Adventures of Superman 1993 Dean Cain 1.8 Teri Hatcher 1.7 27 29
TV Show The Adventures of Superboy 1988 John Haymes Newton 1.8 NA NA 23 NA
TV Show The Adventures of Superboy 1989 Gerard Christopher 1.8 NA NA 31 NA

Create SPSS Version with Labels

For use in statistics classes, we create a properly labeled SPSS file:

# Recode missing values to the -99 sentinel used by the SPSS exports.
superman_data <- superman_actors |>
  mutate(
    across(where(is.numeric), ~ ifelse(is.na(.), -99, .)),
    across(where(is.character), ~ ifelse(is.na(.), "-99", .))
  )

# Build a "label -> integer code" lookup from the distinct values of `x`, in
# order of first appearance. The "-99" sentinel is left out so that a missing
# entry never becomes a labelled category of its own.
make_value_labels <- function(x) {
  values <- unique(x)
  values <- values[!is.na(values) & values != "-99"]
  setNames(seq_along(values), values)
}

# Translate the strings in `x` to their numeric codes; anything without a
# label (i.e. the missing sentinel) is coded -99 like every other variable.
encode_labelled <- function(x, labels) {
  codes <- as.numeric(factor(x, levels = names(labels)))
  codes[is.na(codes)] <- -99
  codes
}

# Create value labels for categorical variables
type_labels <- make_value_labels(superman_data$type)
title_labels <- make_value_labels(superman_data$title)
actor_labels <- make_value_labels(superman_data$clark_actor)
lois_actor_labels <- make_value_labels(superman_data$lois_actor)

var_labels <- c(
  type = "Media Type",
  title = "Title of Superman Media",
  year = "Year of first superman media appearance",
  clark_actor = "Name of actor playing Superman/Clark Kent",
  clark_height = "Height of Clark Kent/Superman actor (meters)",
  lois_actor = "Name of actress playing Lois Lane",
  lois_height = "Height of Lois Lane actress (meters)",
  clark_age = "Age of Clark Kent/Superman actor at Release Date",
  lois_age = "Age of Lois Lane actress at Release Date"
)

superman_labelled <- superman_data |>
  mutate(
    type = encode_labelled(type, type_labels),
    title = encode_labelled(title, title_labels),
    clark_actor = encode_labelled(clark_actor, actor_labels),
    lois_actor = encode_labelled(lois_actor, lois_actor_labels)
  ) |>
  set_variable_labels(!!!var_labels) |>
  set_value_labels(
    type = type_labels,
    title = title_labels,
    clark_actor = actor_labels,
    lois_actor = lois_actor_labels
  ) |>
  select(year, title, type, clark_actor, clark_height, clark_age, lois_actor, lois_height, lois_age)

# Set SPSS attributes
# haven stores and writes the SPSS display format in the `format.spss`
# attribute. The height/age widths leave room for the -99 sentinel ("-99.00").
spss_formats <- c(
  type = "F8.0",
  title = "F8.0",
  clark_actor = "F8.0",
  lois_actor = "F8.0",
  year = "F4.0",
  clark_height = "F6.2",
  lois_height = "F6.2",
  clark_age = "F6.2",
  lois_age = "F6.2"
)
nominal_cols <- c("type", "title", "clark_actor", "lois_actor")

for (col in names(spss_formats)) {
  attr(superman_labelled[[col]], "format.spss") <- spss_formats[[col]]
  # NOTE(review): kept from the original script; confirm whether any consumer
  # of the file actually reads a `spss_measure` attribute.
  attr(superman_labelled[[col]], "spss_measure") <-
    if (col %in% nominal_cols) "nominal" else "scale"
}

attr(superman_labelled, "label") <- "Superman Data"

write_sav(superman_labelled, "superman.sav")
saveRDS(superman_actors, "superman.rds")

Part 2: Rotten Tomatoes Data

We scrape critic and audience scores from Rotten Tomatoes for Superman movies and TV shows.

Scraping Function

#' Scrape review scores and metadata for one Rotten Tomatoes title
#'
#' @param x URL of a Rotten Tomatoes page. URLs containing "/tv/" are
#'   reported as type "TV", everything else as "Movie".
#' @param ... Unused; accepted so existing calls keep working.
#' @return A one-row tibble with columns `title`, `critics_score`,
#'   `critics_status`, `critics_count`, `audience_score`, `audience_count`,
#'   `synopsis`, `poster_url`, `url` and `type`. Anything that cannot be found
#'   on the page is `NA`.
scrape_movie <- function(x, ...) {
  browser_agent <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

  # `user_agent` is not a documented argument of read_html(), so the header is
  # attached to the connection itself to make sure it is actually sent.
  movie_page <- rvest::read_html(
    url(x, headers = c("User-Agent" = browser_agent))
  )

  is_tv <- stringr::str_detect(x, "/tv/")

  title <- movie_page |>
    rvest::html_element("title") |>
    rvest::html_text() |>
    stringr::str_replace(" \\| Rotten Tomatoes$", "") |>
    stringr::str_trim()

  page_text <- rvest::html_text(movie_page)

  # Find the first piece of page text matching `pattern` and return the number
  # it starts with (thousands separators removed); NA when nothing matches.
  first_number <- function(pattern) {
    page_text |>
      stringr::str_extract(pattern) |>
      stringr::str_extract("[\\d,]+") |>
      stringr::str_replace_all(",", "") |>
      as.numeric()
  }

  critics_score <- first_number("(\\d+)%\\s*(Avg\\.\\s*)?Tomatometer")
  critics_count <- first_number("(\\d+)\\s*Reviews")
  audience_score <- first_number("(\\d+)%\\s*(Avg\\.\\s*)?Popcornmeter")
  audience_count <- first_number("([\\d,]+)\\+?\\s*(Verified\\s*)?Ratings")

  # The status is only derived when a score was found. "Certified Fresh" is
  # detected anywhere in the page text; otherwise a score of 60 or more counts
  # as "Fresh".
  critics_status <- NA_character_
  if (!is.na(critics_score)) {
    certified <- stringr::str_detect(
      page_text,
      stringr::regex("Certified Fresh", ignore_case = TRUE)
    )
    critics_status <- if (certified) {
      "Certified Fresh"
    } else if (critics_score >= 60) {
      "Fresh"
    } else {
      "Rotten"
    }
  }

  synopsis <- movie_page |>
    rvest::html_element('meta[name="description"]') |>
    rvest::html_attr("content")

  poster_url <- movie_page |>
    rvest::html_element('meta[property="og:image"]') |>
    rvest::html_attr("content")

  # Missing pieces are already NA (never NULL), so no `%||%` fallbacks are
  # needed; that operator would also require R >= 4.4 or rlang.
  dplyr::tibble(
    title = title,
    critics_score = critics_score,
    critics_status = critics_status,
    critics_count = critics_count,
    audience_score = audience_score,
    audience_count = audience_count,
    synopsis = synopsis,
    poster_url = poster_url,
    url = x,
    type = if (is_tv) "TV" else "Movie"
  )
}

Scrape Multiple Movies

# Scrape all Superman movies and shows
# The Mole Men title is rewritten with "&" after scraping.
mm_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_and_the_mole_men") |>
  mutate(
    title = str_replace(
      str_trim(title),
      "Superman and the Mole Men",
      "Superman & the Mole Men"
    )
  )

# Films
stm_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_the_movie")
s2_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_ii")
s3_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_iii")
s4_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_iv_the_quest_for_peace")
sr_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_returns")
mos_rt <- scrape_movie("https://www.rottentomatoes.com/m/superman_man_of_steel")
sm_25 <- scrape_movie("https://www.rottentomatoes.com/m/superman_2025")

# TV shows
lc_rt <- scrape_movie("https://www.rottentomatoes.com/tv/lois_clark_the_new_adventures_of_superman")
sb_rt <- scrape_movie("https://www.rottentomatoes.com/tv/the_adventures_of_superboy")
sl_rt <- scrape_movie("https://www.rottentomatoes.com/tv/superman_and_lois")
sm_rt <- scrape_movie("https://www.rottentomatoes.com/tv/smallville")

# Stack the titles used in the analysis. The sequels (s2_rt, s3_rt, s4_rt)
# are scraped above but are not part of the combined table.
rt <- bind_rows(
  list(mm_rt, sm_25, stm_rt, lc_rt, sr_rt, sm_rt, mos_rt, sl_rt, sb_rt)
)

# Create display table
# Every field is turned into display text: scores get a "%" suffix, the poster
# URL becomes an <img> tag, and anything missing is shown as "--".
rt_display <- rt |>
  mutate(
    poster = if_else(
      is.na(poster_url) | poster_url == "Poster URL not available",
      "--",
      paste0('<img src="', poster_url, '" height="70">')
    ),
    critics_score = if_else(is.na(critics_score), "--", paste0(critics_score, "%")),
    critics_status = if_else(is.na(critics_status), "--", critics_status),
    critics_count = if_else(is.na(critics_count), "--", as.character(critics_count)),
    audience_score = if_else(is.na(audience_score), "--", paste0(audience_score, "%")),
    audience_count = if_else(is.na(audience_count), "--", as.character(audience_count))
  ) |>
  select(
    poster, title,
    critics_score, critics_status, critics_count,
    audience_score, audience_count
  )

# Render with reader-friendly headers; escape = FALSE keeps the <img> tags as
# HTML instead of printing them as text.
rt_display |>
  select(
    Poster = poster,
    Title = title,
    `Critics Score` = critics_score,
    `Critics Status` = critics_status,
    `# Critics Reviews` = critics_count,
    `Audience Score` = audience_score,
    `# Audience Ratings` = audience_count
  ) |>
  tt() |>
  format_tt(escape = FALSE)

Create SPSS Version of RT Data

# Numeric codes for the titles, written out by hand.
title_labels <- c(
  "Superman (2025)" = 1,
  "Superman & the Mole Men" = 5,
  "Superman: The Movie" = 2,
  "Lois & Clark: The New Adventures of Superman" = 8,
  "Superman Returns" = 4,
  "Smallville" = 3,
  "Man of Steel" = 6,
  "Superman & Lois" = 7,
  "The Adventures of Superboy" = 9
)

# Missing values become the -99 sentinel (numeric) or "-99" (character).
rt_data <- rt |>
  mutate(
    across(where(is.numeric), ~ifelse(is.na(.), -99, .)),
    across(where(is.character), ~ifelse(is.na(.), "-99", .))
  )

var_labels <- list(
  title = "Title of Superman movie/TV show",
  critics_score = "Percentage of positive critic reviews (0-100)",
  critics_status = "Critic consensus: Fresh/Rotten/Certified Fresh",
  critics_count = "Number of critic reviews collected",
  audience_score = "Percentage of positive audience ratings (0-100)",
  audience_count = "Number of audience ratings collected",
  synopsis = "Brief description of the movie/TV show",
  poster_url = "URL of the poster image",
  url = "Rotten Tomatoes page URL"
)

value_labels <- list(
  title = title_labels,
  critics_status = c("Fresh" = 1, "Rotten" = 2, "Certified Fresh" = 3)
)

# Replace the title and status strings by their numeric codes via a name
# lookup; strings without a code (including the "-99" sentinel) become -99.
rt_labelled <- rt_data |>
  mutate(
    title = coalesce(unname(title_labels[title]), -99),
    critics_status = coalesce(
      unname(value_labels$critics_status[critics_status]),
      -99
    )
  ) |>
  labelled::set_variable_labels(.labels = var_labels) |>
  labelled::set_value_labels(.labels = value_labels) |>
  select(
    title,
    critics_score, critics_status, critics_count,
    audience_score, audience_count,
    synopsis, poster_url, url
  )

write_sav(rt_labelled, "rtomatoes.sav")

Part 3: Box Office Data

We scrape box office performance data from Box Office Mojo.

Find Movie IDs

#' Search Box Office Mojo for a title and list the matching movie ids
#'
#' @param movie_title Free-text search term, e.g. "Superman".
#' @return A data frame with one row per search result and the columns
#'   `title` (link text), `link` (href) and `movie_id` (the "tt..." id found
#'   in the link, `NA` when the link contains none).
find_movie_id <- function(movie_title) {
  # Percent-encode the whole term so that spaces and reserved characters such
  # as "&" cannot break the query string.
  search_url <- paste0(
    "https://www.boxofficemojo.com/search/?q=",
    utils::URLencode(movie_title, reserved = TRUE)
  )

  result_links <- rvest::read_html(search_url) |>
    rvest::html_elements("a.a-size-medium.a-link-normal.a-text-bold")

  hrefs <- rvest::html_attr(result_links, "href")

  data.frame(
    title = rvest::html_text(result_links),
    link = hrefs,
    movie_id = stringr::str_extract(hrefs, "tt[0-9]+"),
    stringsAsFactors = FALSE
  )
}

# Look up the candidate ids for "Superman" and show them as a table.
superman_list <- find_movie_id("Superman")
tt(superman_list)

Box Office Mojo Scraping Function

#' Scrape the summary data of one Box Office Mojo title page
#'
#' @param movie_id Title id used in the page URL, e.g. "tt0078346".
#' @return A one-row data frame. `movie_id` is always present; `title`,
#'   `year`, `description`, `poster_url`, `poster_url_hires`, the
#'   `<category>_gross` / `_gross_numeric` / `_percent` / `_percent_numeric`
#'   columns and the key/value facts are only added when the corresponding
#'   element is found on the page.
extract_complete_movie_data <- function(movie_id) {
  url <- paste0("https://www.boxofficemojo.com/title/", movie_id, "/")
  page <- rvest::read_html(url)

  movie_data <- data.frame(movie_id = movie_id, stringsAsFactors = FALSE)

  # html_element() / xml_find_first() return an "xml_missing" object when
  # nothing matches.
  found <- function(node) !inherits(node, "xml_missing")
  # Trimmed text content of a node.
  node_text <- function(node) stringr::str_trim(rvest::html_text(node))
  # Turn a label such as "Domestic Opening" into the column stem
  # "domestic_opening".
  to_key <- function(label) {
    key <- tolower(stringr::str_replace_all(label, "[^[:alnum:]]", "_"))
    key <- stringr::str_replace_all(key, "_+", "_")
    stringr::str_remove(key, "_$")
  }
  # "$1,234" -> 1234
  money_to_number <- function(text) as.numeric(gsub("[$,]", "", text))

  # Extract movie summary info box
  summary_box <- rvest::html_element(page, ".a-section.mojo-summary")

  if (found(summary_box)) {
    title_element <- rvest::html_element(summary_box, "h1.a-size-extra-large")

    if (found(title_element)) {
      # The heading looks like "Title (1978)": split it into title and year.
      full_title <- node_text(title_element)
      movie_data$title <- stringr::str_replace(full_title, "\\s*\\(\\d{4}\\)$", "")
      movie_data$year <- full_title |>
        stringr::str_extract("\\(\\d{4}\\)") |>
        stringr::str_replace_all("[\\(\\)]", "")
    }

    description <- node_text(rvest::html_element(summary_box, "span.a-size-medium"))
    if (!is.na(description)) {
      movie_data$description <- description
    }

    img_element <- rvest::html_element(summary_box, "img")
    if (found(img_element)) {
      movie_data$poster_url <- rvest::html_attr(img_element, "src")
      img_hires <- rvest::html_attr(img_element, "data-a-hires")
      if (!is.na(img_hires)) {
        movie_data$poster_url_hires <- img_hires
      }
    }
  }

  # Extract box office summary table data
  summary_section <- rvest::html_element(
    page,
    ".a-section.a-spacing-none.mojo-summary-table"
  )

  if (found(summary_section)) {
    data_sections <- rvest::html_elements(summary_section, ".a-section.a-spacing-none")

    for (section in data_sections) {
      # Only sections that carry a small-print category label are of interest.
      label_node <- xml2::xml_find_first(section, ".//span[@class='a-size-small']")
      if (!found(label_node)) {
        next
      }

      # Category name without any parenthesised part, e.g. "Domestic (43%)".
      category <- section |>
        rvest::html_element(".a-size-small") |>
        node_text() |>
        stringr::str_replace_all("\\s*\\([^)]*\\)\\s*", "") |>
        stringr::str_trim()

      money_node <- rvest::html_element(section, "span.money")
      money_value <- if (found(money_node)) rvest::html_text(money_node) else NA

      percent_node <- rvest::html_element(section, "span.percent")
      percent <- if (found(percent_node)) rvest::html_text(percent_node) else NA

      if (!is.na(money_value)) {
        category_clean <- to_key(category)

        # Keep both the formatted text and its numeric value.
        movie_data[[paste0(category_clean, "_gross")]] <- money_value
        movie_data[[paste0(category_clean, "_gross_numeric")]] <-
          money_to_number(money_value)

        if (!is.na(percent)) {
          movie_data[[paste0(category_clean, "_percent")]] <- percent
          movie_data[[paste0(category_clean, "_percent_numeric")]] <-
            as.numeric(gsub("[%]", "", percent))
        }
      }
    }
  }

  # Key/value facts. Later steps of this post read `budget`, `budget_numeric`,
  # `domestic_opening_numeric`, `earliest_release_date` and `mpaa`, none of
  # which the table above produces.
  # NOTE(review): the ".mojo-summary-values" selector and the two-<span>
  # label/value row layout are assumed from Box Office Mojo's page structure;
  # verify against a live page. Nothing is added when the selector is absent.
  values_box <- rvest::html_element(page, ".mojo-summary-values")

  if (found(values_box)) {
    for (row in rvest::html_elements(values_box, "div.a-section")) {
      cells <- rvest::html_elements(row, xpath = "./span")
      if (length(cells) < 2) {
        next
      }

      key <- to_key(node_text(cells[[1]]))
      # Never overwrite a column that was filled in above.
      if (!nzchar(key) || key %in% names(movie_data)) {
        next
      }

      movie_data[[key]] <- node_text(cells[[2]])

      money_node <- rvest::html_element(cells[[2]], "span.money")
      if (found(money_node)) {
        movie_data[[paste0(key, "_numeric")]] <-
          money_to_number(rvest::html_text(money_node))
      }
    }
  }

  movie_data
}

Extract Multiple Movies

#' Scrape several Box Office Mojo titles and stack the results
#'
#' @param movie_ids Character vector of title ids.
#' @param delay Seconds to wait between two requests (default 2).
#' @return A data frame with one row per successfully scraped title, or
#'   `NULL` when no title could be scraped. Titles that fail are reported
#'   with a message and skipped.
extract_multiple_movies <- function(movie_ids, delay = 2) {
  scraped <- lapply(seq_along(movie_ids), function(i) {
    id <- movie_ids[[i]]
    message("Processing movie ID: ", id)

    movie_data <- tryCatch(
      extract_complete_movie_data(id),
      error = function(e) {
        message("Error processing movie ID: ", id, " - Error: ", conditionMessage(e))
        NULL
      }
    )

    # Be polite to the server; no need to wait after the final request.
    if (i < length(movie_ids)) {
      Sys.sleep(delay)
    }

    movie_data
  })

  scraped <- Filter(Negate(is.null), scraped)
  if (length(scraped) == 0) {
    return(NULL)
  }

  # bind_rows() fills columns that are missing from some titles with NA.
  dplyr::bind_rows(scraped)
}

# Superman movie IDs from IMDB/Box Office Mojo
movie_ids <- c(
  "tt5950044",
  "tt0078346",
  "tt0770828",
  "tt0348150",
  "tt0081573",
  "tt0086393",
  "tt0094074",
  "tt2975590"
)

all_movies <- extract_multiple_movies(movie_ids)

# Display results
# The scraper only creates a column when the matching element exists on a
# page, so any_of() is used to show whichever of these columns are present
# instead of failing on an absent one.
all_movies |>
  mutate(
    poster = if_else(
      !is.na(poster_url),
      paste0('<img src="', poster_url, '" height="80">'),
      ""
    )
  ) |>
  select(any_of(c("poster", "title", "year", "worldwide_gross", "budget"))) |>
  tt() |>
  format_tt(escape = FALSE) |>
  style_tt(bootstrap_class = "table table-striped")

Clean Box Office Data

# Tidy the scraped box office table: numeric columns, a parsed release date,
# a decade label, era flags and the lead actor implied by the release year.
# Expects the scraped columns `worldwide_gross_numeric`, `budget_numeric`,
# `domestic_opening_numeric`, `domestic_gross_numeric`,
# `domestic_percent_numeric`, `earliest_release_date` and `mpaa`.
clean_boxoffice_df <- all_movies |>
  mutate(
    year = as.numeric(year),
    box_office_numeric = worldwide_gross_numeric,
    budget_numeric = as.numeric(budget_numeric),
    opening = as.numeric(domestic_opening_numeric),
    domestic = as.numeric(domestic_gross_numeric),
    percent = as.numeric(domestic_percent_numeric),

    title = str_trim(title),

    # Keep only the "Month day, year" part of the release-date text.
    release_date = str_extract(earliest_release_date, "\\w+ \\d+, \\d{4}"),
    release_date = as.Date(release_date, format = "%B %d, %Y"),

    decade = paste0(floor(year / 10) * 10, "s"),

    mpaa_rating = str_trim(mpaa),
    mpaa_rating = if_else(is.na(mpaa_rating), "Unrated", mpaa_rating),

    is_original_series = year >= 1978 & year <= 1987,
    is_modern_era = year == 2006,
    is_dceu = year >= 2010 & year <= 2024,
    is_dcu = year >= 2025,

    # Years outside all four eras leave clark_actor as NA.
    clark_actor = case_when(
      is_original_series ~ "Christopher Reeve",
      is_modern_era ~ "Brandon Routh",
      is_dceu ~ "Henry Cavill",
      is_dcu ~ "David Corenswet"
    )
  ) |>
  select(
    movie_id, title, year, release_date, decade, mpaa_rating,
    budget_numeric, box_office_numeric, opening, domestic, percent,
    is_original_series, is_modern_era, is_dceu, is_dcu, clark_actor,
    # Only scraped when the poster image carries a high-resolution source.
    any_of("poster_url_hires")
  )

write.csv(clean_boxoffice_df, "boxoffice_raw.csv", row.names = FALSE)

# The SPSS file is the cleaned table plus variable labels. (The original
# script wrote `boxoffice_labelled` without ever creating it.)
boxoffice_labelled <- clean_boxoffice_df |>
  set_variable_labels(
    .labels = list(
      movie_id = "Box Office Mojo / IMDb title id",
      title = "Movie title",
      year = "Release year",
      release_date = "Earliest release date",
      decade = "Decade of release",
      mpaa_rating = "MPAA rating",
      budget_numeric = "Production budget (USD)",
      box_office_numeric = "Worldwide gross (USD)",
      opening = "Domestic opening (USD)",
      domestic = "Domestic gross (USD)",
      percent = "Domestic share of worldwide gross (%)",
      is_original_series = "Released 1978-1987",
      is_modern_era = "Released 2006",
      is_dceu = "Released 2010-2024",
      is_dcu = "Released 2025 or later",
      clark_actor = "Name of actor playing Superman/Clark Kent"
    )
  )

write_sav(boxoffice_labelled, "boxoffice.sav")

Part 4: Letterboxd Reviews

We also scrape user reviews from Letterboxd for sentiment analysis.

Letterboxd Scraping Function

# Source the letterboxd scraping functions
source("letterbox.R")

#' Scrape Letterboxd reviews for one film without ever raising an error
#'
#' Wraps `scrape_movie_reviews()` (always called with `file = FALSE`).
#'
#' @param movie_slug,num_pages,random_pages,max_page Passed on to
#'   `scrape_movie_reviews()`.
#' @return Whatever `scrape_movie_reviews()` returns, or `NULL` (with a
#'   message) when it fails, returns `NULL`, or returns zero rows.
safe_scrape <- function(movie_slug, num_pages = 2, random_pages = TRUE, max_page = 5) {
  report_failure <- function(e) {
    message("Error scraping ", movie_slug, ": ", e$message)
    NULL
  }

  tryCatch(
    {
      reviews <- scrape_movie_reviews(
        movie_slug,
        num_pages = num_pages,
        random_pages = random_pages,
        max_page = max_page,
        file = FALSE
      )

      has_rows <- !is.null(reviews) && nrow(reviews) != 0
      if (has_rows) {
        reviews
      } else {
        message("No data returned for: ", movie_slug)
        NULL
      }
    },
    error = report_failure
  )
}

# Scrape reviews for each movie
# Serials and the 1951 film get a smaller page range than the later films.
superman_1948 <- safe_scrape("superman-1948", max_page = 5)
atomman <- safe_scrape("atom-man-vs-superman", max_page = 10)
moleman <- safe_scrape("superman-and-the-mole-men", max_page = 10)

superman2025 <- safe_scrape("superman-2025", max_page = 50)
superman1 <- safe_scrape("superman", max_page = 50)
superman2 <- safe_scrape("superman-ii", max_page = 50)
superman3 <- safe_scrape("superman-iii", max_page = 50)
superman4 <- safe_scrape("superman-iv-the-quest-for-peace", max_page = 50)
superman_returns <- safe_scrape("superman-returns", max_page = 50)
man_of_steel <- safe_scrape("man-of-steel", max_page = 50)

# Combine all results
# superman2, superman3 and superman4 are scraped above but are not part of
# the combined set.
all_scrapes <- list(
  superman_1948, atomman, moleman, superman1,
  superman_returns, man_of_steel, superman2025
)

# Drop the films whose scrape returned NULL.
was_scraped <- !vapply(all_scrapes, is.null, logical(1))
valid_scrapes <- all_scrapes[was_scraped]

# `letterboxd` is only created when at least one scrape succeeded.
if (length(valid_scrapes) > 0) {
  letterboxd <- rename(bind_rows(valid_scrapes), title = movie_title)

  message("Successfully scraped ", nrow(letterboxd), " reviews")
}

Part 5: Combined Dataset

Finally, we combine all data sources into a single comprehensive dataset.

Combine All Sources

# Re-read the files written by the earlier sections.
superman_spss <- read_sav("superman.sav")
rt_labelled <- read_sav("rtomatoes.sav")
boxoffice_df <- read_csv("boxoffice_raw.csv")

# Join actor data with RT data
# Both files store `title` as a numeric code, which is the join key here.
dat <- superman_spss |>
  full_join(rt_labelled, by = "title")

# Rows that exist in only one of the two files get the -99 missing sentinel.
superman_rt <- dat |>
  mutate(
    across(where(is.numeric), ~if_else(is.na(.), -99, .)),
    across(where(is.character), ~if_else(is.na(.), "-99", .))
  )

# Join with box office data
# One actor can have several box office rows (e.g. the Christopher Reeve
# sequels), so the film's own title and year from the box office table take
# precedence; the actor table's values are only the fallback for rows without
# box office data. Taking the actor table first would label every sequel with
# the title and year of the actor's first film.
superman_boxoffice <- superman_actors |>
  full_join(boxoffice_df, by = "clark_actor") |>
  mutate(
    title = coalesce(as.character(title.y), as.character(title.x)),
    year = coalesce(as.numeric(year.y), as.numeric(year.x))
  ) |>
  select(-ends_with(".x"), -ends_with(".y"))

# Save combined files
write_sav(superman_rt, "superman_rt.sav")
saveRDS(superman_rt, "superman_rt.rds")
write_sav(superman_boxoffice, "superman_complete.sav")

Part 6: Visualizations

Superman Actor Heights Over Time

# Height of each Superman actor against the year of the production, with the
# actor's name printed next to the point.
ggplot(superman_actors |> filter(!is.na(clark_height)),
       aes(x = year, y = clark_height)) +
  geom_point(aes(color = type), size = 5, alpha = 0.8) +
  geom_text(
    aes(label = clark_actor, color = type),
    hjust = -0.1,
    vjust = 0.5,
    size = 3.5,
    fontface = "bold",
    show.legend = FALSE
  ) +
  # The names must equal the values of `type` ("Film", "TV Show", "Serial");
  # a value without a matching name would not get a colour.
  scale_color_manual(
    values = c("Film" = "#0073CF", "TV Show" = "#E21A22", "Serial" = "#F5B700")
  ) +
  labs(
    title = "Superman Actor Heights Over Time",
    subtitle = "Height in meters by year of first appearance",
    x = "Year",
    y = "Height (m)",
    color = "Media Type"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 18),
    panel.grid.minor = element_blank()
  ) +
  scale_x_continuous(breaks = seq(1950, 2030, by = 10)) +
  scale_y_continuous(limits = c(1.75, 2.05)) +
  # Let the name labels run past the panel edge instead of being cut off.
  coord_cartesian(clip = "off")

Height Comparison: Superman vs Lois Lane

# Lois Lane actress height against Superman actor height for every production
# where both heights are known.
superman_actors |>
  filter(!is.na(lois_height) & !is.na(clark_height)) |>
  ggplot(aes(x = clark_height, y = lois_height)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha = 0.3) +
  geom_point(aes(color = type), size = 6, alpha = 0.7) +
  geom_text(
    aes(label = title),
    hjust = -0.1,
    vjust = 1.5,
    size = 3
  ) +
  # The names must equal the values of `type` ("Film", "TV Show", "Serial");
  # a value without a matching name would not get a colour.
  scale_color_manual(
    values = c("Film" = "#0073CF", "TV Show" = "#E21A22", "Serial" = "#F5B700")
  ) +
  labs(
    title = "Superman vs Lois Lane: Actor Height Comparison",
    subtitle = "Dashed line represents equal height",
    x = "Superman Actor Height (m)",
    y = "Lois Lane Actress Height (m)",
    color = "Media Type"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 18),
    panel.grid.minor = element_blank()
  )

Age Comparison: Superman vs Lois Lane

# Lois Lane actress age against Superman actor age (both at release) for every
# production where both ages are known.
superman_actors |>
  filter(!is.na(lois_age) & !is.na(clark_age)) |>
  ggplot(aes(x = clark_age, y = lois_age)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha = 0.3) +
  geom_point(aes(color = type), size = 6, alpha = 0.7) +
  geom_text(
    aes(label = clark_actor),
    hjust = -0.1,
    vjust = 1.5,
    size = 3
  ) +
  # The names must equal the values of `type` ("Film", "TV Show", "Serial");
  # a value without a matching name would not get a colour.
  scale_color_manual(
    values = c("Film" = "#0073CF", "TV Show" = "#E21A22", "Serial" = "#F5B700")
  ) +
  labs(
    title = "Superman vs Lois Lane: Actor Age Comparison",
    subtitle = "Age at time of release; dashed line represents equal age",
    x = "Superman Actor Age (years)",
    y = "Lois Lane Actress Age (years)",
    color = "Media Type"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 18),
    panel.grid.minor = element_blank()
  ) +
  # Same range on both axes so the equal-age line runs along the diagonal.
  xlim(20, 45) +
  ylim(20, 45)

Back to top