12.1 Text Analysis with R: 2024 US Presidential Debate
12.2 Load Packages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(skimr)
12.3 Cleaning Text Data into a Data Frame
Import
# Read the transcript
transcript <- readLines("data/debate.txt")
transcript[1:10]
[1] "PARTICIPANTS:"
[2] "Vice President Kamala Harris (D) and"
[3] "Former President Donald Trump (R)"
[4] ""
[5] "MODERATORS:"
[6] "Linsey Davis (ABC News) and"
[7] "David Muir (ABC News)"
[8] ""
[9] "MUIR: Good evening, I'm David Muir. And thank you for joining us for tonight's ABC News Presidential Debate. We want to welcome viewers watching on ABC and around the world tonight. Vice President Kamala Harris and President Donald Trump are just moments away from taking the stage in this unprecedented race for president."
[10] ""
# Initialize an empty data frame
data <- data.frame(speaker = character(), text = character(), stringsAsFactors = FALSE)

# Define a regular expression pattern to detect speaker lines
speaker_pattern <- "^[A-Z ]+:"  # Lines that start with uppercase letters followed by a colon

# Initialize variables
current_speaker <- NA
current_text <- ""

for (line in transcript) {
  # Check if the line matches the speaker pattern
  if (str_detect(line, speaker_pattern)) {
    # If there's accumulated text, save it before moving to the next speaker
    if (!is.na(current_speaker) && current_text != "") {
      data <- rbind(data, data.frame(speaker = current_speaker, text = current_text, stringsAsFactors = FALSE))
      current_text <- ""
    }
    # Extract the speaker
    split_line <- str_split_fixed(line, ":", 2)
    current_speaker <- str_trim(split_line[1])
    # Start accumulating text
    current_text <- str_trim(split_line[2])
  } else {
    # Accumulate text
    current_text <- paste(current_text, str_trim(line))
  }
}

# Add the last piece of text
if (!is.na(current_speaker) && current_text != "") {
  data <- rbind(data, data.frame(speaker = current_speaker, text = current_text, stringsAsFactors = FALSE))
}
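The word counts below are computed from a tidy_data_clean object that is not created in the chunk above. Presumably the parsed data frame is first restricted to the two candidates, tokenized with tidytext's unnest_tokens(), and stripped of stop words. A minimal sketch of that assumed step follows; the object names candidates and tidy_data are placeholders, and the exact cleaning in the original may differ.

# Quick check of the parsed transcript: lines per speaker
data %>% count(speaker, sort = TRUE)

# Keep the two candidates and recode the all-caps labels produced by the parser
# ("HARRIS", "TRUMP") to the "Harris" / "Trump" labels that appear in the output
# below -- an assumed step, since it is not shown in this section
candidates <- data %>%
  filter(speaker %in% c("HARRIS", "TRUMP")) %>%
  mutate(speaker = str_to_title(speaker))

# Tokenize into one word per row, then drop stop words and numbers -- also assumed
tidy_data <- candidates %>%
  unnest_tokens(word, text)

tidy_data_clean <- tidy_data %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "\\d+"))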
# Calculate word frequencies
word_counts <- tidy_data_clean %>%
  count(speaker, word, sort = TRUE)
Analyze Top Words
# Get top 10 words for each candidate
top_words <- word_counts %>%
  group_by(speaker) %>%
  top_n(10, n) %>%
  ungroup() %>%
  arrange(speaker, -n)

# View the result
print(top_words)
# A tibble: 22 × 3
speaker word n
<chr> <chr> <int>
1 Harris president 56
2 Harris people 48
3 Harris donald 32
4 Harris trump 31
5 Harris american 27
6 Harris united 21
7 Harris plan 20
8 Harris understand 18
9 Harris care 14
10 Harris talk 14
# ℹ 12 more rows
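The bigram analysis below separates a bigrams object that is likewise not created in this section; it presumably comes from tokenizing the candidate text into two-word sequences. A sketch under that assumption, reusing the hypothetical candidates data frame from above:

# Tokenize each answer into bigrams (pairs of consecutive words) -- assumed step
bigrams <- candidates %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))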
# Separate bigrams into two words
bigrams_separated <- bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")

# Remove stop words
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  # remove numbers
  filter(!str_detect(word1, "\\d+"),
         !str_detect(word2, "\\d+"))
# Unite the words back into bigrams
bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(speaker, bigram, sort = TRUE)
# Get top 10 bigrams for each candidate
top_bigrams <- bigram_counts %>%
  group_by(speaker) %>%
  top_n(10, n) %>%
  ungroup() %>%
  arrange(speaker, -n)

# View the result
print(top_bigrams)
# A tibble: 23 × 3
speaker bigram n
<chr> <chr> <int>
1 Harris donald trump 26
2 Harris american people 18
3 Harris vice president 11
4 Harris affordable care 7
5 Harris care act 7
6 Harris donald trump's 6
7 Harris health care 5
8 Harris middle class 5
9 Harris national security 5
10 Harris trump left 4
# ℹ 13 more rows
12.7 Plot
Top Words
# Plot
top_words %>%
  mutate(word = reorder_within(word, n, speaker)) %>%
  ggplot(aes(word, n, fill = speaker)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~speaker, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = "Words", y = "Frequency", title = "Top Words by Candidate")
Top Bigrams
# Plot
top_bigrams %>%
  mutate(bigram = reorder_within(bigram, n, speaker)) %>%
  ggplot(aes(bigram, n, fill = speaker)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~speaker, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = "Bigrams", y = "Frequency", title = "Top Bigrams by Candidate")
12.8 Word Cloud
library(wordcloud)
Loading required package: RColorBrewer
library(RColorBrewer)
# Filter word counts for each candidate
harris_words <- word_counts %>% filter(speaker == "Harris")
trump_words <- word_counts %>% filter(speaker == "Trump")
# Set up the plotting area for two plots
par(mfrow = c(1, 2))

# Word cloud for Harris
wordcloud(words = harris_words$word, freq = harris_words$n,
          min.freq = 2, max.words = 100, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Blues"),
          scale = c(4, 0.5))
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : ensuring could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : focusing could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : government could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : investing could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : knowing could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : protecting could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : stop could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : understanding could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : understands could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : woman could not be fit on page. It will not be plotted.
Warning in wordcloud(words = harris_words$word, freq = harris_words$n, min.freq
= 2, : zelenskyy could not be fit on page. It will not be plotted.
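par(mfrow = c(1, 2)) reserves space for two clouds, but only the Harris call appears above; the companion cloud for Trump presumably follows the same pattern. A sketch of that assumed call (the "Reds" palette and the smaller scale are choices made here, and the smaller scale also reduces the "could not be fit on page" warnings shown above):

# Word cloud for Trump -- assumed companion call, not shown in this section
wordcloud(words = trump_words$word, freq = trump_words$n,
          min.freq = 2, max.words = 100, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Reds"),
          scale = c(3, 0.4))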
12.9 Agenda Keywords
# Define the list of agenda keywords
agenda_keywords <- c("border", "abortion", "economy", "immigration", "health",
                     "security", "israel", "russia", "china", "ukraine")
# Convert words to lowercase
tidy_data_clean <- tidy_data_clean %>%
  mutate(word = tolower(word))

# Filter words that are in the agenda_keywords list
agenda_data <- tidy_data_clean %>%
  filter(word %in% agenda_keywords)

# Count the frequency of each keyword per candidate
agenda_counts <- agenda_data %>%
  count(speaker, word) %>%
  arrange(speaker, desc(n))
print(agenda_counts)
speaker word n
1 Harris security 9
2 Harris economy 8
3 Harris abortion 6
4 Harris health 6
5 Harris israel 5
6 Harris ukraine 5
7 Harris border 3
8 Harris china 3
9 Harris russia 2
10 Harris immigration 1
11 Trump economy 13
12 Trump border 11
13 Trump china 10
14 Trump abortion 8
15 Trump russia 8
16 Trump israel 7
17 Trump ukraine 6
18 Trump health 2
19 Trump immigration 2
20 Trump security 2
12.10 Heatmap
# Create a heatmap
ggplot(agenda_counts, aes(x = word, y = speaker, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Agenda Keyword Frequencies Heatmap", x = " ", y = " ") +
  theme_minimal() +
  # remove grid
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
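The two candidates use the agenda keywords a different number of times overall (48 for Harris versus 69 for Trump in the table above), so raw counts shade the heatmap toward the speaker with more mentions. One optional refinement, not part of the original analysis, is to fill the tiles with each speaker's share of their own agenda mentions:

# Optional variant: within-speaker proportions instead of raw counts
agenda_counts %>%
  group_by(speaker) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = speaker, fill = prop)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red", labels = scales::percent) +
  labs(title = "Share of Each Candidate's Agenda Keyword Mentions", x = " ", y = " ") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())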