class: center, middle, inverse, title-slide .title[ # Visualizing text + network data ] .author[ ### MACS 40700
University of Chicago ] --- ## Setup .small[ ``` r # load packages library(tidyverse) library(tidytext) library(ggtext) library(glue) library(ggwordcloud) library(ggraph) library(igraph) # set default theme for ggplot2 ggplot2::theme_set(ggplot2::theme_minimal(base_size = 16)) # set default figure parameters for knitr knitr::opts_chunk$set( fig.width = 8, fig.asp = 0.618, fig.retina = 2, dpi = 150, out.width = "60%" ) # dplyr print min and max options(dplyr.print_max = 10, dplyr.print_min = 10) ``` ] --- class: middle, inverse # Text data --- class: middle <img src="images/ts_positive_words_album_track.png" width="80%" style="display: block; margin: auto;" /> --- ## Text as data Text can be represented as data in a variety of ways: - **String**: Character vector - **Corpus**: Raw strings annotated with additional metadata and details - **Document-term matrix**: Sparse matrix describing a collection (i.e., a corpus) of documents with one row for each document and one column for each term, with word counts (or another measure of how common the word is in that text) as values - **Word embeddings**: A dense array structure (typically a 3D array) describing a collection of documents where each word is projected onto an `\(n\)`-dimensional feature space that encodes the meaning of the words based on their semantic similarity --- ## Tidy text - Each row is a **token** - A token can be a word, bigram (two words), ngram (n words), sentence, paragraph, etc. - Each column is a variable - Each type of observational unit is a row --- class: center, middle, inverse ## Tidy text: Application ### Love Actually: popular *Christmas movie* ... --- ## `Tidy text`: Application ``` r love_actually %>% slice_head(n = 6) ``` ``` ## # A tibble: 6 × 4 ## scene line speaker dialogue ## <dbl> <dbl> <chr> <chr> ## 1 1 1 (Man) 'Whenever I get gloomy with the state of the world, I thi… ## 2 2 2 Billy ♪ I feel it in my fingers ♪ I feel it in my toes ♪ Feel i… ## 3 2 3 Joe I'm afraid you did it again, Bill. ## 4 2 4 Billy It's just I know the old version so well, you know. ## 5 2 5 Joe Well, we all do. That's why we're making the new version. ## 6 2 6 Billy Right, OK, let's go. ♪ I feel it in my fingers ♪ In my fi… ``` --- ## Tokenize into words With `tidytext::unnest_tokens()`: ``` r love_actually %>% unnest_tokens( output = word, # first argument is output input = dialogue, # second argument is input token = "words" # third argument is token, with default "words" ) %>% slice_head(n = 6) ``` ``` ## # A tibble: 6 × 4 ## scene line speaker word ## <dbl> <dbl> <chr> <chr> ## 1 1 1 (Man) whenever ## 2 1 1 (Man) i ## 3 1 1 (Man) get ## 4 1 1 (Man) gloomy ## 5 1 1 (Man) with ## 6 1 1 (Man) the ``` --- ## Most common words .task[ Why do these words appear so commonly in Love Actually? ] ``` r love_actually %>% unnest_tokens(word, dialogue) %>% count(word, sort = TRUE) ``` ``` ## # A tibble: 1,770 × 2 ## word n ## <chr> <int> ## 1 you 334 ## 2 i 300 ## 3 the 263 ## 4 a 201 ## 5 and 199 ## 6 to 199 ## 7 it 150 ## 8 is 124 ## 9 of 112 ## 10 no 111 ## # ℹ 1,760 more rows ``` --- ## Stop words - In computing, stop words are words which are filtered out before or after processing of natural language data (text) - They usually refer to most common words in a language, but there is not a single list of stop words used by all natural language processing tools .pull-left[ English: .small[ ``` r get_stopwords("en") ``` ``` ## # A tibble: 175 × 2 ## word lexicon ## <chr> <chr> ## 1 i snowball ## 2 me snowball ## 3 my snowball ## 4 myself snowball ## 5 we snowball ## 6 our snowball ## 7 ours snowball ## 8 ourselves snowball ## 9 you snowball ## 10 your snowball ## # ℹ 165 more rows ``` ] ] .pull-right[ Spanish: .small[ ``` r get_stopwords("es") ``` ``` ## # A tibble: 308 × 2 ## word lexicon ## <chr> <chr> ## 1 de snowball ## 2 la snowball ## 3 que snowball ## 4 el snowball ## 5 en snowball ## 6 y snowball ## 7 a snowball ## 8 los snowball ## 9 del snowball ## 10 se snowball ## # ℹ 298 more rows ``` ] ] --- ## Most common words ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) ``` ``` ## # A tibble: 1,355 × 2 ## word n ## <chr> <int> ## 1 christmas 49 ## 2 yeah 48 ## 3 er 45 ## 4 love 40 ## 5 erm 39 ## 6 sir 28 ## 7 portuguese 25 ## 8 god 23 ## 9 bye 21 ## 10 time 20 ## # ℹ 1,345 more rows ``` --- ## Portuguese?! ``` r love_actually %>% filter(str_detect(dialogue, "[Pp]ortuguese")) ``` ``` ## # A tibble: 25 × 4 ## scene line speaker dialogue ## <dbl> <dbl> <chr> <chr> ## 1 32 360 woman Unfortunately, she cannot speak French, just like you. S… ## 2 33 367 Jamie (Pidgin Portuguese) Bello. Er, bella. Er, mon-montagno, … ## 3 38 421 Aurelia (Portuguese) Thank you very much but no. If you saw my s… ## 4 38 423 Aurelia (Portuguese)Just don't go eating it all yourself, you're… ## 5 39 426 Aurelia (Portuguese) Nao! Eu peco imensa desculpa. Oh, no. Hold … ## 6 39 430 Aurelia (Portuguese) Fuck - it's cold! ## 7 39 432 Aurelia (Portuguese) This stuff better be good. ## 8 39 434 Aurelia (Portuguese) I don't want to drown saving some shit my g… ## 9 39 436 Aurelia (Portuguese) What kind of an idiot doesn't do copies? ## 10 39 438 Aurelia (Portuguese) Try not to disturb the eels. ## # ℹ 15 more rows ``` --- ## Data cleaning - Remove language identifiers ``` r love_actually <- love_actually %>% mutate(dialogue = str_remove(dialogue, "(Portuguese)")) ``` - Take another look ``` r love_actually %>% filter(str_detect(dialogue, "[Pp]ortuguese")) ``` ``` ## # A tibble: 0 × 4 ## # ℹ 4 variables: scene <dbl>, line <dbl>, speaker <chr>, dialogue <chr> ``` --- ## Most common words Without "Portuguese" ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) ``` ``` ## # A tibble: 1,354 × 2 ## word n ## <chr> <int> ## 1 christmas 49 ## 2 yeah 48 ## 3 er 45 ## 4 love 40 ## 5 erm 39 ## 6 sir 28 ## 7 god 23 ## 8 bye 21 ## 9 time 20 ## 10 ah 19 ## # ℹ 1,344 more rows ``` --- ## Visualizing Most common words Visualize the top 10 words .panelset.sideways[ .panel[.panel-name[Code] ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% * slice_head(n = 10) %>% ggplot(aes(y = word, x = n)) + geom_col() ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-14-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Visualizing Most common words Reorder the words by frequency .panelset.sideways[ .panel[.panel-name[Code] ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% slice_head(n = 10) %>% * ggplot(aes(y = fct_reorder(word, n), x = n)) + geom_col() ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-15-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Visualizing Most common words Bring in some color .panelset.sideways[ .panel[.panel-name[Code] ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% slice_head(n = 10) %>% ggplot(aes(y = fct_reorder(word, n), x = n)) + * geom_col(fill = "#BD2D2A") + labs( x = "Count", y = NULL, title = "Most common words in Love Actually" ) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-16-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Visualizing the most common words Use `ggtext::element_textbox_simple()` to add color to title .panelset.sideways[ .panel[.panel-name[Code] ``` r love_actually %>% unnest_tokens(word, dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% slice_head(n = 10) %>% ggplot(aes(y = fct_reorder(word, n), x = n)) + geom_col(fill = "#BD2D2A") + labs( x = "Count", y = NULL, * title = "<span style = 'color:#808080'>Most common words in</span>", * subtitle ="<span style = 'color:#BD2D2A;font-weight:strong'><b>love</b></span> actually" ) + theme( * plot.title = element_textbox_simple(halign = 1), * plot.subtitle = element_textbox_simple(size = 30, halign = 1), * plot.title.position = "plot" ) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-17-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ### Beyond our scope: TD-IDF There are additional measures of 'uniqueness' for words. <img src="images/ts_idf_words.png" width="80%" style="display: block; margin: auto;" /> --- ## Skillz combo! <img src="https://static01.nyt.com/images/2020/11/12/learning/ThanksgivingMapLN/ThanksgivingMapLN-superJumbo.png?quality=75&auto=webp" width="75%" style="display: block; margin: auto;" /> .footnote[[Source: NYTimes 2020](https://www.nytimes.com/2020/11/12/learning/whats-going-on-in-this-graph-thanksgiving-side-dishes.html)] --- ## Wordclouds Using `ggwordcloud::geom_text_wordcloud()`: .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(42) love_actually %>% unnest_tokens(output = word, input = dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% filter(n > 5) %>% ggplot(aes(label = word, size = n)) + geom_text_wordcloud() + scale_size_area(max_size = 10) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-19-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Wordclouds, with color .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(42) love_actually %>% unnest_tokens(output = word, input = dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% filter(n > 5) %>% ggplot(aes(label = word, size = n)) + * geom_text_wordcloud(aes(color = n)) + scale_size_area(max_size = 10) + * scale_color_distiller(palette = "Reds") ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-20-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Wordclouds, with color (dark = popular) .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(42) love_actually %>% unnest_tokens(output = word, input = dialogue) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% filter(n > 5) %>% ggplot(aes(label = word, size = n)) + * geom_text_wordcloud(aes(color = n)) + scale_size_area(max_size = 10) + * scale_color_distiller(palette = "Reds", direction = 1) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-21-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- class: center, middle # CAVEAT: WORDCLOUDS ARE NOT ANALYSIS!!!!!! --- class: inverse, center, middle # Your turn! --- # Activity: Make a visualization - Find relevant data (see next slides for more detail) * If you are struggling -- you can use The Office data: https://www.kaggle.com/datasets/nasirkhalid24/the-office-us-complete-dialoguetranscript - Remove stopwords - Create a relevant visualization (e.g. top words)
--- ## Activity: finding relevant data * Honestly -- Kaggle can often be an easy source but probably not the best aside from quick / short projects * Google: 'thing you care about' + 'dataset' * APIs (see MACS 30550) * Scraping (beyond our context MACS 30550 or MACS 30112/30122) --- # Activity: guided (The Office) [step 1: load] ``` ## Importing fonts may take a few minutes, depending on the number of fonts and the speed of the system. ## Continue? [y/n] ``` Data source: [Kaggle Office data](https://www.kaggle.com/datasets/nasirkhalid24/the-office-us-complete-dialoguetranscript) ``` r office_words <- office %>% unnest_tokens( output = word, # first argument is output input = line, # second argument is input token = "words" # third argument is token, with default "words" ) office_words %>% slice_head(n = 6) ``` ``` ## # A tibble: 6 × 7 ## season episode title scene speaker ...7 word ## <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr> ## 1 1 1 Pilot 1 Michael <NA> all ## 2 1 1 Pilot 1 Michael <NA> right ## 3 1 1 Pilot 1 Michael <NA> jim ## 4 1 1 Pilot 1 Michael <NA> your ## 5 1 1 Pilot 1 Michael <NA> quarterlies ## 6 1 1 Pilot 1 Michael <NA> look ``` --- # Activity: guided (The Office) [step2: no stopwords] Notice what we have here -- why would this NOT be a good approach? ``` r office_words %>% anti_join(stop_words) %>% count(word, sort = TRUE) ``` ``` ## # A tibble: 19,111 × 2 ## word n ## <chr> <int> ## 1 yeah 2909 ## 2 hey 2213 ## 3 michael 1855 ## 4 uh 1467 ## 5 gonna 1416 ## 6 dwight 1354 ## 7 jim 1163 ## 8 time 1157 ## 9 pam 1046 ## 10 guys 947 ## # ℹ 19,101 more rows ``` --- .panelset.sideways[ .panel[.panel-name[Code] ``` r *custom_words <-data.frame("word" = c("hey", "uh", "um")) *custom_words <- full_join(stop_words, custom_words) *char_names <- c("michael", "jim", "dwight", "pam") office_words %>% anti_join(custom_words) %>% count(word, sort = TRUE) %>% mutate(character = ifelse( word %in% char_names, "yes","no")) %>% slice_head(n = 10) %>% ggplot(aes(y = fct_reorder(word, n), x = n, fill = character)) + geom_col() + scale_fill_manual(values = c("yes" = "#2B152C", "no" = "#B3A6A3")) + labs( x = "Count", y = NULL, title = "<span style = 'color:#808080'>Most common words in</span>", subtitle ="<b>The Office</b>" ) + theme( plot.title = element_textbox_simple(halign = 1), plot.subtitle = element_textbox_simple(size = 30, halign = 1, family = "courier prime"), plot.title.position = "plot" ) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-26-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- # Closing thoughts: * What did we do well in our previous graphic? * How might we improve? --- class: inverse, middle # Network data --- ## Network data - A **network** refers to an object composed of elements and relationships or connections between those elements -- - **Network analysis** is a collection of techniques for examining the relationships between entities, and depicting the structure of those relationships - Network analysis spans a number of domains, including social networks, bibliometrics, epidemiology, bioinformatics, complex systems, and text analysis -- - **Graph theory** provides the formal basis for network analysis, across domains, and provides a common language for describing the structure of networks -- - **Network visualization** involves the visualization of the relationships (edges or links) between data elements (nodes) --- ## Purpose Main concern in designing a network visualization is the purpose it has to serve: - What are the structural properties that we want to highlight? - What are the key concerns we want to address? <img src="images/network-purpose.png" width="50%" style="display: block; margin: auto;" /> .footnote[ Source: https://kateto.net/network-visualization ] --- ## Format - Network maps are not the only visualization appropriate for graphs - Other network representation formats, including simple charts of key characteristics, may be more appropriate in some cases <img src="images/network-format.png" width="50%" style="display: block; margin: auto;" /> .footnote[ Source: https://kateto.net/network-visualization ] --- ## Bigrams as tokens **Bigram** is a sequence of two adjacent elements ferom a string of tokens, e.g., two consecutive words ``` r bigrams <- love_actually %>% unnest_tokens(bigram, dialogue, token = "ngrams", n = 2) bigrams ``` ``` ## # A tibble: 8,966 × 4 ## scene line speaker bigram ## <dbl> <dbl> <chr> <chr> ## 1 1 1 (Man) whenever i ## 2 1 1 (Man) i get ## 3 1 1 (Man) get gloomy ## 4 1 1 (Man) gloomy with ## 5 1 1 (Man) with the ## 6 1 1 (Man) the state ## 7 1 1 (Man) state of ## 8 1 1 (Man) of the ## 9 1 1 (Man) the world ## 10 1 1 (Man) world i ## # ℹ 8,956 more rows ``` --- ## Most common bigrams ``` r bigrams %>% count(bigram, sort = TRUE) ``` ``` ## # A tibble: 6,356 × 2 ## bigram n ## <chr> <int> ## 1 <NA> 111 ## 2 thank you 32 ## 3 do you 25 ## 4 this is 25 ## 5 come on 24 ## 6 you know 24 ## 7 and i 21 ## 8 in the 21 ## 9 are you 20 ## 10 i think 20 ## # ℹ 6,346 more rows ``` --- ## Removing stop words from bigrams Step 1: Separate the bigram into words ``` r bigrams_separated <- bigrams %>% separate(bigram, c("word1", "word2"), sep = " ") bigrams_separated ``` ``` ## # A tibble: 8,966 × 5 ## scene line speaker word1 word2 ## <dbl> <dbl> <chr> <chr> <chr> ## 1 1 1 (Man) whenever i ## 2 1 1 (Man) i get ## 3 1 1 (Man) get gloomy ## 4 1 1 (Man) gloomy with ## 5 1 1 (Man) with the ## 6 1 1 (Man) the state ## 7 1 1 (Man) state of ## 8 1 1 (Man) of the ## 9 1 1 (Man) the world ## 10 1 1 (Man) world i ## # ℹ 8,956 more rows ``` --- ## Removing stop words from bigrams Step 2: Remove bigram if either word is a stop word ``` r bigrams_filtered <- bigrams_separated %>% filter(!(word1 %in% stop_words$word)) %>% filter(!(word2 %in% stop_words$word)) bigrams_filtered ``` ``` ## # A tibble: 930 × 5 ## scene line speaker word1 word2 ## <dbl> <dbl> <chr> <chr> <chr> ## 1 1 1 (Man) arrivals gate ## 2 1 1 (Man) heathrow airport ## 3 1 1 (Man) opinion started ## 4 1 1 (Man) sons mothers ## 5 1 1 (Man) daughters husbands ## 6 1 1 (Man) wives boyfriends ## 7 1 1 (Man) boyfriends girlfriends ## 8 1 1 (Man) planes hit ## 9 1 1 (Man) twin towers ## 10 1 1 (Man) phone calls ## # ℹ 920 more rows ``` --- ## Removing stop words from bigrams Step 3: Put the bigrams back together ``` r bigrams_united <- bigrams_filtered %>% unite(bigram, word1, word2, sep = " ") bigrams_united ``` ``` ## # A tibble: 930 × 4 ## scene line speaker bigram ## <dbl> <dbl> <chr> <chr> ## 1 1 1 (Man) arrivals gate ## 2 1 1 (Man) heathrow airport ## 3 1 1 (Man) opinion started ## 4 1 1 (Man) sons mothers ## 5 1 1 (Man) daughters husbands ## 6 1 1 (Man) wives boyfriends ## 7 1 1 (Man) boyfriends girlfriends ## 8 1 1 (Man) planes hit ## 9 1 1 (Man) twin towers ## 10 1 1 (Man) phone calls ## # ℹ 920 more rows ``` --- ## Most common bigrams, again Hmm... ``` r bigrams_united %>% count(bigram, sort = TRUE) ``` ``` ## # A tibble: 708 × 2 ## bigram n ## <chr> <int> ## 1 NA NA 111 ## 2 prime minister 13 ## 3 bye bye 11 ## 4 baby baby 6 ## 5 merry christmas 6 ## 6 uncle jamie 5 ## 7 baby bye 4 ## 8 billy mack 4 ## 9 bye baby 4 ## 10 goodbye baby 4 ## # ℹ 698 more rows ``` --- ## Most common bigrams, again ``` r bigrams_united <- bigrams_united %>% filter(bigram != "NA NA") bigrams_united %>% count(bigram, sort = TRUE) ``` ``` ## # A tibble: 707 × 2 ## bigram n ## <chr> <int> ## 1 prime minister 13 ## 2 bye bye 11 ## 3 baby baby 6 ## 4 merry christmas 6 ## 5 uncle jamie 5 ## 6 baby bye 4 ## 7 billy mack 4 ## 8 bye baby 4 ## 9 goodbye baby 4 ## 10 joni mitchell 4 ## # ℹ 697 more rows ``` --- ## Visualizing the most common bigrams .panelset.sideways[ .panel[.panel-name[Code] ``` r bigrams_united %>% count(bigram, sort = TRUE) %>% slice_head(n = 10) %>% ggplot(aes(y = fct_reorder(bigram, n), x = n)) + geom_col(fill = "#BD2D2A") + labs( x = "Count", y = NULL, * title = "<span style = 'color:#808080'>Most common bigrams in</span>", * subtitle ="<span style = 'color:#BD2D2A;font-weight:strong'><b>love</b></span> actually" ) + theme( * plot.title = element_textbox_simple(halign = 1), * plot.subtitle = element_textbox_simple(size = 30, halign = 1), * plot.title.position = "plot" ) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-36-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Network of bigrams - We may be interested in visualizing all of the relationships among words simultaneously, rather than just the top few at a time - One approach is to arrange the words into a network (or "graph") - Here, "graph" is not in the sense of a visualization, but as a combination of connected nodes - A graph can be constructed from a tidy object since it has three variables: - **from**: the node an edge is coming from - **to**: the node an edge is going towards - **weight**: a numeric value associated with each edge --- ## From tidy data frame to graph Using `igraph::graph_from_data_frame()`: ``` r bigram_graph <- bigrams_united %>% separate(bigram, c("word1", "word2"), sep = " ") %>% count(word1, word2, sort = TRUE) %>% filter(n > 1) %>% # filter for bigrams that occur more than once graph_from_data_frame() bigram_graph ``` ``` ## IGRAPH 2c63c62 DN-- 83 62 -- ## + attr: name (v/c), n (e/n) ## + edges from 2c63c62 (vertex names): ## [1] prime ->minister bye ->bye baby ->baby ## [4] merry ->christmas uncle ->jamie baby ->bye ## [7] billy ->mack bye ->baby goodbye ->baby ## [10] joni ->mitchell uh ->huh er ->er ## [13] er ->natalie hate ->uncle love ->christmas ## [16] natalie ->live toes ->feel toes ->yeah ## [19] ah ->er ah ->natalie american ->girls ## [22] baby ->goodbye bad ->news boa ->noite ## + ... omitted several edges ``` --- ## Visualizing the network of bigrams Common bigrams in Love Actually that occurred more than once and where neither word was a stop word <img src="index_files/figure-html/unnamed-chunk-38-1.png" width="75%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 1. Set up the plot object with `ggraph::ggraph()`: ``` r set.seed(42) ggraph(bigram_graph) ``` <img src="index_files/figure-html/unnamed-chunk-39-1.png" width="75%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 2. Add edges ``` r set.seed(42) ggraph(bigram_graph) + geom_edge_link() ``` <img src="index_files/figure-html/unnamed-chunk-40-1.png" width="50%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 3. Add nodes ``` r set.seed(42) ggraph(bigram_graph) + geom_edge_link() + geom_node_point() ``` <img src="index_files/figure-html/unnamed-chunk-41-1.png" width="50%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 4. Add text ``` r set.seed(42) ggraph(bigram_graph) + geom_edge_link() + geom_node_point() + geom_node_text(aes(label = name)) ``` <img src="index_files/figure-html/unnamed-chunk-42-1.png" width="45%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 5. Consider layouts <img src="index_files/figure-html/unnamed-chunk-43-1.png" width="35%" /><img src="index_files/figure-html/unnamed-chunk-43-2.png" width="35%" /> <img src="index_files/figure-html/unnamed-chunk-44-1.png" width="35%" /><img src="index_files/figure-html/unnamed-chunk-44-2.png" width="35%" /> --- ## Visualizing the network of bigrams Step 6. Pick a layout ``` r set.seed(42) ggraph(bigram_graph, layout = "fr") + geom_edge_link() + geom_node_point() + geom_node_text(aes(label = name)) ``` <img src="index_files/figure-html/unnamed-chunk-45-1.png" width="50%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 7. Adjust text ``` r set.seed(42) ggraph(bigram_graph, layout = "fr") + geom_edge_link() + geom_node_point() + geom_node_text(aes(label = name), hjust = 1, vjust = 1) ``` <img src="index_files/figure-html/unnamed-chunk-46-1.png" width="50%" style="display: block; margin: auto;" /> --- ## Visualizing the network of bigrams Step 8. Add more context with `edge_alpha = n` .panelset.sideways[ .panel[.panel-name[Code] ``` r a <- grid::arrow(type = "closed", length = unit(0.1, "inches")) set.seed(42) ggraph(bigram_graph, layout = "fr") + geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.01, "inches")) + geom_node_point(color = "lightpink", size = 3) + geom_node_text(aes(label = name), vjust = 1, hjust = 1) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-47-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- # Application: Continue with your data * Find bigrams * Visualize Build on your prior data to explore word occurrences --- # Application: Office walk-through .panelset.sideways[ .panel[.panel-name[Code] ``` r a <- grid::arrow(type = "closed", length = unit(0.1, "inches")) set.seed(42) ggraph(bigram_graph_office, layout = "fr") + geom_edge_link(aes(edge_alpha = log(n)), show.legend = FALSE, arrow = a, end_cap = circle(0.01, "inches")) + geom_node_point(color = "#B3A6A3") + geom_node_text(aes(label = name), vjust = 1, hjust = 1) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-49-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- class: center, middle, inverse # Networks: character co-occurrence ### (you may not have the data to be able to do this) --- ## Main characters ``` r main_characters <- read_csv(here::here("13-visualize-text-network", "data/love-actually-cast.csv")) ``` .pull-left[ ``` r main_characters %>% slice(1:10) ``` ``` ## # A tibble: 10 × 2 ## speaker actor ## <chr> <chr> ## 1 PM Hugh Grant ## 2 Daniel Liam Neeson ## 3 Harry Alan Rickman ## 4 Jamie Colin Firth ## 5 Karen Emma Thompson ## 6 Colin Kris Marshall ## 7 Sam Thomas Sangster ## 8 Sarah Laura Linney ## 9 Natalie Martine McCutcheon ## 10 Billy Bill Nighy ``` ] .pull-right[ ``` r main_characters %>% slice(11:20) ``` ``` ## # A tibble: 10 × 2 ## speaker actor ## <chr> <chr> ## 1 Mark Andrew Lincoln ## 2 Aurelia Lúcia Moniz ## 3 Jack Martin Freeman ## 4 Juliet Keira Knightley ## 5 Karl Rodrigo Santoro ## 6 Mia Heike Makatsch ## 7 Judy Joanna Page ## 8 Tony Abdul Salis ## 9 Joe Gregor Fisher ## 10 Peter Chiwetel Ejiofor ``` ] --- ## Dialogues for main characters ``` r love_actually <- love_actually %>% inner_join(main_characters) %>% mutate(character = glue("{speaker} ({actor})")) dim(love_actually) ``` ``` ## [1] 748 6 ``` --- ## Lines-per-scene-per-character ``` r character_scene_counts <- love_actually %>% count(scene, character) character_scene_counts ``` ``` ## # A tibble: 162 × 3 ## scene character n ## <dbl> <glue> <int> ## 1 2 Billy (Bill Nighy) 5 ## 2 2 Joe (Gregor Fisher) 3 ## 3 3 Jamie (Colin Firth) 5 ## 4 4 Daniel (Liam Neeson) 3 ## 5 4 Karen (Emma Thompson) 6 ## 6 5 Colin (Kris Marshall) 4 ## 7 6 Jack (Martin Freeman) 2 ## 8 6 Judy (Joanna Page) 1 ## 9 7 Mark (Andrew Lincoln) 4 ## 10 7 Peter (Chiwetel Ejiofor) 4 ## # ℹ 152 more rows ``` --- ## Look who's talking to whom ``` r character_scene_counts %>% ggplot(aes(x = scene, y = character)) + geom_point() + geom_path(aes(group = scene)) ``` <img src="index_files/figure-html/unnamed-chunk-55-1.png" width="60%" style="display: block; margin: auto;" /> --- ## Identify monologues ``` r character_scene_counts %>% group_by(scene) %>% mutate(conversation = if_else(n() == 1, "monologue", "not monologue")) ``` ``` ## # A tibble: 162 × 4 ## # Groups: scene [76] ## scene character n conversation ## <dbl> <glue> <int> <chr> ## 1 2 Billy (Bill Nighy) 5 not monologue ## 2 2 Joe (Gregor Fisher) 3 not monologue ## 3 3 Jamie (Colin Firth) 5 monologue ## 4 4 Daniel (Liam Neeson) 3 not monologue ## 5 4 Karen (Emma Thompson) 6 not monologue ## 6 5 Colin (Kris Marshall) 4 monologue ## 7 6 Jack (Martin Freeman) 2 not monologue ## 8 6 Judy (Joanna Page) 1 not monologue ## 9 7 Mark (Andrew Lincoln) 4 not monologue ## 10 7 Peter (Chiwetel Ejiofor) 4 not monologue ## # ℹ 152 more rows ``` --- ## Look who's talking to whom, again .task[ What's happening in the last scene? ] .panelset.sideways[ .panel[.panel-name[Code] ``` r character_scene_counts %>% group_by(scene) %>% mutate(conversation = if_else(n() == 1, "monologue", "not monologue")) %>% filter(conversation == "not monologue") %>% ggplot(aes(x = scene, y = character)) + geom_point() + geom_path(aes(group = scene)) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-57-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Remove the last scene So that it doesn't look like everyone is connected to each other in the network ``` r character_scene_counts <- character_scene_counts %>% filter(scene != max(character_scene_counts$scene)) ``` --- ## Character scene "matrix" Each row is a character, each column is a scene .small[ ``` r character_scene_counts %>% pivot_wider( names_from = scene, values_from = n ) ``` ``` ## # A tibble: 20 × 76 ## character `2` `3` `4` `5` `6` `7` `8` `9` `10` `11` `12` ## <glue> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> ## 1 Billy (Bil… 5 NA NA NA NA NA NA NA NA NA NA ## 2 Joe (Grego… 3 NA NA NA NA NA NA NA NA NA NA ## 3 Jamie (Col… NA 5 NA NA NA NA NA NA 5 NA NA ## 4 Daniel (Li… NA NA 3 NA NA NA NA NA NA NA NA ## 5 Karen (Emm… NA NA 6 NA NA NA NA NA NA NA NA ## 6 Colin (Kri… NA NA NA 4 NA NA NA NA NA 10 5 ## 7 Jack (Mart… NA NA NA NA 2 NA NA NA NA NA NA ## 8 Judy (Joan… NA NA NA NA 1 NA NA NA NA NA NA ## 9 Mark (Andr… NA NA NA NA NA 4 NA 1 NA 1 NA ## 10 Peter (Chi… NA NA NA NA NA 4 NA 4 NA NA NA ## 11 Natalie (M… NA NA NA NA NA NA 4 NA NA NA NA ## 12 PM (Hugh G… NA NA NA NA NA NA 12 NA NA NA NA ## 13 Juliet (Ke… NA NA NA NA NA NA NA 1 NA NA NA ## 14 Tony (Abdu… NA NA NA NA NA NA NA NA NA NA 4 ## 15 Sarah (Lau… NA NA NA NA NA NA NA NA NA NA NA ## 16 Harry (Ala… NA NA NA NA NA NA NA NA NA NA NA ## 17 Karl (Rodr… NA NA NA NA NA NA NA NA NA NA NA ## 18 Mia (Heike… NA NA NA NA NA NA NA NA NA NA NA ## 19 Sam (Thoma… NA NA NA NA NA NA NA NA NA NA NA ## 20 Aurelia (L… NA NA NA NA NA NA NA NA NA NA NA ## # ℹ 64 more variables: `13` <int>, `14` <int>, `15` <int>, `16` <int>, ## # `17` <int>, `18` <int>, `19` <int>, `20` <int>, `21` <int>, `22` <int>, ## # `23` <int>, `24` <int>, `25` <int>, `26` <int>, `27` <int>, `28` <int>, ## # `29` <int>, `30` <int>, `31` <int>, `32` <int>, `33` <int>, `34` <int>, ## # `35` <int>, `36` <int>, `37` <int>, `38` <int>, `39` <int>, `40` <int>, ## # `42` <int>, `43` <int>, `44` <int>, `45` <int>, `46` <int>, `47` <int>, ## # `48` <int>, `49` <int>, `50` <int>, `51` <int>, `52` <int>, `53` <int>, … ``` ] --- ## Character scene "matrix" ... and each cell is a 0/1 depending on whether that combination ever occurred .small[ ``` r character_scene_occurences <- character_scene_counts %>% pivot_wider( names_from = scene, names_prefix = "s", values_from = n, values_fn = length, values_fill = 0 ) character_scene_occurences ``` ``` ## # A tibble: 20 × 76 ## character s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 ## <glue> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> ## 1 Billy (Bil… 1 0 0 0 0 0 0 0 0 0 0 ## 2 Joe (Grego… 1 0 0 0 0 0 0 0 0 0 0 ## 3 Jamie (Col… 0 1 0 0 0 0 0 0 1 0 0 ## 4 Daniel (Li… 0 0 1 0 0 0 0 0 0 0 0 ## 5 Karen (Emm… 0 0 1 0 0 0 0 0 0 0 0 ## 6 Colin (Kri… 0 0 0 1 0 0 0 0 0 1 1 ## 7 Jack (Mart… 0 0 0 0 1 0 0 0 0 0 0 ## 8 Judy (Joan… 0 0 0 0 1 0 0 0 0 0 0 ## 9 Mark (Andr… 0 0 0 0 0 1 0 1 0 1 0 ## 10 Peter (Chi… 0 0 0 0 0 1 0 1 0 0 0 ## 11 Natalie (M… 0 0 0 0 0 0 1 0 0 0 0 ## 12 PM (Hugh G… 0 0 0 0 0 0 1 0 0 0 0 ## 13 Juliet (Ke… 0 0 0 0 0 0 0 1 0 0 0 ## 14 Tony (Abdu… 0 0 0 0 0 0 0 0 0 0 1 ## 15 Sarah (Lau… 0 0 0 0 0 0 0 0 0 0 0 ## 16 Harry (Ala… 0 0 0 0 0 0 0 0 0 0 0 ## 17 Karl (Rodr… 0 0 0 0 0 0 0 0 0 0 0 ## 18 Mia (Heike… 0 0 0 0 0 0 0 0 0 0 0 ## 19 Sam (Thoma… 0 0 0 0 0 0 0 0 0 0 0 ## 20 Aurelia (L… 0 0 0 0 0 0 0 0 0 0 0 ## # ℹ 64 more variables: s13 <int>, s14 <int>, s15 <int>, s16 <int>, s17 <int>, ## # s18 <int>, s19 <int>, s20 <int>, s21 <int>, s22 <int>, s23 <int>, ## # s24 <int>, s25 <int>, s26 <int>, s27 <int>, s28 <int>, s29 <int>, ## # s30 <int>, s31 <int>, s32 <int>, s33 <int>, s34 <int>, s35 <int>, ## # s36 <int>, s37 <int>, s38 <int>, s39 <int>, s40 <int>, s42 <int>, ## # s43 <int>, s44 <int>, s45 <int>, s46 <int>, s47 <int>, s48 <int>, ## # s49 <int>, s50 <int>, s51 <int>, s52 <int>, s53 <int>, s54 <int>, … ``` ] --- ## Actually as a matrix .small[ ``` r character_scene_matrix <- character_scene_occurences %>% column_to_rownames(var = "character") %>% as.matrix() character_scene_matrix ``` ``` ## s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 ## Billy (Bill Nighy) 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Joe (Gregor Fisher) 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 1 0 0 0 0 0 0 1 0 0 0 0 0 ## Daniel (Liam Neeson) 0 0 1 0 0 0 0 0 0 0 0 0 1 0 ## Karen (Emma Thompson) 0 0 1 0 0 0 0 0 0 0 0 0 0 0 ## Colin (Kris Marshall) 0 0 0 1 0 0 0 0 0 1 1 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 1 0 0 0 0 0 0 1 0 0 ## Judy (Joanna Page) 0 0 0 0 1 0 0 0 0 0 0 1 0 0 ## Mark (Andrew Lincoln) 0 0 0 0 0 1 0 1 0 1 0 0 0 1 ## Peter (Chiwetel Ejiofor) 0 0 0 0 0 1 0 1 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ## PM (Hugh Grant) 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ## Juliet (Keira Knightley) 0 0 0 0 0 0 0 1 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 0 0 0 0 0 0 0 0 0 1 0 0 0 ## Sarah (Laura Linney) 0 0 0 0 0 0 0 0 0 0 0 0 0 1 ## Harry (Alan Rickman) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Karl (Rodrigo Santoro) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Mia (Heike Makatsch) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Sam (Thomas Sangster) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## Aurelia (Lúcia Moniz) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## s16 s17 s18 s19 s20 s21 s22 s23 s24 s25 s26 s27 ## Billy (Bill Nighy) 0 1 0 0 0 0 0 0 0 0 0 0 ## Joe (Gregor Fisher) 0 1 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 0 0 0 0 0 0 0 0 0 1 0 ## Daniel (Liam Neeson) 0 0 0 0 0 0 0 1 1 0 0 0 ## Karen (Emma Thompson) 0 0 0 0 0 0 0 1 0 0 0 0 ## Colin (Kris Marshall) 0 0 0 0 0 1 0 0 0 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 1 0 0 0 0 0 0 0 ## Judy (Joanna Page) 0 0 0 0 1 0 0 0 0 0 0 0 ## Mark (Andrew Lincoln) 0 0 0 0 0 0 0 0 0 0 0 0 ## Peter (Chiwetel Ejiofor) 0 0 0 0 0 0 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 0 1 0 0 0 0 0 0 0 1 ## PM (Hugh Grant) 0 0 1 1 0 0 0 0 0 0 0 1 ## Juliet (Keira Knightley) 0 0 0 0 0 0 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 0 0 0 0 1 0 0 0 0 0 0 ## Sarah (Laura Linney) 1 0 0 0 0 0 0 0 0 1 0 0 ## Harry (Alan Rickman) 1 0 0 0 0 0 1 0 0 0 0 0 ## Karl (Rodrigo Santoro) 1 0 0 0 0 0 0 0 0 1 0 0 ## Mia (Heike Makatsch) 1 0 0 0 0 0 1 0 0 0 0 0 ## Sam (Thomas Sangster) 0 0 0 0 0 0 0 0 1 0 0 0 ## Aurelia (Lúcia Moniz) 0 0 0 0 0 0 0 0 0 0 0 0 ## s28 s29 s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 ## Billy (Bill Nighy) 0 1 0 0 0 0 0 0 0 0 0 0 ## Joe (Gregor Fisher) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 0 0 0 1 1 0 0 0 0 1 1 ## Daniel (Liam Neeson) 1 0 0 0 0 0 0 0 0 0 0 0 ## Karen (Emma Thompson) 0 0 0 0 0 0 0 0 1 0 0 0 ## Colin (Kris Marshall) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 0 0 0 0 0 0 0 0 ## Judy (Joanna Page) 0 0 0 0 0 0 0 0 0 0 0 0 ## Mark (Andrew Lincoln) 0 0 1 0 0 0 0 0 0 0 0 0 ## Peter (Chiwetel Ejiofor) 0 0 1 0 0 0 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 0 0 0 0 1 0 0 0 0 0 ## PM (Hugh Grant) 0 0 0 0 0 0 1 1 1 1 0 0 ## Juliet (Keira Knightley) 0 0 1 0 0 0 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 0 0 0 0 0 0 0 0 0 0 0 ## Sarah (Laura Linney) 0 0 0 1 0 0 0 0 0 0 0 0 ## Harry (Alan Rickman) 0 0 0 1 0 0 0 0 1 0 0 0 ## Karl (Rodrigo Santoro) 0 0 0 0 0 0 0 0 0 0 0 0 ## Mia (Heike Makatsch) 0 0 0 1 0 0 0 0 0 0 0 0 ## Sam (Thomas Sangster) 1 0 0 0 0 0 0 0 0 0 0 0 ## Aurelia (Lúcia Moniz) 0 0 0 0 1 0 0 0 0 0 1 1 ## s40 s42 s43 s44 s45 s46 s47 s48 s49 s50 s51 s52 ## Billy (Bill Nighy) 0 0 0 0 0 0 0 1 0 0 0 0 ## Joe (Gregor Fisher) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 0 0 0 1 0 0 0 0 0 0 0 ## Daniel (Liam Neeson) 0 0 1 0 0 1 0 0 0 0 0 0 ## Karen (Emma Thompson) 0 0 0 0 0 0 1 0 1 0 1 0 ## Colin (Kris Marshall) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 0 0 0 0 0 0 0 0 ## Judy (Joanna Page) 0 0 0 0 0 0 0 0 0 0 0 0 ## Mark (Andrew Lincoln) 1 0 0 0 0 0 0 0 0 0 0 0 ## Peter (Chiwetel Ejiofor) 0 0 0 0 0 0 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 0 0 0 0 0 0 0 0 0 0 ## PM (Hugh Grant) 0 1 0 1 0 0 0 0 0 0 0 0 ## Juliet (Keira Knightley) 1 0 0 0 0 0 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 0 0 0 0 0 0 0 0 0 0 0 ## Sarah (Laura Linney) 0 0 0 0 0 0 0 0 1 1 0 1 ## Harry (Alan Rickman) 0 0 0 0 0 0 1 0 0 0 1 0 ## Karl (Rodrigo Santoro) 0 0 0 0 0 0 0 0 1 1 0 0 ## Mia (Heike Makatsch) 0 0 0 0 0 0 1 0 0 0 0 0 ## Sam (Thomas Sangster) 0 0 1 0 0 1 0 0 0 0 0 0 ## Aurelia (Lúcia Moniz) 0 0 0 0 1 0 0 0 0 0 0 0 ## s53 s54 s55 s56 s57 s58 s59 s60 s61 s62 s63 s64 ## Billy (Bill Nighy) 0 0 0 0 0 0 0 1 0 0 0 0 ## Joe (Gregor Fisher) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 0 0 1 0 0 0 0 0 1 0 0 ## Daniel (Liam Neeson) 0 0 0 0 0 0 1 0 0 0 0 1 ## Karen (Emma Thompson) 1 0 0 1 0 1 0 0 0 0 0 0 ## Colin (Kris Marshall) 0 1 0 0 1 0 0 0 0 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 0 0 0 0 1 0 0 0 ## Judy (Joanna Page) 0 0 1 0 0 0 0 0 1 0 0 0 ## Mark (Andrew Lincoln) 0 0 0 0 0 0 0 0 0 0 0 0 ## Peter (Chiwetel Ejiofor) 0 0 0 0 0 0 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 0 0 0 0 0 0 0 0 0 0 ## PM (Hugh Grant) 0 0 0 0 0 0 0 0 0 0 0 0 ## Juliet (Keira Knightley) 0 0 0 0 0 0 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 1 0 0 1 0 0 0 0 0 0 0 ## Sarah (Laura Linney) 0 0 0 0 0 0 0 0 0 0 1 0 ## Harry (Alan Rickman) 1 0 0 1 0 1 0 0 0 0 0 0 ## Karl (Rodrigo Santoro) 0 0 0 0 0 0 0 0 0 0 1 0 ## Mia (Heike Makatsch) 1 0 0 0 0 0 0 0 0 0 0 0 ## Sam (Thomas Sangster) 0 0 0 0 0 0 1 0 0 0 0 1 ## Aurelia (Lúcia Moniz) 0 0 0 0 0 0 0 0 0 0 0 0 ## s65 s66 s67 s68 s69 s70 s71 s72 s73 s74 s75 s77 ## Billy (Bill Nighy) 0 1 0 0 0 0 0 0 0 0 0 0 ## Joe (Gregor Fisher) 0 1 0 0 0 0 0 0 0 0 0 0 ## Jamie (Colin Firth) 0 0 0 0 0 0 0 0 0 1 0 0 ## Daniel (Liam Neeson) 0 0 0 0 0 0 0 0 1 0 1 0 ## Karen (Emma Thompson) 0 0 0 0 0 1 0 1 0 0 0 0 ## Colin (Kris Marshall) 0 0 0 0 0 0 0 0 0 0 0 0 ## Jack (Martin Freeman) 0 0 0 0 1 0 0 0 0 0 0 0 ## Judy (Joanna Page) 0 0 0 0 1 0 0 0 0 0 0 0 ## Mark (Andrew Lincoln) 1 0 0 0 0 0 0 0 0 0 0 0 ## Peter (Chiwetel Ejiofor) 1 0 0 0 0 0 0 0 0 0 0 0 ## Natalie (Martine McCutcheon) 0 0 1 1 1 1 1 0 0 0 0 0 ## PM (Hugh Grant) 0 0 1 1 1 1 1 0 0 0 0 0 ## Juliet (Keira Knightley) 1 0 0 0 0 0 0 0 0 0 0 0 ## Tony (Abdul Salis) 0 0 0 0 0 0 0 0 0 0 0 0 ## Sarah (Laura Linney) 0 0 0 0 0 0 0 0 0 0 0 0 ## Harry (Alan Rickman) 0 0 0 0 0 0 0 1 0 0 0 0 ## Karl (Rodrigo Santoro) 0 0 0 0 0 0 0 0 0 0 0 0 ## Mia (Heike Makatsch) 0 0 1 0 0 0 0 0 0 0 0 0 ## Sam (Thomas Sangster) 0 0 0 0 1 0 0 0 1 0 1 1 ## Aurelia (Lúcia Moniz) 0 0 0 0 0 0 0 0 0 0 0 0 ## s78 ## Billy (Bill Nighy) 0 ## Joe (Gregor Fisher) 0 ## Jamie (Colin Firth) 1 ## Daniel (Liam Neeson) 0 ## Karen (Emma Thompson) 0 ## Colin (Kris Marshall) 0 ## Jack (Martin Freeman) 0 ## Judy (Joanna Page) 0 ## Mark (Andrew Lincoln) 0 ## Peter (Chiwetel Ejiofor) 0 ## Natalie (Martine McCutcheon) 0 ## PM (Hugh Grant) 0 ## Juliet (Keira Knightley) 0 ## Tony (Abdul Salis) 0 ## Sarah (Laura Linney) 0 ## Harry (Alan Rickman) 0 ## Karl (Rodrigo Santoro) 0 ## Mia (Heike Makatsch) 0 ## Sam (Thomas Sangster) 0 ## Aurelia (Lúcia Moniz) 1 ``` ] --- ## Calculate co-occurrences .small[ ``` r cooccur <- character_scene_matrix %*% t(character_scene_matrix) cooccur ``` ``` ## Billy (Bill Nighy) Joe (Gregor Fisher) ## Billy (Bill Nighy) 6 3 ## Joe (Gregor Fisher) 3 3 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 0 0 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 0 0 ## PM (Hugh Grant) 0 0 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 0 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Jamie (Colin Firth) Daniel (Liam Neeson) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 12 0 ## Daniel (Liam Neeson) 0 11 ## Karen (Emma Thompson) 1 2 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 0 0 ## PM (Hugh Grant) 0 0 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 1 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 0 8 ## Aurelia (Lúcia Moniz) 5 0 ## Karen (Emma Thompson) Colin (Kris Marshall) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 1 0 ## Daniel (Liam Neeson) 2 0 ## Karen (Emma Thompson) 11 0 ## Colin (Kris Marshall) 0 6 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 0 1 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 1 0 ## PM (Hugh Grant) 2 0 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 4 ## Sarah (Laura Linney) 1 0 ## Harry (Alan Rickman) 7 0 ## Karl (Rodrigo Santoro) 1 0 ## Mia (Heike Makatsch) 2 0 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Jack (Martin Freeman) Judy (Joanna Page) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 0 0 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 5 5 ## Judy (Joanna Page) 5 6 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 1 1 ## PM (Hugh Grant) 1 1 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 0 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 1 1 ## Aurelia (Lúcia Moniz) 0 0 ## Mark (Andrew Lincoln) Peter (Chiwetel Ejiofor) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 0 0 ## Colin (Kris Marshall) 1 0 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 7 4 ## Peter (Chiwetel Ejiofor) 4 4 ## Natalie (Martine McCutcheon) 0 0 ## PM (Hugh Grant) 0 0 ## Juliet (Keira Knightley) 4 3 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 1 0 ## Harry (Alan Rickman) 0 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Natalie (Martine McCutcheon) PM (Hugh Grant) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 1 2 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 1 1 ## Judy (Joanna Page) 1 1 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 9 9 ## PM (Hugh Grant) 9 15 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 0 1 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 1 1 ## Sam (Thomas Sangster) 1 1 ## Aurelia (Lúcia Moniz) 0 0 ## Juliet (Keira Knightley) Tony (Abdul Salis) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 0 0 ## Colin (Kris Marshall) 0 4 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 4 0 ## Peter (Chiwetel Ejiofor) 3 0 ## Natalie (Martine McCutcheon) 0 0 ## PM (Hugh Grant) 0 0 ## Juliet (Keira Knightley) 4 0 ## Tony (Abdul Salis) 0 4 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 0 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Sarah (Laura Linney) Harry (Alan Rickman) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 1 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 1 7 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 1 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 0 0 ## PM (Hugh Grant) 0 1 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 8 2 ## Harry (Alan Rickman) 2 10 ## Karl (Rodrigo Santoro) 5 1 ## Mia (Heike Makatsch) 2 5 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Karl (Rodrigo Santoro) Mia (Heike Makatsch) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 0 ## Daniel (Liam Neeson) 0 0 ## Karen (Emma Thompson) 1 2 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 0 0 ## Judy (Joanna Page) 0 0 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 0 1 ## PM (Hugh Grant) 0 1 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 5 2 ## Harry (Alan Rickman) 1 5 ## Karl (Rodrigo Santoro) 5 1 ## Mia (Heike Makatsch) 1 6 ## Sam (Thomas Sangster) 0 0 ## Aurelia (Lúcia Moniz) 0 0 ## Sam (Thomas Sangster) Aurelia (Lúcia Moniz) ## Billy (Bill Nighy) 0 0 ## Joe (Gregor Fisher) 0 0 ## Jamie (Colin Firth) 0 5 ## Daniel (Liam Neeson) 8 0 ## Karen (Emma Thompson) 0 0 ## Colin (Kris Marshall) 0 0 ## Jack (Martin Freeman) 1 0 ## Judy (Joanna Page) 1 0 ## Mark (Andrew Lincoln) 0 0 ## Peter (Chiwetel Ejiofor) 0 0 ## Natalie (Martine McCutcheon) 1 0 ## PM (Hugh Grant) 1 0 ## Juliet (Keira Knightley) 0 0 ## Tony (Abdul Salis) 0 0 ## Sarah (Laura Linney) 0 0 ## Harry (Alan Rickman) 0 0 ## Karl (Rodrigo Santoro) 0 0 ## Mia (Heike Makatsch) 0 0 ## Sam (Thomas Sangster) 10 0 ## Aurelia (Lúcia Moniz) 0 5 ``` ] --- ## Convert to a graph Using `igraph::graph_from_adjacency_matrix()`: .small[ ``` r cooccur_graph <- graph_from_adjacency_matrix( cooccur, weighted = TRUE, mode = "undirected", diag = FALSE # don't include diagonals ) cooccur_graph ``` ``` ## IGRAPH 84230bf UNW- 20 37 -- ## + attr: name (v/c), weight (e/n) ## + edges from 84230bf (vertex names): ## [1] Billy (Bill Nighy) --Joe (Gregor Fisher) ## [2] Jamie (Colin Firth) --Karen (Emma Thompson) ## [3] Jamie (Colin Firth) --Harry (Alan Rickman) ## [4] Jamie (Colin Firth) --Aurelia (Lúcia Moniz) ## [5] Daniel (Liam Neeson) --Karen (Emma Thompson) ## [6] Daniel (Liam Neeson) --Sam (Thomas Sangster) ## [7] Karen (Emma Thompson)--Natalie (Martine McCutcheon) ## [8] Karen (Emma Thompson)--PM (Hugh Grant) ## + ... omitted several edges ``` ] --- ## Visualizing co-occurences .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(42) ggraph(cooccur_graph, layout = "fr") + geom_edge_link(color = "gray") + geom_node_point(color = "lightpink", size = 3) + geom_node_text(aes(label = name), hjust = 0.2, vjust = 1) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-64-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ### Co-occurences: The Office .panelset.sideways[ .panel[.panel-name[Code] ``` r char_names <- c("michael", "jim", "dwight", "pam", "angela", "bob", "phyllis", "oscar", "creed", "kevin", "stanley", "andy", "kelly", "toby", "robert", "nellie", "todd", "jan", "carol", "ryan", "meredith", "holly", "david", "karen", "jo") character_scene_counts_office <- office %>% count(scene, speaker, season, episode) character_scene_counts_office %>% filter(speaker %in% str_to_sentence(char_names)) %>% filter(season == 1 & episode == 1) %>% ggplot(aes(x = scene, y = speaker)) + geom_point() + geom_path(aes(group = scene)) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-65-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ### Co-occurences: The Office ``` r character_scene_matrix_office <- character_scene_counts_office %>% filter(speaker %in% str_to_sentence(char_names)) %>% filter(season < 4) %>% group_by(season, episode, scene) %>% pivot_wider( names_from = c(season, episode, scene), names_prefix = "s", values_from = n, values_fn = length, values_fill = 0 ) %>% column_to_rownames(var = "speaker") %>% as.matrix() cooccur_office <- character_scene_matrix_office %*% t(character_scene_matrix_office) cooccur_office <- graph_from_adjacency_matrix( cooccur_office, weighted = TRUE, mode = "undirected", diag = FALSE ) # don't include diagonals ``` --- ### Co-occurences: The Office .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(42) ggraph(cooccur_office, layout = "fr") + geom_edge_link(color = "gray") + geom_node_point(color = "#2B152C", size = 3) + geom_node_text(aes(label = name), hjust = 0.2, vjust = 1, repel = TRUE) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-67-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- # `set.seed()` Play around setting different values for the seed for your visualization .panelset.sideways[ .panel[.panel-name[Code] ``` r set.seed(13) ggraph(cooccur_office, layout = "fr") + geom_edge_link(color = "gray") + geom_node_point(color = "#2B152C", size = 3) + geom_node_text(aes(label = name), hjust = 0.2, vjust = 1, repel = TRUE) ``` ] .panel[.panel-name[Plot] <img src="index_files/figure-html/unnamed-chunk-68-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- # Recap * WORDCLOUDS ARE NOT ANALYSIS * Think about your data structure for what makes sense for your analysis * Networks are good for connections * Language plays a key role when doing text analysis -- how do you think about what does/not have meaning? --- ## Acknowledgements - [Analyzing networks of characters in 'Love Actually'](http://varianceexplained.org/r/love-actually-network/) by David Robinson - [Text Mining with R](https://www.tidytextmining.com/) by Julia Silge and David Robinson - [Office Dataset](https://www.kaggle.com/datasets/nehaprabhavalkar/the-office-dataset/versions/5?resource=download)