Second session of the R meetup for Digital Humanists main page previous next
American University of Beirut
Last time we downloaded RStudio. We tried a few packages to see what you might be able to do. We also looked at Matt Jockers’ book Text Analysis with R for Students of Literature.
I asked participants to take the code from Jockers’ site and try it both for Moby Dick and for a few texts of their choice.
You might want to check out the top downloads at Project Gutenberg here.
Today we will work with Carroll’s Alice in Wonderland. It is found at Gutenberg here. A UTF-8 text file without the boilerplate is here.
HW for next meetup:
-pick two texts of your own
-identify boilerplate, notes, any particularities
-create a plot of the 20 most frequent words (MFW) using their relative frequencies in the text; label the plot (x and y axes, with the MFW shown along the x axis)
-dispersion plots with significant words
-wordcloud
# R meetup #2
# The code below is adapted from Jockers' Text Analysis with R.

# ---- Moby Dick: most frequent words (Jockers' example) ----

# Use a working directory so the relative path below resolves.
setwd("~/TextAnalysisWithR/TextAnalysisWithR")

# Read the local copy; sep = "\n" makes each line one element of the vector.
# NOTE(review): text_v is reassigned by the download just below, so this
# local read has no effect on the rest of the script -- confirm whether the
# two reads are meant as alternatives.
text_v <- scan("data/plainText/melville.txt", what = "character", sep = "\n")

# Pull in Moby Dick from Project Gutenberg.
text_v <- scan(
  "http://www.gutenberg.org/cache/epub/2701/pg2701.txt",
  what = "character",
  sep = "\n"
)

# Keep only the lines of the novel itself.
# NOTE(review): 408 and 18576 are fixed line numbers; presumably they were
# chosen for one particular copy of the file -- verify against text_v.
novel_v <- text_v[408:18576]

# Collapse to a single string, lowercase it, and split on non-word characters.
novel <- paste(novel_v, collapse = " ")
novel_lower <- tolower(novel)
moby_words_l <- strsplit(novel_lower, "\\W")
moby_words_v <- unlist(moby_words_l)

# Remove the empty strings left behind by the split.
# The guard matters: x[-integer(0)] selects nothing, so an unguarded removal
# would wipe out every word whenever no blanks are present.
blanks <- which(moby_words_v == "")
if (length(blanks) > 0) {
  moby_words_v <- moby_words_v[-blanks]
}

# Count each word, sort by descending count, and convert to relative
# frequencies (count divided by the total number of words).
moby_counts <- table(moby_words_v)
sorted_moby_t <- sort(moby_counts, decreasing = TRUE)
moby_freq_t <- sorted_moby_t / length(moby_words_v)

# Show and plot the ten most frequent words.
moby_freq_t[1:10]
plot(moby_freq_t[1:10], type = "l")
# ---- Dispersion plots for "whale" and "ahab" ----

# Positions in the word vector where each word occurs.
whale_positions_v <- which(moby_words_v == "whale")
ahab_positions_v <- which(moby_words_v == "ahab")

# One marker per word in the novel: 0 everywhere, 1 where the word occurs.
dispersion <- replace(numeric(length(moby_words_v)), whale_positions_v, 1)
dispersion_a <- replace(numeric(length(moby_words_v)), ahab_positions_v, 1)

# Draw both plots in a two-row, one-column layout ("whale" first, then "ahab").
par(mfrow = c(2, 1))
plot(dispersion, type = "h")
plot(dispersion_a, type = "h")
# OR use plain text from the Internet
# Be sure to explore the nature of your text before analyzing it.

# Arabian Nights Lane / Pool vol 1
text_AN <- scan(
  "http://www.gutenberg.org/files/34206/34206-0.txt",
  what = "character",
  sep = "\n"
)
# text has 22641 lines (confirm with length(text_AN))
# see the text (text begins at 968)
text_AN[1:1000]
# where does the text end?
text_AN[22000:22641]

# Alice in Wonderland (starts at 13, ends at 2494, 2790 lines long)
# The session notes describe this file as UTF-8, so mark the strings as UTF-8;
# otherwise non-ASCII characters can be misread in a non-UTF-8 locale and end
# up glued to words when the text is later split.
text_AW <- scan(
  "http://djwrisley.com/publicdata/Alice.txt",
  what = "character",
  sep = "\n",
  encoding = "UTF-8"
)
text_AW[1:500]
text_AW[2400:2790]
text_AW[2495]

# assign the part of the text you want to analyze
# NOTE(review): the whole vector is kept here even though the note above gives
# a 13-2494 range -- confirm whether text_AW[13:2494] was intended.
novel_AW <- text_AW
novel_AW

# try this with a text of your choice from Project Gutenberg
# check the length
length(text_AW)
length(novel_AW)
class(novel_AW)
# join all lines of the text into one long string
novel <- paste(novel_AW, collapse = " ")

# make it all lowercase
novel_lower <- tolower(novel)

# split on non-word characters; strsplit() returns a list
alice_words_l <- strsplit(novel_lower, "\\W")

# check what kind of class the variable is
class(alice_words_l)

# unlist - make the list into a vector
alice_words_v <- unlist(alice_words_l)

# define blanks and remove them (note that you are overwriting here!)
# The guard matters: x[-integer(0)] selects nothing, so an unguarded removal
# would wipe out every word whenever no blanks are present.
blanks <- which(alice_words_v == "")
if (length(blanks) > 0) {
  alice_words_v <- alice_words_v[-blanks]
}

# create a table of the words in descending order
alice_counts <- table(alice_words_v)
sorted_alice_t <- sort(alice_counts, decreasing = TRUE)
sorted_alice_t
# Exercise: try to sort in ascending order.

# Relative frequency = word count divided by the total number of words.
alice_freq_t <- sorted_alice_t / length(alice_words_v)

# Top 10: print them, then draw them as a line plot.
alice_freq_t[1:10]
plot(alice_freq_t[1:10], type = "l")

# Look lower down in the list.
alice_freq_t[1:200]

# Try a different kind of plot (?plot for info).
?plot
plot(
  alice_freq_t[1:10],
  type = "b",
  xlab = "MFW",
  sub = "MFW in Alice in Wonderland"
)

# Label the ten x-axis positions with the words themselves.
axis(side = 1, at = 1:10, labels = names(alice_freq_t[1:10]))
# ---- Dispersion plots with Alice: "alice", "queen" and "rabbit" ----

# Positions in the word vector where each word occurs.
alice_positions_v <- which(alice_words_v == "alice")
rabbit_positions_v <- which(alice_words_v == "rabbit")
queen_positions_v <- which(alice_words_v == "queen")

# One marker per word in the text: 0 everywhere, 1 where the word occurs.
dispersion <- replace(numeric(length(alice_words_v)), alice_positions_v, 1)
dispersion_b <- replace(numeric(length(alice_words_v)), queen_positions_v, 1)
dispersion_a <- replace(numeric(length(alice_words_v)), rabbit_positions_v, 1)

# "alice" in a two-row layout.
par(mfrow = c(2, 1))
plot(dispersion, type = "h")

# NOTE(review): the layout is set again before the next two plots, which
# presumably starts a fresh page holding "queen" above "rabbit" -- confirm
# this is the intended arrangement.
par(mfrow = c(2, 1))
plot(dispersion_b, type = "h")
plot(dispersion_a, type = "h")

# Exercise: can you add labels to these dispersions?
# ---- Experiment: "down" versus "up" ----

# Positions in the word vector where each word occurs.
down_positions_v <- which(alice_words_v == "down")
up_positions_v <- which(alice_words_v == "up")

# One marker per word in the text: 0 everywhere, 1 where the word occurs.
dispersion <- replace(numeric(length(alice_words_v)), down_positions_v, 1)
dispersion_b <- replace(numeric(length(alice_words_v)), up_positions_v, 1)

# Two-row layout: "down" is drawn first, "up" second.
par(mfrow = c(2, 1))
plot(dispersion, type = "h")
plot(dispersion_b, type = "h")
# ---- Dispersion plot illustrating the relative usage of "gryphon" ----

# Positions in the word vector where "gryphon" occurs.
gryphon_positions_v <- which(alice_words_v == "gryphon")

# One marker per word in the text: 0 everywhere, 1 where "gryphon" occurs.
dispersion <- replace(numeric(length(alice_words_v)), gryphon_positions_v, 1)

# Two-row layout; only one plot is drawn into it here.
par(mfrow = c(2, 1))
plot(dispersion, type = "h")

# Try some of your own with your favorite text.