Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added EFunkhouser.MP3_pdf.pdf
Binary file not shown.
34 changes: 34 additions & 0 deletions get_booktext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
''' Import code for textmining project. '''
from pattern.web import *
from os.path import exists
import pickle

def get_booktext(url_string, file_name):
    ''' Given the url of the plaintext book from the Gutenberg Project, saves it to disk.

    Inputs:
        url_string: String. address of plaintext book text.
        file_name: String. desired file name. To be saved in ~/TextMining directory.
    Outputs:
        None. Just writes the downloaded text to file_name.

    Skips the download (and prints a notice) if file_name already exists,
    so re-running the import script is cheap.'''

    # Truthiness check instead of '== True'; exists() already returns a bool.
    if exists(file_name):
        print("You've already downloaded the dang book. Returning None.")
        return None

    full_text = URL(url_string).download()  # very long string
    # 'with' guarantees the file is closed even if write() raises.
    with open(file_name, 'w') as f:
        f.write(full_text)
    print('Done!')
    return None


# The import/download calls: (mirror url, local file name) for each novel.
_BOOK_SOURCES = [
    ('http://eremita.di.uminho.pt/gutenberg/2/4/241/241.txt', 'Clotelle.txt'),
    ('http://www.gutenberg.org/cache/epub/46160/pg46160.txt', 'Malaeska.txt'),
    ('http://gutenberg.readingroo.ms/3/1/8/6/31869/31869.txt', 'Lamplighter.txt'),
    ('http://gutenberg.readingroo.ms/2/7/0/2701/2701.txt', 'MobyDick.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/33/33.txt', 'ScarletLetter.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/2/0/203/203.txt', 'UncleToms.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/1/2/1/11214/11214.txt', 'Garies.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/5/8/584/584.txt', 'OurNig.txt'),
]

for _url, _file_name in _BOOK_SOURCES:
    get_booktext(_url, _file_name)
101 changes: 101 additions & 0 deletions one_book_sentiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from pattern.en import *
from os.path import exists
import pickle
import plotly.plotly as plot
import plotly.graph_objs as maketrace
import pandas

def plot_storyline(book_file_name, plot_name='test-plot'):
    ''' Computes the smoothed sentiment curve (-1 to 1) over the length of a book.

    Inputs:
        book_file_name: String, ie 'Clotelle.txt'
        plot_name: String; plotly file name. Only used by the commented-out
            single-book plotting code below; kept for interface compatibility.
    Output:
        [x, y] where x spans 0 to 1000 (progress through the novel) and y is
        the rolling-mean sentiment series (its first window-1 entries are NaN
        until the rolling window fills).'''

    # Implied window size of 8 sentences, scaled up for longer books.
    sentiment_list, scaling_coeff = sliding_window_sentiment(book_file_name)

    sentiment_series = pandas.Series(sentiment_list)

    # pandas.rolling_mean() was deprecated in 0.18 and removed in 0.23;
    # Series.rolling(window).mean() is the equivalent supported API (same
    # default min_periods). max(1, ...) keeps the window from collapsing to 0
    # when scaling_coeff is 0 for books shorter than ~900 sentences.
    window = max(1, 80 * scaling_coeff)
    roll_mean_sentiment = sentiment_series.rolling(window).mean()

    # Normalize x onto a standard 0..1000 axis so books of different lengths
    # can share one plot.
    x = scale_xrange(range(len(roll_mean_sentiment)))
    y = roll_mean_sentiment
    return [x, y]

    ## TO PLOT JUST ONE BOOK COMMENT OUT 'return' LINE AND UNCOMMENT BELOW:
    # trace = maketrace.Scatter( x = range(len(roll_mean_sentiment)), y = roll_mean_sentiment)
    # data = [trace]
    # plot.iplot(data, filename = plot_name)

def scale_xrange(xrange):
    ''' Returns a range of the same number of points rescaled to span 0 to 1000.

    Input: xrange, ie [0, 1, 2]  (any sequence of numbers starting at 0)
    Output: scaled list, ie [0.0, 500.0, 1000.0]

    Edge cases (the old version crashed on both):
        empty input  -> []
        last value 0 -> all points map to 0.0 (nothing to scale against)

    NOTE(review): the parameter shadows the Python 2 builtin 'xrange'; the
    name is kept so any keyword callers keep working.'''
    if not xrange:
        return []
    last = xrange[-1]
    if last == 0:
        # Degenerate single-point (or all-zero-anchor) range: avoid division
        # by zero and pin every point to the origin.
        return [0.0 for _ in xrange]
    scale = 1000.0 / last
    return [scale * value for value in xrange]


def sliding_window_sentiment(book_file_name, windowSize = 8):
    ''' This does a couple things.
    It gets the list of sentences in the book by calling list_of_all_sentences.
    It calculates a 'scaling coefficient' that makes the window size bigger for
    a particularly long book. Then it scores the sentiment of each window of
    consecutive sentences, sliding forward one sentence at a time.

    Inputs: book file name (so it can call list_of_all_sentences) and the
        default per-unit window size in sentences.
    Outputs: [sentiment_list, scaling_coeff] -- a sentiment score for every
        window position, and the scaling coefficient (>= 1).'''

    sentenceList = list_of_all_sentences(book_file_name)
    # Based on Our Nig, the shortest book (~900 sentences). max(1, ...) fixes
    # a bug where integer division gave 0 for books under 900 sentences,
    # collapsing the window to zero sentences and breaking downstream
    # rolling-mean smoothing.
    scaling_coeff = max(1, len(sentenceList) // 900)
    windowSize = scaling_coeff * windowSize

    sentiment_list = []

    for i in range(len(sentenceList) - (windowSize - 1)):
        # Rejoin with '. ' because list_of_all_sentences split on exactly that.
        window_text = '. '.join(sentenceList[i : i + windowSize])
        sentiment_list.append(measure_sentiment(window_text))

    return [sentiment_list, scaling_coeff]


def list_of_all_sentences(book_file_name):
    ''' Pass in the name of the book file (ie 'Clotelle.txt').

    Returns: sequential list of every sentence in the book (split on '. ').
    If that list has already been pickled, this function just unpickles and
    returns it. Otherwise it builds the list, pickles it for next time, and
    returns it.

    Example: a file containing "Hello. It is me. Goodbye" yields
    ['Hello', 'It is me', 'Goodbye'].
    '''

    book_name = book_file_name[:-4]  # take off '.txt'
    pickled_book_name = book_name + '_pickled_sentence_list.txt'

    if exists(pickled_book_name):
        # NOTE: unpickling is only safe because this module wrote the cache
        # file itself; never point this at untrusted data.
        with open(pickled_book_name, 'rb') as f:
            return pickle.load(f)

    # First, make a sequential list of every sentence in the book.
    with open(book_file_name, 'r') as f:
        book_text = f.read()
    sentence_list = book_text.split('. ')

    # Pickle it (binary mode -- pickle files are not text) for future use.
    with open(pickled_book_name, 'wb') as f:
        pickle.dump(sentence_list, f)
    return sentence_list


def measure_sentiment(sentences):
    ''' Returns the sentiment of a string, from -1 (very negative) to 1 (positive).

    The string should be one sentence (or one window of sentences) from the
    story. pattern.en's sentiment() yields (polarity, subjectivity); only the
    polarity component is returned.'''
    polarity, _subjectivity = sentiment(sentences)
    return polarity
48 changes: 48 additions & 0 deletions plot_all_books.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import plotly.plotly as plot
import plotly.graph_objs as maketrace
from one_book_sentiment import *

# Build one sentiment trace per novel, then plot all eight on a shared
# 0..1000 x-axis. The xNyN assignments were previously commented out, which
# left every traceN referencing an undefined name (NameError); they must run.
# Red traces (rgb(205,12,24)) and blue traces (rgb(22,96,167)) each use
# distinct dash styles so the two groups stay distinguishable.
x0y0 = plot_storyline('Clotelle.txt')
trace0 = maketrace.Scatter( x = x0y0[0], y = x0y0[1], name = 'Clotelle',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4))

x1y1 = plot_storyline('Garies.txt')
trace1 = maketrace.Scatter( x = x1y1[0], y = x1y1[1], name = 'The Garies and Their Friends',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dash'))

x2y2 = plot_storyline('OurNig.txt')
trace2 = maketrace.Scatter( x = x2y2[0], y = x2y2[1], name = 'Our Nig',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dashdot'))

x3y3 = plot_storyline('UncleToms.txt')
trace3 = maketrace.Scatter( x = x3y3[0], y = x3y3[1], name = "Uncle Tom's Cabin",
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dot'))

print("Part 1 Done")
###

x4y4 = plot_storyline('Lamplighter.txt')
trace4 = maketrace.Scatter( x = x4y4[0], y = x4y4[1], name = 'The Lamplighter',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4))

x5y5 = plot_storyline('Malaeska.txt')
trace5 = maketrace.Scatter( x = x5y5[0], y = x5y5[1], name = 'Malaeska',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dash'))

x6y6 = plot_storyline('MobyDick.txt')
trace6 = maketrace.Scatter( x = x6y6[0], y = x6y6[1], name = 'Moby Dick',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dashdot'))

x7y7 = plot_storyline('ScarletLetter.txt')
trace7 = maketrace.Scatter( x = x7y7[0], y = x7y7[1], name = 'The Scarlet Letter',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dot'))

print("Part 2 Done")
###
data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = dict(title = 'Storyline Sentiment in Eight 1850s Novels: Contrasting black and white authors',
              xaxis = dict(title = 'Progression through novel'),
              yaxis = dict(title = 'Sentiment Value'),
              )

plot.iplot(dict(data=data, layout=layout), filename='allbooks-plot')
59 changes: 59 additions & 0 deletions trim_booktext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pickle


def remove_junk_beginning(textBody):
    ''' Trims unnecessary stuff off the beginning of a gutenberg book.

    Looks case-insensitively for 'chapter 1' (preferred) or 'chapter i' and
    returns everything from that marker onward.

    Input: textBody, the full text of the book file.
    Returns: the sliced text, or None if neither marker appears (e.g. the
        text was already trimmed).'''

    # Lowercase once instead of twice; both searches use the same view.
    lowered = textBody.lower()
    index = lowered.find('chapter 1')
    alt_index = lowered.find('chapter i')

    if index >= 0:
        return textBody[index:]
    if alt_index >= 0:
        return textBody[alt_index:]
    print("You've already trimmed that shiz down!")
    return None

def remove_junk_end(textBody):
    ''' Trims unnecessary stuff off the end of a gutenberg book.

    Finds the first case-insensitive mention of 'gutenberg' (which appears
    once the actual book content is over) and returns everything before it.

    Input: textBody, the full text of the book file.
    Returns: the sliced text, or None if 'gutenberg' never appears (e.g. the
        text was already trimmed).'''
    index = textBody.lower().find('gutenberg')
    if index >= 0:
        return textBody[:index]
    print("You've already trimmed that shiz down!")
    return None


def trim_booktext(file_name):
    ''' Input book file name (string);
    this function trims off anything before Chapter 1 and after the end of the
    novel, re-saving the text file in place so you only need run it once.
    Returns None.

    If either trim helper reports there is nothing to trim (returns None),
    the file is left untouched. The old version truncated the file with
    open(..., 'w') and then crashed on f.write(None), destroying the book
    text on a repeat run. It also called 'f.close' without parentheses, so
    the output file was never explicitly closed.'''
    # Open er up
    with open(file_name, 'r') as f:
        full_text = f.read()

    # Process: delete everything before the start of chapter 1
    trimmed_text = remove_junk_beginning(full_text)
    if trimmed_text is None:
        return None
    trimmed_text = remove_junk_end(trimmed_text)
    if trimmed_text is None:
        return None

    # Rewrite the file w/trimmed version
    with open(file_name, 'w') as f:
        f.write(trimmed_text)
    print('Done!')


# Trim down the books to just the body, no extra junk (one-time cleanup pass).
_BOOK_FILES = ('Clotelle.txt', 'Malaeska.txt', 'Lamplighter.txt', 'MobyDick.txt',
               'ScarletLetter.txt', 'UncleToms.txt', 'Garies.txt', 'OurNig.txt')
for _book_file in _BOOK_FILES:
    trim_booktext(_book_file)