diff --git a/EFunkhouser.MP3_pdf.pdf b/EFunkhouser.MP3_pdf.pdf new file mode 100644 index 0000000..5ef4945 Binary files /dev/null and b/EFunkhouser.MP3_pdf.pdf differ diff --git a/get_booktext.py b/get_booktext.py new file mode 100644 index 0000000..45cf026 --- /dev/null +++ b/get_booktext.py @@ -0,0 +1,34 @@ +''' Import code for textmining project. ''' +from pattern.web import * +from os.path import exists +import pickle + +def get_booktext(url_string,file_name): + ''' Given the url of the plaintext book from the Gutenberg Project, saves it as a pickled file. + Inputs: + url_string: String. address of plaintext book text. + file_name: String. desired file name of the pickled file. To be saved in ~/TextMining directory. + Outputs: + None. Just writes the pickle to the given file_name.''' + + if exists(file_name) == True: + print "You've already downloaded the dang book. Returning None." + return None + else: + f = open(file_name,'w') + full_text = URL(url_string).download() #very long string + f.write(full_text) + f.close() + print 'Done!' 
# The import/download calls: fetch each source novel from a Gutenberg
# mirror. get_booktext is a no-op when the file already exists locally.
BOOK_DOWNLOADS = [
    ('http://eremita.di.uminho.pt/gutenberg/2/4/241/241.txt', 'Clotelle.txt'),
    ('http://www.gutenberg.org/cache/epub/46160/pg46160.txt', 'Malaeska.txt'),
    ('http://gutenberg.readingroo.ms/3/1/8/6/31869/31869.txt', 'Lamplighter.txt'),
    ('http://gutenberg.readingroo.ms/2/7/0/2701/2701.txt', 'MobyDick.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/33/33.txt', 'ScarletLetter.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/2/0/203/203.txt', 'UncleToms.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/1/2/1/11214/11214.txt', 'Garies.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/5/8/584/584.txt', 'OurNig.txt'),
]

for book_url, book_file in BOOK_DOWNLOADS:
    get_booktext(book_url, book_file)
def plot_storyline(book_file_name, plot_name='test-plot'):
    '''Compute a smoothed sentiment curve (-1 to 1) over the length of a book.

    Inputs:
        book_file_name: String, e.g. 'Clotelle.txt'.
        plot_name: String. Plot title used only by the (disabled) direct
            plotting path; kept for interface compatibility.
    Outputs:
        [x, y] where x is a list of positions scaled onto 0..1000 and y is a
        pandas Series of rolling-mean sentiment scores.'''

    # Default window size of 8 sentences, scaled up for longer books.
    sentiment_list, scaling_coeff = sliding_window_sentiment(book_file_name)

    sentiment_series = pandas.Series(sentiment_list)

    # Smooth the raw per-window scores. pandas.rolling_mean() was removed
    # from pandas; Series.rolling(...).mean() is the supported equivalent.
    roll_mean_sentiment = sentiment_series.rolling(80 * scaling_coeff).mean()

    # Scale the x axis onto a standard 0..1000 span so books of different
    # lengths can be overlaid on one plot.
    x = scale_xrange(range(len(roll_mean_sentiment)))
    y = roll_mean_sentiment
    return [x, y]


def scale_xrange(xrange):
    '''Rescale a sequence of positions so it spans 0 to 1000.

    Input: xrange, an indexable sequence of numbers, e.g. [0, 1, 2]
    Output: list of floats, e.g. [0.0, 500.0, 1000.0]. An empty input
    yields []; an input ending in 0 (e.g. [0]) maps every point to 0.0
    instead of dividing by zero.'''
    if not xrange:
        return []
    last = xrange[-1]
    if last == 0:
        # Degenerate span: nothing to stretch, pin everything at the origin.
        return [0.0 for _ in xrange]
    factor = 1000.0 / last
    return [factor * index for index in xrange]
def sliding_window_sentiment(book_file_name, windowSize = 8):
    '''Score sentiment over a sliding window of sentences across a book.

    Loads the book's sentence list, scales the window size up for longer
    books, then scores each consecutive window, advancing one sentence at
    a time.

    Inputs:
        book_file_name: String, passed to list_of_all_sentences.
        windowSize: int. Base sentences per window before scaling.
    Outputs:
        [sentiment_scores, scaling_coeff]: the per-window sentiment list and
        the length-based multiplier (the caller also uses the coefficient
        for its rolling-mean width).'''

    sentence_list = list_of_all_sentences(book_file_name)

    # 900 is roughly the sentence count of Our Nig, the shortest book in
    # the corpus. Clamp to at least 1: the original's plain // gave 0 for
    # shorter books, which produced empty windows here and a zero-width
    # rolling mean (80 * 0) in the caller.
    scaling_coeff = max(1, len(sentence_list) // 900)
    window = scaling_coeff * windowSize

    sentiment_scores = []
    for start in range(len(sentence_list) - (window - 1)):
        # Re-join the window's sentences so measure_sentiment sees prose.
        window_text = '. '.join(sentence_list[start:start + window])
        sentiment_scores.append(measure_sentiment(window_text))

    return [sentiment_scores, scaling_coeff]
def list_of_all_sentences(book_file_name):
    '''Return the sequential list of every sentence in a book file.

    Results are cached: on the first call the book text is split into
    sentences and the list is pickled next to the book file; subsequent
    calls simply unpickle and return the cached list.

    Inputs:
        book_file_name: String, e.g. 'Clotelle.txt'.
    Outputs:
        list of str: sentences in order of appearance.'''

    book_name = book_file_name[:-4]  # strip the '.txt' extension
    pickled_book_name = '%s_pickled_sentence_list.txt' % (book_name)

    if exists(pickled_book_name):
        # NOTE(review): unpickling can execute arbitrary code; acceptable
        # here only because this cache file is always written by us below.
        # Binary mode is required: pickle data is bytes, not text.
        with open(pickled_book_name, 'rb') as cache:
            return pickle.load(cache)

    with open(book_file_name, 'r') as book:
        book_text = book.read()
    # Crude sentence split kept from the original: drops the '. '
    # separators and ignores '!'/'?' sentence endings.
    sentence_list = book_text.split('. ')

    # Cache the list so future calls skip the split.
    with open(pickled_book_name, 'wb') as cache:
        pickle.dump(sentence_list, cache)
    return sentence_list


def measure_sentiment(sentences):
    '''Return the polarity of a string, from -1 (very negative) to 1 (positive).

    Uses pattern.en's sentiment(), discarding the subjectivity component.'''
    (sentim, subjectivity) = sentiment(sentences)
    return sentim
# (book file, display name, line color, dash style or None for solid).
# Red traces are the four black-authored novels, blue the four white-authored.
BOOK_TRACES = [
    ('Clotelle.txt', 'Clotelle', 'rgb(205, 12, 24)', None),
    ('Garies.txt', 'The Garies and Their Friends', 'rgb(205, 12, 24)', 'dash'),
    ('OurNig.txt', 'Our Nig', 'rgb(205, 12, 24)', 'dashdot'),
    ('UncleToms.txt', "Uncle Tom's Cabin", 'rgb(205, 12, 24)', 'dot'),
    ('Lamplighter.txt', 'The Lamplighter', 'rgb(22, 96, 167)', None),
    ('Malaeska.txt', 'Malaeska', 'rgb(22, 96, 167)', 'dash'),
    ('MobyDick.txt', 'Moby Dick', 'rgb(22, 96, 167)', 'dashdot'),
    ('ScarletLetter.txt', 'The Scarlet Letter', 'rgb(22, 96, 167)', 'dot'),
]


def plot_all_books():
    '''Compute each book's sentiment curve and publish the combined plot.

    BUG FIXED: the original script had every "xNyN = plot_storyline(...)"
    assignment commented out while the traces still referenced xNyN, which
    raised NameError at import. The calls are restored here, and the eight
    copy-pasted trace blocks are driven from the BOOK_TRACES table.'''
    # Local imports so this module can be imported (e.g. for the trim
    # helpers below) without plotly or the sibling analysis module.
    import plotly.plotly as plot
    import plotly.graph_objs as maketrace
    from one_book_sentiment import plot_storyline

    data = []
    for position, (file_name, title, color, dash) in enumerate(BOOK_TRACES):
        xy = plot_storyline(file_name)  # [x positions, sentiment Series]
        line_style = dict(color=color, width=4)
        if dash is not None:
            line_style['dash'] = dash
        data.append(maketrace.Scatter(x=xy[0], y=xy[1], name=title,
                                      line=line_style))
        if position == 3:
            print("Part 1 Done")
    print("Part 2 Done")

    layout = dict(title = 'Storyline Sentiment in Eight 1850s Novels: Contrasting black and white authors',
                  xaxis = dict(title = 'Progression through novel'),
                  yaxis = dict(title = 'Sentiment Value'),
                  )
    plot.iplot(dict(data=data, layout=layout), filename='allbooks-plot')


def remove_junk_beginning(textBody):
    '''Trim Gutenberg front matter off the start of a book.

    Case-insensitively looks for 'chapter 1' (preferred) or 'chapter i' and
    returns everything from that marker onward.
    Input: textBody, the full text of the book file.
    Output: the sliced text, or None (with a notice) when neither marker
    appears — i.e. the text was presumably trimmed already.'''
    index = textBody.lower().find('chapter 1')
    alt_index = textBody.lower().find('chapter i')

    if index >= 0:
        return textBody[index:]
    elif alt_index >= 0:
        return textBody[alt_index:]
    else:
        print("You've already trimmed that shiz down!")
        return None


def remove_junk_end(textBody):
    '''Trim Gutenberg back matter off the end of a book.

    Finds the first case-insensitive mention of 'gutenberg' (which only
    appears once the actual book content is over) and returns everything
    before it.
    Input: textBody, the text of the book file.
    Output: the sliced text, or None (with a notice) when 'gutenberg'
    never appears.'''
    index = textBody.lower().find('gutenberg')
    if index >= 0:
        return textBody[:index]
    else:
        print("You've already trimmed that shiz down!")
        return None


# Guard keeps the (slow, network/plotly-dependent) plotting work from
# running when this module is merely imported.
if __name__ == "__main__":
    plot_all_books()
def trim_booktext(file_name):
    '''Trim a Gutenberg book file down to just the novel body, in place.

    Removes everything before Chapter 1/I and everything after the first
    'Gutenberg' mention, then rewrites the file — so it only needs to run
    once per book.

    Input: file_name. String, e.g. 'Clotelle.txt'.
    Returns: None.'''
    with open(file_name, 'r') as f:
        full_text = f.read()

    # Each helper returns None when its marker is missing (already-trimmed
    # text). Fall back to the previous text instead of crashing: the
    # original passed None into remove_junk_end and then f.write(None).
    trimmed_text = remove_junk_beginning(full_text)
    if trimmed_text is None:
        trimmed_text = full_text
    trimmed_end = remove_junk_end(trimmed_text)
    if trimmed_end is not None:
        trimmed_text = trimmed_end

    # 'with' guarantees the handle is closed; the original wrote
    # 'f.close' without parentheses, which never actually closed it.
    with open(file_name, 'w') as f:
        f.write(trimmed_text)
    print('Done!')


BOOK_FILES = ['Clotelle.txt', 'Malaeska.txt', 'Lamplighter.txt', 'MobyDick.txt',
              'ScarletLetter.txt', 'UncleToms.txt', 'Garies.txt', 'OurNig.txt']

# Guard keeps the batch trim from running (and failing on missing book
# files) when this module is merely imported.
if __name__ == "__main__":
    # Trim down the books to just the body, no extra junk.
    for fileName in BOOK_FILES:
        trim_booktext(fileName)