Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added EFunkhouser.MP3_pdf.pdf
Binary file not shown.
34 changes: 34 additions & 0 deletions get_booktext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
''' Import code for textmining project. '''
from pattern.web import *
from os.path import exists
import pickle

def get_booktext(url_string, file_name):
    ''' Given the url of the plaintext book from the Gutenberg Project, saves it to disk.

    Inputs:
        url_string: String. address of plaintext book text.
        file_name: String. desired file name. To be saved in ~/TextMining directory.
    Outputs:
        None. Just writes the downloaded text to file_name.

    Skips the download (and prints a notice) if file_name already exists,
    so re-running the import script is cheap.'''

    # Truthiness check instead of '== True'; exists() already returns a bool.
    if exists(file_name):
        print("You've already downloaded the dang book. Returning None.")
        return None

    full_text = URL(url_string).download()  # very long string
    # 'with' guarantees the file is closed even if write() raises.
    with open(file_name, 'w') as f:
        f.write(full_text)
    print('Done!')
    return None


# The import/download calls: (mirror url, local file name) for each novel.
_BOOK_SOURCES = [
    ('http://eremita.di.uminho.pt/gutenberg/2/4/241/241.txt', 'Clotelle.txt'),
    ('http://www.gutenberg.org/cache/epub/46160/pg46160.txt', 'Malaeska.txt'),
    ('http://gutenberg.readingroo.ms/3/1/8/6/31869/31869.txt', 'Lamplighter.txt'),
    ('http://gutenberg.readingroo.ms/2/7/0/2701/2701.txt', 'MobyDick.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/33/33.txt', 'ScarletLetter.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/2/0/203/203.txt', 'UncleToms.txt'),
    ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/1/2/1/11214/11214.txt', 'Garies.txt'),
    ('http://eremita.di.uminho.pt/gutenberg/5/8/584/584.txt', 'OurNig.txt'),
]

for _url, _file_name in _BOOK_SOURCES:
    get_booktext(_url, _file_name)
101 changes: 101 additions & 0 deletions one_book_sentiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from pattern.en import *
from os.path import exists
import pickle
import plotly.plotly as plot
import plotly.graph_objs as maketrace
import pandas

def plot_storyline(book_file_name, plot_name='test-plot'):
    ''' Computes the smoothed sentiment curve (-1 to 1) over the length of a book.

    Inputs:
        book_file_name: String, ie 'Clotelle.txt'
        plot_name: String; plotly file name. Only used by the commented-out
            single-book plotting code below; kept for interface compatibility.
    Output:
        [x, y] where x spans 0 to 1000 (progress through the novel) and y is
        the rolling-mean sentiment series (its first window-1 entries are NaN
        until the rolling window fills).'''

    # Implied window size of 8 sentences, scaled up for longer books.
    sentiment_list, scaling_coeff = sliding_window_sentiment(book_file_name)

    sentiment_series = pandas.Series(sentiment_list)

    # pandas.rolling_mean() was deprecated in 0.18 and removed in 0.23;
    # Series.rolling(window).mean() is the equivalent supported API (same
    # default min_periods). max(1, ...) keeps the window from collapsing to 0
    # when scaling_coeff is 0 for books shorter than ~900 sentences.
    window = max(1, 80 * scaling_coeff)
    roll_mean_sentiment = sentiment_series.rolling(window).mean()

    # Normalize x onto a standard 0..1000 axis so books of different lengths
    # can share one plot.
    x = scale_xrange(range(len(roll_mean_sentiment)))
    y = roll_mean_sentiment
    return [x, y]

    ## TO PLOT JUST ONE BOOK COMMENT OUT 'return' LINE AND UNCOMMENT BELOW:
    # trace = maketrace.Scatter( x = range(len(roll_mean_sentiment)), y = roll_mean_sentiment)
    # data = [trace]
    # plot.iplot(data, filename = plot_name)

def scale_xrange(xrange):
    ''' Returns a range of the same number of points rescaled to span 0 to 1000.

    Input: xrange, ie [0, 1, 2]  (any sequence of numbers starting at 0)
    Output: scaled list, ie [0.0, 500.0, 1000.0]

    Edge cases (the old version crashed on both):
        empty input  -> []
        last value 0 -> all points map to 0.0 (nothing to scale against)

    NOTE(review): the parameter shadows the Python 2 builtin 'xrange'; the
    name is kept so any keyword callers keep working.'''
    if not xrange:
        return []
    last = xrange[-1]
    if last == 0:
        # Degenerate single-point (or all-zero-anchor) range: avoid division
        # by zero and pin every point to the origin.
        return [0.0 for _ in xrange]
    scale = 1000.0 / last
    return [scale * value for value in xrange]


def sliding_window_sentiment(book_file_name, windowSize = 8):
    ''' This does a couple things.
    It gets the list of sentences in the book by calling list_of_all_sentences.
    It calculates a 'scaling coefficient' that makes the window size bigger for
    a particularly long book. Then it scores the sentiment of each window of
    consecutive sentences, sliding forward one sentence at a time.

    Inputs: book file name (so it can call list_of_all_sentences) and the
        default per-unit window size in sentences.
    Outputs: [sentiment_list, scaling_coeff] -- a sentiment score for every
        window position, and the scaling coefficient (>= 1).'''

    sentenceList = list_of_all_sentences(book_file_name)
    # Based on Our Nig, the shortest book (~900 sentences). max(1, ...) fixes
    # a bug where integer division gave 0 for books under 900 sentences,
    # collapsing the window to zero sentences and breaking downstream
    # rolling-mean smoothing.
    scaling_coeff = max(1, len(sentenceList) // 900)
    windowSize = scaling_coeff * windowSize

    sentiment_list = []

    for i in range(len(sentenceList) - (windowSize - 1)):
        # Rejoin with '. ' because list_of_all_sentences split on exactly that.
        window_text = '. '.join(sentenceList[i : i + windowSize])
        sentiment_list.append(measure_sentiment(window_text))

    return [sentiment_list, scaling_coeff]


def list_of_all_sentences(book_file_name):
    ''' Pass in the name of the book file (ie 'Clotelle.txt').

    Returns: sequential list of every sentence in the book (split on '. ').
    If that list has already been pickled, this function just unpickles and
    returns it. Otherwise it builds the list, pickles it for next time, and
    returns it.

    Example: a file containing "Hello. It is me. Goodbye" yields
    ['Hello', 'It is me', 'Goodbye'].
    '''

    book_name = book_file_name[:-4]  # take off '.txt'
    pickled_book_name = book_name + '_pickled_sentence_list.txt'

    if exists(pickled_book_name):
        # NOTE: unpickling is only safe because this module wrote the cache
        # file itself; never point this at untrusted data.
        with open(pickled_book_name, 'rb') as f:
            return pickle.load(f)

    # First, make a sequential list of every sentence in the book.
    with open(book_file_name, 'r') as f:
        book_text = f.read()
    sentence_list = book_text.split('. ')

    # Pickle it (binary mode -- pickle files are not text) for future use.
    with open(pickled_book_name, 'wb') as f:
        pickle.dump(sentence_list, f)
    return sentence_list


def measure_sentiment(sentences):
    ''' Returns the sentiment of a string, from -1 (very negative) to 1 (positive).

    The string should be one sentence (or one window of sentences) from the
    story. pattern.en's sentiment() yields (polarity, subjectivity); only the
    polarity component is returned.'''
    polarity, _subjectivity = sentiment(sentences)
    return polarity
48 changes: 48 additions & 0 deletions plot_all_books.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import plotly.plotly as plot
import plotly.graph_objs as maketrace
from one_book_sentiment import *

# Build one sentiment trace per novel, then plot all eight on a shared
# 0..1000 x-axis. The xNyN assignments were previously commented out, which
# left every traceN referencing an undefined name (NameError); they must run.
# Red traces (rgb(205,12,24)) and blue traces (rgb(22,96,167)) each use
# distinct dash styles so the two groups stay distinguishable.
x0y0 = plot_storyline('Clotelle.txt')
trace0 = maketrace.Scatter( x = x0y0[0], y = x0y0[1], name = 'Clotelle',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4))

x1y1 = plot_storyline('Garies.txt')
trace1 = maketrace.Scatter( x = x1y1[0], y = x1y1[1], name = 'The Garies and Their Friends',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dash'))

x2y2 = plot_storyline('OurNig.txt')
trace2 = maketrace.Scatter( x = x2y2[0], y = x2y2[1], name = 'Our Nig',
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dashdot'))

x3y3 = plot_storyline('UncleToms.txt')
trace3 = maketrace.Scatter( x = x3y3[0], y = x3y3[1], name = "Uncle Tom's Cabin",
    line = dict(color = ('rgb(205, 12, 24)'), width = 4, dash = 'dot'))

print("Part 1 Done")
###

x4y4 = plot_storyline('Lamplighter.txt')
trace4 = maketrace.Scatter( x = x4y4[0], y = x4y4[1], name = 'The Lamplighter',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4))

x5y5 = plot_storyline('Malaeska.txt')
trace5 = maketrace.Scatter( x = x5y5[0], y = x5y5[1], name = 'Malaeska',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dash'))

x6y6 = plot_storyline('MobyDick.txt')
trace6 = maketrace.Scatter( x = x6y6[0], y = x6y6[1], name = 'Moby Dick',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dashdot'))

x7y7 = plot_storyline('ScarletLetter.txt')
trace7 = maketrace.Scatter( x = x7y7[0], y = x7y7[1], name = 'The Scarlet Letter',
    line = dict(color = ('rgb(22, 96, 167)'), width = 4, dash = 'dot'))

print("Part 2 Done")
###
data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = dict(title = 'Storyline Sentiment in Eight 1850s Novels: Contrasting black and white authors',
              xaxis = dict(title = 'Progression through novel'),
              yaxis = dict(title = 'Sentiment Value'),
              )

plot.iplot(dict(data=data, layout=layout), filename='allbooks-plot')
59 changes: 59 additions & 0 deletions trim_booktext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pickle


def remove_junk_beginning(textBody):
    ''' Trims unnecessary stuff off the beginning of a gutenberg book.

    Looks case-insensitively for 'chapter 1' (preferred) or 'chapter i' and
    returns everything from that marker onward.

    Input: textBody, the full text of the book file.
    Returns: the sliced text, or None if neither marker appears (e.g. the
        text was already trimmed).'''

    # Lowercase once instead of twice; both searches use the same view.
    lowered = textBody.lower()
    index = lowered.find('chapter 1')
    alt_index = lowered.find('chapter i')

    if index >= 0:
        return textBody[index:]
    if alt_index >= 0:
        return textBody[alt_index:]
    print("You've already trimmed that shiz down!")
    return None

def remove_junk_end(textBody):
    ''' Trims unnecessary stuff off the end of a gutenberg book.

    Finds the first case-insensitive mention of 'gutenberg' (which appears
    once the actual book content is over) and returns everything before it.

    Input: textBody, the full text of the book file.
    Returns: the sliced text, or None if 'gutenberg' never appears (e.g. the
        text was already trimmed).'''
    index = textBody.lower().find('gutenberg')
    if index >= 0:
        return textBody[:index]
    print("You've already trimmed that shiz down!")
    return None


def trim_booktext(file_name):
    ''' Input book file name (string);
    this function trims off anything before Chapter 1 and after the end of the
    novel, re-saving the text file in place so you only need run it once.
    Returns None.

    If either trim helper reports there is nothing to trim (returns None),
    the file is left untouched. The old version truncated the file with
    open(..., 'w') and then crashed on f.write(None), destroying the book
    text on a repeat run. It also called 'f.close' without parentheses, so
    the output file was never explicitly closed.'''
    # Open er up
    with open(file_name, 'r') as f:
        full_text = f.read()

    # Process: delete everything before the start of chapter 1
    trimmed_text = remove_junk_beginning(full_text)
    if trimmed_text is None:
        return None
    trimmed_text = remove_junk_end(trimmed_text)
    if trimmed_text is None:
        return None

    # Rewrite the file w/trimmed version
    with open(file_name, 'w') as f:
        f.write(trimmed_text)
    print('Done!')


# Trim down the books to just the body, no extra junk (one-time cleanup pass).
_BOOK_FILES = ('Clotelle.txt', 'Malaeska.txt', 'Lamplighter.txt', 'MobyDick.txt',
               'ScarletLetter.txt', 'UncleToms.txt', 'Garies.txt', 'OurNig.txt')
for _book_file in _BOOK_FILES:
    trim_booktext(_book_file)