diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index d0674e5..12735ff 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,5 @@
-# TextMining
-This is the base repo for the text mining and analysis project for Software Design, Spring 2016 at Olin College.
+This is a color frequency analysis tool. In order to use the tool, simply run run_books.py. This will automatically
+analyze the text of Grimm's fairy tales. If you would like to change the colors searched for in these tales, open
+text_filter.py and change the strings in COLORS (a global variable) to whatever you would like to be searched.
+After running the script, follow the prompts presented when you run the program for name of the story (letters and
+ underscores only), URL of the text file of the story, save file name (letters and underscores only), and graph title.
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gettys_writeup_reflection_textmining.pdf b/gettys_writeup_reflection_textmining.pdf
new file mode 100644
index 0000000..2921b3b
Binary files /dev/null and b/gettys_writeup_reflection_textmining.pdf differ
diff --git a/pickling_import_text.py b/pickling_import_text.py
new file mode 100644
index 0000000..eadfdf9
--- /dev/null
+++ b/pickling_import_text.py
@@ -0,0 +1,19 @@
+from pattern.web import *
+import urllib2
+import pickle
+
+
+def get_text(URL_string, name):
+    """Download a text file from the web and pickle it to disk for later use.
+
+    Arguments:
+        URL_string: URL of the text file to download (a string).
+        name: base name of the output file (a string); the downloaded text
+            is pickled to name + '.pickle'.
+    Returns: None; the pickle file on disk is the only output.
+    """
+    tale = URL(URL_string).download()
+    # Pickle data is a byte stream, so write in binary mode; the with-block
+    # closes the file even if pickle.dump raises.
+    with open(name + '.pickle', 'wb') as save_file:
+        pickle.dump(tale, save_file)
diff --git a/run_books.py b/run_books.py
new file mode 100644
index 0000000..e37cfcc
--- /dev/null
+++ b/run_books.py
@@ -0,0 +1,47 @@
+from pickling_import_text import *
+from text_filter import *
+import sys
+
+
+#def run_func(name_string = 'grimm', url_string='http://www.gutenberg.org/cache/epub/2591/pg2591.txt', graph_title_string='Color Word Frequencies in Brothers Grimm Stories', save_image_string = 'grimm_chart.png'):
+def run_func(name_string, url_string, graph_title_string, save_image_string):
+    """Download a text, count its color words, and save a bar chart of the counts.
+
+    Arguments:
+        name_string: base name used for the intermediate pickle file.
+        url_string: URL of the text file to be analysed.
+        graph_title_string: title placed on the chart.
+        save_image_string: file name the chart is saved to.
+    Returns: None; the pickle file and the chart image are written to disk.
+    """
+    get_text(url_string, name_string)
+    a_text = text_importing(name_string)
+    tale_list_of_words = tale_slicing(a_text)
+    tale_dict = color_searching(tale_list_of_words)
+    item_dump_list = tale_dict.items()
+    tale_color_freq = list_dumping(item_dump_list)
+    universal_graph_func(tale_color_freq, graph_title_string, save_image_string)
+
+
+# Only prompt when executed as a script, so importing this module does not block on input.
+if __name__ == '__main__':
+    user_input_name = raw_input("Please input a Name of Story (letters and underscores only)")
+    user_input_url = raw_input("Please input a URL to a text file to be analysed")
+    user_input_title = raw_input("Please input what your desired title for the graph that will be generated")
+    user_input_save = raw_input("Please input the file name you would like the graph to be saved to (letters and underscores only)") + '.png'
+    run_func(user_input_name, user_input_url, user_input_title, user_input_save)
+
+
+#if len(sys.argv) ==0:
+#    run_func()
+#elif len(sys.argv) ==4:
+#    run_func(name_string=str(sys.argv[0]) , url_string=str(sys.argv[1]),
+#             graph_title_string=str(sys.argv[2]), save_image_string= str(sys.argv[3]))
+
+#else:
+#    print "Incorrect Number of Arguements. Please run without arguements for Grimm anaylsis or follow the following format: name url graph_title save_image_name)
+
+
+#run_func('grimm', 'http://www.gutenberg.org/cache/epub/2591/pg2591.txt','Color Word Frequencies in Brothers Grimm Stories', 'grimm_chart_2.png' )
+#run_func('perrault','https://ia600302.us.archive.org/15/items/thefairytalesofc29021gut/pg29021.txt', 'Color Word Frequencies in Charles Perrault Stories','perrault_chart_2.png', "perrault_chart_2.png")
+#run_func('andersen','https://archive.org/download/fairytalesofhans27200gut/27200.txt', 'Color Word Frequencies in Hans Christian Andersen Stories', 'andersen_chart_2.png')
diff --git a/text_filter.py b/text_filter.py
new file mode 100644
index 0000000..dd789be
--- /dev/null
+++ b/text_filter.py
@@ -0,0 +1,114 @@
+""" This code was written by Rebecca Gettys, except where otherwise noted. """
+#### MAIN SECTION ####
+import pickle
+import string
+import seaborn as sns
+
+COLORS = ['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'grey', 'black', 'white', 'pink', 'ivory', 'tan', 'silver', 'gold', 'rose','gray', 'olive', 'crimson', 'maroon',
+          'fuchsia', 'teal', 'lavender', 'lilac', 'aqua', 'azure', 'beige', 'indigo', 'magenta', 'cyan', 'scarlet',
+          'canary', 'periwinkle']
+
+
+def text_importing(name):
+    """Load the object that get_text previously pickled to disk.
+
+    Arguments: name of the pickle file as a string, without the .pickle ending.
+    Returns: the unpickled object (the downloaded text when the file was written by get_text).
+    """
+    # Pickle data is a byte stream, so read in binary mode; the with-block
+    # closes the file, which was previously left open.
+    # NOTE: only load pickle files written by this project; unpickling untrusted data is unsafe.
+    with open(name + '.pickle', 'rb') as input_file:
+        return pickle.load(input_file)
+
+
+def color_searching(tale):
+    """Count how many times each word in COLORS occurs in a list of words.
+
+    Arguments: tale - iterable of words (here, the list returned by tale_slicing).
+    Returns: dictionary mapping every color in COLORS to its number of occurrences
+        (0 if it never appears). Matching is exact, so it is case-sensitive.
+    """
+    color_dict = {color: 0 for color in COLORS}
+    for word in tale:
+        if word in color_dict:
+            color_dict[word] += 1
+    return color_dict
+
+
+def tale_slicing(tale):
+    """Remove punctuation from a text and split it into a list of words.
+
+    NOTE: https://mail.python.org/pipermail/tutor/2001-October/009454.html explains the punctuation removal method used.
+    Arguments: tale - the text as a single string.
+    Returns: list of the whitespace-separated words left after punctuation is removed.
+    """
+    # join builds the cleaned text in one pass instead of re-copying a growing string per character
+    tale_no_punc = ''.join(char for char in tale if not is_punct_char(char))
+    return tale_no_punc.split()
+
+
+def is_punct_char(char):
+    """Check whether a character is punctuation (helper from the python.org post linked in tale_slicing).
+
+    Arguments: char - the character to test.
+    Returns: True if char is found in string.punctuation, otherwise False.
+    """
+    return char in string.punctuation
+
+
+def list_dumping(list):
+    """Split a list of (key, value) pairs into one list of keys and one list of values.
+
+    Method from http://stackoverflow.com/questions/7558908/unpacking-a-list-tuple-of-pairs-into-two-lists-tuples;
+    used to turn dict.items() output into two lists that stay in matching order.
+    Arguments: list - iterable of two-item tuples (the name shadows the builtin; kept for existing callers).
+    Returns: [keys, values] - two lists in the same order as the input pairs.
+    """
+    color = []
+    frequency = []
+    for pair in list:
+        color.append(pair[0])
+        frequency.append(pair[1])
+    return [color, frequency]
+
+
+### END OF MAIN SECTION ###
+
+### GRAPHING AND DATA PROCESSING ###
+
+
+## patrick is amazing for helping with this!!
+
+
+def universal_graph_func(text_variable,title_string,save_file_name_string):
+    """Draw a bar chart of color-word frequencies with seaborn and save it to a file.
+
+    Arguments:
+        text_variable: [colors, occurrences] pair of lists, as returned by list_dumping.
+        title_string: title placed on the chart.
+        save_file_name_string: file name passed to fig.savefig.
+    Returns: None; the figure is saved and then cleared.
+    """
+    sns.set(font_scale=.8)
+    sns.axlabel('Color', 'Frequency')
+    # colors from http://www.color-hex.com and wikipedia
+    # NOTE(review): the palette is applied by bar position, not by color name - presumably this order was
+    # matched by hand to one particular ordering of the color list; confirm each bar still gets its own hue.
+    flatui = ["#4b0082", "#ffd700", "#e6e6fa", "#ffff00", "#FF2400", "#ff6eb4", "#d2b48c", "#ff00ff", "#0000ff",
+              "#C8A2C8",
+              "#800080", "#FF007F", "#FD3F92", "#000000", "#dc143c", "#CCCCFF", "#ffffff", "#ff0000", "#631919",
+              "#fffff0",
+              "#ffa500", "#730000", "#808000", "#00ffff", "#c0c0c0", "#808080", "#7fffd4", "#808080", "#008000",
+              "#f5f5dc",
+              "#329999", "#f0ffff", "#FFEF00"]
+    custom_palette = sns.color_palette(flatui)
+    colors = text_variable[0]
+    occurences = text_variable[1]
+    ax = sns.barplot(colors, occurences, palette = custom_palette)
+    fig = ax.get_figure()
+    for item in ax.get_xticklabels():
+        item.set_rotation(45)
+    sns.plt.title(title_string)
+    fig.savefig(save_file_name_string)
+    fig.clf()