From 5f6880c0835acdf53bde3181959dc1dd8fecc115 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 16:32:03 -0500
Subject: [PATCH 1/7] Project 3 Software Design

---
 getnews.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 getnews.py

diff --git a/getnews.py b/getnews.py
new file mode 100644
index 0000000..a179ff5
--- /dev/null
+++ b/getnews.py
@@ -0,0 +1,81 @@
+# import urllib
+# from bs4 import BeautifulSoup
+
+from bs4 import BeautifulSoup, SoupStrainer
+import urllib2
+import re
+
+def main():
+    opener = urllib2.build_opener()
+    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+    mainurl = 'http://www.cnn.com/'
+    # url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'
+    # url = 'http://www.cnn.com/2016/02/24/middleeast/swedish-teen-freed-from-isis/index.html'
+    soup = BeautifulSoup(opener.open(mainurl))
+
+    # print type(soup.find("div", {"class":"share-bar-whatsapp-container"}))
+    #1) Link to the website
+
+    #2) title of article
+    # title = soup.findAll("span", {"class":"cd__headline-text"})
+
+    #3) Text of the article
+    # paragraphs = soup.findAll("p", {"class":"zn-body__paragraph"})
+    # text = " ".join([ paragraph.text.encode('utf-8') for paragraph in paragraphs])
+
+    # print url
+    # print title
+    # print text
+
+def getlinks():
+    opener = urllib2.build_opener()
+    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+    mainurl = 'http://cnnespanol.cnn.com'
+    # mainurl = 'http://cnn.com'
+    soup = BeautifulSoup(opener.open(mainurl))
+
+    urls = soup.findAll("a", {"href":re.compile("/index.html")})
+    text = " ".join([ url.text.encode('utf-8') for url in urls])
+    text_file = open('CNNtext.txt', 'a')
+    text_file.write(text)
+    text_file.close()
+    return text
+
+# def process_file(filename):
+#     hist = dict()
+#     fp = open(filename)
+#     for line in fp:
+#         process_line(line, hist)
+#     return hist
+
+# def process_line(line, hist):
+#     line = line.replace('-', ' ')
+
+#     for word in line.split():
+#         word = word.strip(string.punctuation + string.whitespace)
+#         word = word.lower()
+
+#         hist[word] = hist.get(word, 0) + 1
+
+
+# def most_common(hist):
+#     t = []
+#     for key, value in hist.items():
+#         t.append((value, key))
+
+#     t.sort(reverse=True)
+#     return t
+
+# hist = process_file()
+
+# t = most_common(hist)
+# common_words = ['the']
+# print 'The most common words are:'
+# for freq, word in t[0:10]:
+#     if word not in common_words:
+#         print word, '\t\t', freq
+
+
+if __name__ == '__main__':
+    getlinks()
+    # main()

From edbdf188d68e9923c85b87525f720d07ec808fd4 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 19:58:29 -0500
Subject: [PATCH 2/7] New Text Mining Project

---
 GetWiki.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 GetWiki.py

diff --git a/GetWiki.py b/GetWiki.py
new file mode 100644
index 0000000..b82164b
--- /dev/null
+++ b/GetWiki.py
@@ -0,0 +1,39 @@
+from pattern.web import *
+import os
+import string
+
+def get_text():
+
+    enwiki = Wikipedia('English')
+    title = 'United_States'
+
+
+    en_us = enwiki.search(title)
+    en_us_text = en_us.plaintext()
+
+    en_us_file = open('en_us_file.txt', 'w')
+    en_us_file.write(en_us_text.encode("UTF-8"))
+
+    en_us_file.close
+
+def open_text():
+    common_words_en = ['the', 'that', 'of', 'and', 'or', 'in', 'to', 'a', 'an', 'is', 'are', 'were', 'was', 'by', 'for', 'as', 'has', 'have', 'had', 'on', 'at', 'with', 'from', 'it', 'its', 'also', 'which', 'while']
+    # common_words_ch = ['那']
+    hist = {}
+
+    with open('en_us_file.txt', 'r') as f:
+        filetext = [line.translate(None, string.punctuation).lower() for line in f]
+        for line in filetext:
+            for word in line.split():
+                if word not in common_words_en:
+                    if not word.isdigit():
+                        if word in hist:
+                            hist[word] += 1
+                        else:
+                            hist[word] = 1
+        sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True)
+
+        for i in range(1,25):
+            print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]])
+
+open_text()
\ No newline at end of file

From 5be060de632fa549a25290c9c402e0ff63a3df08 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 21:28:02 -0500
Subject: [PATCH 3/7] Latest vers. TextMining project

---
 GetWiki.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/GetWiki.py b/GetWiki.py
index b82164b..6614d83 100644
--- a/GetWiki.py
+++ b/GetWiki.py
@@ -1,39 +1,43 @@
+# -*- coding:utf-8 -*-
 from pattern.web import *
 import os
 import string
 
 def get_text():
-
-    enwiki = Wikipedia('English')
+    language = 'English'
     title = 'United_States'
+    filename = 'en_us_file.txt'
 
+    wiki = Wikipedia(language)
 
-    en_us = enwiki.search(title)
+    en_us = wiki.search(title)
     en_us_text = en_us.plaintext()
 
-    en_us_file = open('en_us_file.txt', 'w')
+    en_us_file = open(filename, 'w')
     en_us_file.write(en_us_text.encode("UTF-8"))
 
     en_us_file.close
 
 def open_text():
     common_words_en = ['the', 'that', 'of', 'and', 'or', 'in', 'to', 'a', 'an', 'is', 'are', 'were', 'was', 'by', 'for', 'as', 'has', 'have', 'had', 'on', 'at', 'with', 'from', 'it', 'its', 'also', 'which', 'while']
-    # common_words_ch = ['那']
+    # common_words_ch = ['的', '是', '在', '的', '与', '或', '到', '一个', '是', '用', '对', '如', '具有', '从', '了', '为', '有', '它', '还', '它', '而']
     hist = {}
 
     with open('en_us_file.txt', 'r') as f:
         filetext = [line.translate(None, string.punctuation).lower() for line in f]
         for line in filetext:
             for word in line.split():
-                if word not in common_words_en:
+                if word == 'references':
+                    sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True)
+                    return (sorted_filetext, hist)
+                elif word not in common_words_en:
                     if not word.isdigit():
                         if word in hist:
                             hist[word] += 1
                        else:
                             hist[word] = 1
-        sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True)
-
-        for i in range(1,25):
-            print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]])
+    return
 
-open_text()
\ No newline at end of file
+sorted_filetext, hist = open_text()
+for i in range(1,10):
+    print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]])
\ No newline at end of file

From afaac0946db46e86fc6763a2fc4541296658f864 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 23:25:01 -0500
Subject: [PATCH 4/7] Latest Vers. TextMining Project

---
 GetWiki.py | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/GetWiki.py b/GetWiki.py
index 6614d83..336b0ab 100644
--- a/GetWiki.py
+++ b/GetWiki.py
@@ -2,42 +2,55 @@
 from pattern.web import *
 import os
 import string
+import nltk
+nltk.download('stopwords')
+from nltk.corpus import stopwords
 
-def get_text():
-    language = 'English'
-    title = 'United_States'
-    filename = 'en_us_file.txt'
+dict_lang = {'English':['en', 'references'], 'Swedish':['sv', 'referenser'], 'Portuguese':['pt', 'referências'], 'Hungarian':['hu', 'források'], 'Finnish':['fi', 'lähteet'], \
+    'Turkish':['tr', 'kaynakça'], 'German':['de', 'anmerkungen'], 'Dutch':['nl', 'referenties'], 'Norwegian':['nb', 'referanser'], \
+    'Catalan':['ca', 'referències'], 'Spanish':['es', 'referencias'], 'Russian':['ru', 'Примечания'.lower()], 'Danish':['da', 'referencer'], 'Italian':['it', 'bibliografia']}
 
-    wiki = Wikipedia(language)
+def get_text(language, title):
 
-    en_us = wiki.search(title)
-    en_us_text = en_us.plaintext()
+    filename = language + '_us_file.txt'
 
-    en_us_file = open(filename, 'w')
-    en_us_file.write(en_us_text.encode("UTF-8"))
+    wiki = Wikipedia(language = dict_lang[language][0])
 
-    en_us_file.close
+    article = wiki.search(title)
+    article_text = article.plaintext()
+
+    article_file = open(filename, 'w')
+    article_file.write(article_text.encode("UTF-8"))
+
+    article_file.close
+
+def open_text(language):
+    filename = language + '_us_file.txt'
+    if language in dict_lang:
+        common_words = set(stopwords.words(language.lower()))
+    else:
+        common_words = []
 
-def open_text():
-    common_words_en = ['the', 'that', 'of', 'and', 'or', 'in', 'to', 'a', 'an', 'is', 'are', 'were', 'was', 'by', 'for', 'as', 'has', 'have', 'had', 'on', 'at', 'with', 'from', 'it', 'its', 'also', 'which', 'while']
-    # common_words_ch = ['的', '是', '在', '的', '与', '或', '到', '一个', '是', '用', '对', '如', '具有', '从', '了', '为', '有', '它', '还', '它', '而']
     hist = {}
 
-    with open('en_us_file.txt', 'r') as f:
+    with open(filename, 'r') as f:
         filetext = [line.translate(None, string.punctuation).lower() for line in f]
         for line in filetext:
             for word in line.split():
-                if word == 'references':
+                if word == dict_lang[language][1]:
                     sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True)
                     return (sorted_filetext, hist)
-                elif word not in common_words_en:
+                elif word not in common_words:
                     if not word.isdigit():
                         if word in hist:
                             hist[word] += 1
                         else:
                             hist[word] = 1
     return
+language = 'Spanish'
+
+get_text(language, 'United_States')
+sorted_filetext, hist = open_text(language)
 
-sorted_filetext, hist = open_text()
 for i in range(1,10):
     print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]])
\ No newline at end of file

From 95159c3dbadf02c8724878b931c09fcf4b7a0cd0 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 23:40:19 -0500
Subject: [PATCH 5/7] Latest Vers. TextMining Project

---
 GetWiki.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/GetWiki.py b/GetWiki.py
index 336b0ab..649ecdc 100644
--- a/GetWiki.py
+++ b/GetWiki.py
@@ -6,11 +6,23 @@
 nltk.download('stopwords')
 from nltk.corpus import stopwords
 
+"""
+    This program takes a language and a Wikipedia article title as input. It then outputs the top ten most common words within that article, first
+    checking to see if they're significant (e.g. not articles or prepositions, etc.)
+
+    This program supports English, Swedish, Portuguese, Hungarian, Finnish, Turkish, German, Dutch, Norwegian, Spanish, Russian, Danish, & Italian.
+
+    The language dictionary is a global because it's referenced in both get_text() and open_text(). It's a lot easier to reference and edit as a global.
+"""
+
 dict_lang = {'English':['en', 'references'], 'Swedish':['sv', 'referenser'], 'Portuguese':['pt', 'referências'], 'Hungarian':['hu', 'források'], 'Finnish':['fi', 'lähteet'], \
     'Turkish':['tr', 'kaynakça'], 'German':['de', 'anmerkungen'], 'Dutch':['nl', 'referenties'], 'Norwegian':['nb', 'referanser'], \
-    'Catalan':['ca', 'referències'], 'Spanish':['es', 'referencias'], 'Russian':['ru', 'Примечания'.lower()], 'Danish':['da', 'referencer'], 'Italian':['it', 'bibliografia']}
+    'Spanish':['es', 'referencias'], 'Russian':['ru', 'Примечания'.lower()], 'Danish':['da', 'referencer'], 'Italian':['it', 'bibliografia']}
 
 def get_text(language, title):
+    """
+    Finds the Wikipedia article in the specified language, then writes it into a plaintext .txt file. (Language_us_file.txt)
+    """
 
     filename = language + '_us_file.txt'
 
@@ -25,6 +37,10 @@ def get_text(language, title):
     article_file.close
 
 def open_text(language):
+    """
+    Opens the appropriate plaintext file and runs a histogram, creating a dictionary with every non-trivial word that appears and its frequency in the article.
+    Outputs a tuple of the original list and the complete sorted version.
+    """
     filename = language + '_us_file.txt'
     if language in dict_lang:
         common_words = set(stopwords.words(language.lower()))
     else:
         common_words = []
-language = 'Spanish'
+language = 'Hungarian'
 
 get_text(language, 'United_States')
 sorted_filetext, hist = open_text(language)

From 0c8ddccdca435edc0bfc6513ad3626be07560db8 Mon Sep 17 00:00:00 2001
From: lzuehsow
Date: Sat, 27 Feb 2016 23:53:59 -0500
Subject: [PATCH 6/7] Final Vers. TextMining Project

---
 GetWiki.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/GetWiki.py b/GetWiki.py
index 649ecdc..5a9e4ec 100644
--- a/GetWiki.py
+++ b/GetWiki.py
@@ -24,25 +24,25 @@ def get_text(language, title):
     Finds the Wikipedia article in the specified language, then writes it into a plaintext .txt file. (Language_us_file.txt)
     """
 
-    filename = language + '_us_file.txt'
+    filename = language + '_us_file.txt' #Creates filename of eventual wiki text file in language "language."
 
-    wiki = Wikipedia(language = dict_lang[language][0])
+    wiki = Wikipedia(language = dict_lang[language][0]) #Opens wiki in language "language", referencing dict_lang for the appropriate language code
 
-    article = wiki.search(title)
+    article = wiki.search(title) #Finds the right article by searching wiki for the title
     article_text = article.plaintext()
 
     article_file = open(filename, 'w')
-    article_file.write(article_text.encode("UTF-8"))
-
+    article_file.write(article_text.encode("UTF-8")) #Creates file "Language_us_file.txt," writes in plaintext of wiki article.
     article_file.close
 
 def open_text(language):
     """
     Opens the appropriate plaintext file and runs a histogram, creating a dictionary with every non-trivial word that appears and its frequency in the article.
+    Terminates when it detects that it has read the entire article and reached the bibliography (the "References" section).
     Outputs a tuple of the original list and the complete sorted version.
""" - filename = language + '_us_file.txt' - if language in dict_lang: + filename = language + '_us_file.txt' #Creates filename of eventual wiki text file in language "language." + if language in dict_lang: #Checks if stopwords supports this language. Stopwords contains a library of common articles/prepositions/trash words in a couple languages. common_words = set(stopwords.words(language.lower())) else: common_words = [] @@ -50,23 +50,25 @@ def open_text(language): hist = {} with open(filename, 'r') as f: - filetext = [line.translate(None, string.punctuation).lower() for line in f] + filetext = [line.translate(None, string.punctuation).lower() for line in f] #Strips punctuation from the file plaintext and makes everything lowercase for processing for line in filetext: for word in line.split(): - if word == dict_lang[language][1]: - sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) + if word == dict_lang[language][1]: #If the end of the wikipedia article (eg: the "References") is reached, the function terminates and returns the sorted histogram. + sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) #Sorting the list of all words in the article by their frequency. Most frequent = first. return (sorted_filetext, hist) elif word not in common_words: - if not word.isdigit(): - if word in hist: - hist[word] += 1 + if not word.isdigit(): #Is the "word" actually a word, or a number? + if word in hist: #Is the word in the histogram already? + hist[word] += 1 #increase word occurence frequency by one else: - hist[word] = 1 + hist[word] = 1 #word frequency equals 1 return -language = 'Hungarian' -get_text(language, 'United_States') +language = 'Hungarian' #The language of choice +title = 'United_States' #The article title of choice + +get_text(language, title) sorted_filetext, hist = open_text(language) for i in range(1,10): - print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]]) \ No newline at end of file + print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]]) #Print the top ten words and their frequencies. \ No newline at end of file From da3a26660a3eecfb68255e26f2ded634afe344fa Mon Sep 17 00:00:00 2001 From: lzuehsow Date: Sun, 28 Feb 2016 01:11:15 -0500 Subject: [PATCH 7/7] Reflection and Analysis --- Reflection.txt | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 Reflection.txt diff --git a/Reflection.txt b/Reflection.txt new file mode 100644 index 0000000..cb232cd --- /dev/null +++ b/Reflection.txt @@ -0,0 +1,37 @@ +Project Overview +I used Wikipedia. I ran a simple histogram on the Wikipedia article plaintext, documenting every word and its frequency. I also eliminated common words from the high-frequency list. + +Implementation +I implemented my function in two main parts: getting the data, and analysing it. The first half of my code pulls the wiki article of the appropriate language from the internet, and writes the plaintext into a locally stored file. The second half makes a dictionary of every word and its frequency, then returns a list sorted by decreasing frequency. + +I used NLTK to get a list of common words in various languages, rather than writing out lists of articles and prepositions by hand, because I was not personally familiar with several of the languages involved, and making these lists would have been error-ridden and highly time-consuming. Though NLTK only supports select languages, it's still vastly easier to use NTLK. 
+
+I also decided to make dict_lang (my dictionary of supported languages, language codes, and translated "reference" strings) a global variable. Since it's pretty bulky text-wise, and since it's used by both of my functions, it was cleanest and most efficient to make dict_lang a global variable.
+
+Results
+It's interesting to note that in the English, Portuguese, and Spanish wiki pages for the US, the word 'war' appears on the top ten most frequent word list. In the Spanish article, the word 'war' appears 57 times; in Portuguese, 56 times. Other common words include 'world' (Portuguese), 'million' (Danish, Spanish), 'most/more than' (Spanish, Portuguese), 'change' (Turkish), and 'large' (Swedish). As expected, 'US,' 'United,' and 'States' are also very common.
+It seems that, despite being a relatively new country, America is already perceived as "larger than life" and as a major military/world force.
+
+Other articles:
+Donald Trump-
+    US- million, new york
+    Italy- Obama, no
+    Spanish- Consulted
+    Portuguese- Reliable, no, favor, independent
+Hillary Clinton-
+    US- Campaign, first lady
+    Italy- Bill, first lady
+    Spanish- first lady, plus, bell
+    Portuguese- Obama, first lady
+Russia-
+    US- Soviet, war, largest, world
+    Russia- Cinema, consignment
+    German- Asteroid, states
+    Turkish- Soviet, Belarus, big, change
+
+Reflection
+My project was appropriately scoped. Unfortunately, I was extremely busy the past week, and I couldn't really devote enough time to this project.
+
+I did manage to get my entire project done in under 7 hours, which is actually pretty impressive in hindsight. I got a lot faster when I stopped trying to use BeautifulSoup.
+I learned a lot about HTML when I was mucking about with BeautifulSoup, though, which is good. Going forward, I know a lot more about how web pages are formatted, and how to pull information from them.
+If I could have known something before I started, I would have liked to have known how busy I was going to be this weekend, so that I would have spent time coding instead of researching BeautifulSoup.
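
A minimal Python 3 sketch of the histogram step described in the Reflection's Implementation section, for reference only. It is an illustration under assumptions: the helper name word_frequencies is hypothetical and not part of the submitted code, it uses collections.Counter instead of a plain dict, and it assumes the NLTK stopword corpus has already been downloaded.

# Hypothetical Python 3 sketch; not part of the submitted project code.
import string
from collections import Counter

from nltk.corpus import stopwords  # assumes nltk.download('stopwords') was run beforehand

def word_frequencies(filename, language='english', stop_marker='references'):
    """Count non-stopword, non-numeric words until the reference section is reached."""
    common = set(stopwords.words(language))
    counts = Counter()
    strip_punct = str.maketrans('', '', string.punctuation)
    with open(filename, encoding='utf-8') as f:
        for line in f:
            for word in line.translate(strip_punct).lower().split():
                if word == stop_marker:  # stop once the bibliography is reached
                    return counts
                if word not in common and not word.isdigit():
                    counts[word] += 1
    return counts

Calling word_frequencies('Hungarian_us_file.txt', 'hungarian', dict_lang['Hungarian'][1]).most_common(10) would list the ten most frequent words starting from the single most frequent one; note that the range(1,10) loop in GetWiki.py starts at index 1, so it actually prints nine words and skips the most frequent.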