""" Text mining project: finds the descriptions of YouTube videos using
BeautifulSoup and makes a new description using Markov analysis """
from bs4 import BeautifulSoup
import requests
import string
import random
import pickle
import doctest

"""
Youtube Descriptions
"""

def yt_description(youtube_id):
    """Return a one-element list holding the description text of a YouTube video.

    Scrapes https://www.youtube.com/watch?v=<youtube_id> and reads the
    <p id="eow-description"> element used by the classic watch-page layout.

    NOTE(review): these doctests depend on live network access and on
    YouTube still serving the old page markup — verify before relying on them.
    >>> yt_description('KenrpZ1oxfU')
    ["Republican candidate weighs in on Israeli-Palestinian conflict, trade with China, 9/11 Commission and Iraq War on 'Hannity'"]
    >>> yt_description('SWl3xfSqOIY')
    ["On 'Hannity,' presidential candidate responds to negative ads, talks feud with pope, illegal immigration"]
    """
    html = requests.get('https://www.youtube.com/watch?v={}'.format(youtube_id))
    soup = BeautifulSoup(html.text, "lxml")
    # Locate the description element; find() returns None when the element
    # is missing (e.g. YouTube changed its markup or the video is gone).
    description = soup.find('p', id='eow-description')
    if description is None:
        # Guard against AttributeError on description.get_text(); fall back
        # to an empty description so callers still receive a one-element list.
        return ['']
    # get_text() already returns str, so no extra str() conversion is needed.
    return [description.get_text()]

def combine_descriptions(video_ids):
    """Combine the descriptions of several YouTube videos into one flat list.

    >>> combine_descriptions(['KenrpZ1oxfU','SWl3xfSqOIY'])
    ["Republican candidate weighs in on Israeli-Palestinian conflict, trade with China, 9/11 Commission and Iraq War on 'Hannity'", "On 'Hannity,' presidential candidate responds to negative ads, talks feud with pope, illegal immigration"]
    """
    # Each yt_description() call yields a one-element list; flatten them.
    combined_descriptions = []
    for video_id in video_ids:
        combined_descriptions += yt_description(video_id)
    return combined_descriptions

"""
Markov analysis
"""

# Current prefix window of the Markov chain: a tuple of the most recent words.
prefix = ()
# Markov-chain transition table: presumably maps a prefix tuple of words to
# the suffix words that can follow it — TODO confirm once store_words is
# fully visible.
suffix_map = {}

def Markov_file(filename,order):
    """Feed every whitespace-separated word of the input into store_words,
    building the module-level Markov model of the given order.

    NOTE(review): despite the name, `filename` is iterated line by line, so
    it must be an open file object (or any iterable of lines), not a path
    string — confirm against callers.
    """
    for line in filename:
        for word in line.strip().split():
            store_words(word,order)

def store_words(word, order):
    """Record one word in the global Markov model.

    NOTE(review): this definition is truncated in the visible source — its
    body continues beyond this chunk, so the logic below is incomplete here.
    """
    global prefix
    if len(prefix)