From addd7afb67b410e84e2f40373457acddf2ca8a95 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Tue, 2 Apr 2019 17:44:59 +0530 Subject: [PATCH 01/12] uploading the example code --- .../malayalam news classification/pattern.py | 25 +++ .../malayalam news classification/ulmfit.py | 172 ++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 examples/09-malayalam/malayalam news classification/pattern.py create mode 100644 examples/09-malayalam/malayalam news classification/ulmfit.py diff --git a/examples/09-malayalam/malayalam news classification/pattern.py b/examples/09-malayalam/malayalam news classification/pattern.py new file mode 100644 index 00000000..2e7d8b55 --- /dev/null +++ b/examples/09-malayalam/malayalam news classification/pattern.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 2 02:17:22 2019 + +@author: abhijithneilabraham +""" + + +from pattern.web import Bing, SEARCH, plaintext,Google +from ulmfit import ULMFiT +engine = Google(license='AIzaSyCND8YQhyxQZU1E4y4gCzg8V61NQ61BYtw') +searched=[] + +for result in engine.search('സഞ്ജു സാംസൺ', type=SEARCH, start=1): + print(repr(plaintext(result.text))) + searched.append(repr(plaintext(result.text))) +print(len(searched)) + +model = ULMFiT("news/") +for i in searched: + x=model.predict(i) + + print(x['intent']) + \ No newline at end of file diff --git a/examples/09-malayalam/malayalam news classification/ulmfit.py b/examples/09-malayalam/malayalam news classification/ulmfit.py new file mode 100644 index 00000000..cb84300e --- /dev/null +++ b/examples/09-malayalam/malayalam news classification/ulmfit.py @@ -0,0 +1,172 @@ +import numpy as np +from fastai.text import * +from fastai.lm_rnn import get_rnn_classifer +import html +from nltk import word_tokenize + + +class Tokenizer(): + def __init__(self, lang='en'): + pass + + def spacy_tok(self,x): + return word_tokenize(x) + + def proc_text(self, s): + return self.spacy_tok(s) + + 
@staticmethod + def proc_all(ss, lang): + tok = Tokenizer(lang) + return [tok.proc_text(s) for s in ss] + + @staticmethod + def proc_all_mp(ss, lang='en'): + ncpus = num_cpus()//2 + with ProcessPoolExecutor(ncpus) as e: + return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss)), []) + + +class ULMFiT: + + def __init__(self,model: str): + model_path = Path(model) + itos_filename = model_path/"news_lm"/"tmp"/'itos.pkl' + trained_classifier_filename = model_path/'models'/'clas_2.h5' + label2index = model_path/"news_clas"/"l2i.npy" + self.l2i = {v:k for k,v in np.load(label2index).item().items()} + self.stoi, self.model = self.load_model(itos_filename, trained_classifier_filename) + self.re1 = re.compile(r' +') + + def load_model(self,itos_filename, classifier_filename): + """Load the classifier and int to string mapping + + Args: + itos_filename (str): The filename of the int to string mapping file (usually called itos.pkl) + classifier_filename (str): The filename of the trained classifier + + Returns: + string to int mapping, trained classifer model + """ + + # load the int to string mapping file + itos = pickle.load(Path(itos_filename).open('rb')) + # turn it into a string to int mapping (which is what we need) + stoi = collections.defaultdict(lambda:0, {str(v):int(k) for k,v in enumerate(itos)}) + + # these parameters aren't used, but this is the easiest way to get a model + bptt,em_sz,nh,nl = 70,400,1150,3 + dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5 + num_classes = len(self.l2i) # this is the number of classes we want to predict + vs = len(itos) + + model = get_rnn_classifer(bptt, 20*70, num_classes, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1, + layers=[em_sz*3, 50, num_classes], drops=[dps[4], 0.1], + dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3]) + + # load the trained classifier + model.load_state_dict(torch.load(classifier_filename, map_location=lambda storage, loc: storage)) + + # put the classifier into evaluation mode + 
model.reset() + model.eval() + + return stoi, model + + + def softmax(self,x): + ''' + Numpy Softmax, via comments on https://gist.github.com/stober/1946926 + + >>> res = softmax(np.array([0, 200, 10])) + >>> np.sum(res) + 1.0 + >>> np.all(np.abs(res - np.array([0, 1, 0])) < 0.0001) + True + >>> res = softmax(np.array([[0, 200, 10], [0, 10, 200], [200, 0, 10]])) + >>> np.sum(res, axis=1) + array([ 1., 1., 1.]) + >>> res = softmax(np.array([[0, 200, 10], [0, 10, 200]])) + >>> np.sum(res, axis=1) + array([ 1., 1.]) + ''' + if x.ndim == 1: + x = x.reshape((1, -1)) + max_x = np.max(x, axis=1).reshape((-1, 1)) + exp_x = np.exp(x - max_x) + return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1)) + + def fixup(self, x): + + x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace( + 'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( + '
', "\n").replace('\\"', '"').replace('','u_n').replace(' @.@ ','.').replace( + ' @-@ ','-').replace('\\', ' \\ ').replace('\u200d','').replace('\xa0',' ').replace( + '\u200c','').replace('“',' ').replace('”',' ').replace('"',' ').replace('\u200b','') + x = re.sub('[\(\[].*?[\)\]]', '', x) + x = re.sub('<[^<]+?>', '', x) + x = re.sub('[A-Za-z]+','ENG ', x) + x = re.sub(r'\d+.?(\d+)?','NUM ',x).replace("(","").replace(")","") + return self.re1.sub(' ', html.unescape(x)) + + def predict_text(self,stoi, model, text): + """Do the actual prediction on the text using the + model and mapping files passed + """ + + # prefix text with tokens: + # xbos: beginning of sentence + # xfld 1: we are using a single field here + input_str = self.fixup(text) +# input_str = re.sub('[A-Za-z]+','ENG ', input_str) +# input_str = re.sub(r'\d+.?(\d+)?','NUM ',input_str).replace("(","").replace(")","") + + # predictions are done on arrays of input. + # We only have a single input, so turn it into a 1x1 array + texts = [input_str] + + # tokenize using the fastai wrapper around spacy + tok = Tokenizer().proc_text(input_str) + + # turn into integers for each word + encoded = [stoi[p] for p in tok] +# print(encoded) + # we want a [x,1] array where x is the number + # of words inputted (including the prefix tokens) + ary = np.reshape(np.array(encoded),(-1,1)) + + # turn this array into a tensor + tensor = torch.from_numpy(ary) + + # wrap in a torch Variable + variable = Variable(tensor) + + # do the predictions + predictions = model(variable) + + # convert back to numpy + numpy_preds = predictions[0].data.numpy() + + return self.softmax(numpy_preds[0])[0], input_str + + def predict(self,text): + intent = {} + output, fixed_text = self.predict_text(self.stoi, self.model, text) + intent_ranking = [] + for i, out in enumerate(output): + temp = {"confidence": float(format(out, 'f')), "name": self.l2i[i]} + intent_ranking.append(temp) + intent_ranking = sorted(intent_ranking, key=lambda e: 
e['confidence'], reverse=True) + intent.update({ + "intent": intent_ranking.pop(0), + "intent_ranking": intent_ranking + }) + intent.update({"processed_text": fixed_text}) + return intent#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 1 00:54:22 2019 + +@author: abhijithneilabraham +""" + From cc94efe968b22c22770f40ffc253e13304d1286b Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:09:34 +0530 Subject: [PATCH 02/12] Create Readme.md --- .../malayalam news classification/Readme.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/09-malayalam/malayalam news classification/Readme.md diff --git a/examples/09-malayalam/malayalam news classification/Readme.md b/examples/09-malayalam/malayalam news classification/Readme.md new file mode 100644 index 00000000..65ddfa6b --- /dev/null +++ b/examples/09-malayalam/malayalam news classification/Readme.md @@ -0,0 +1,12 @@ +Malayalam News Classification +============================= + +This example shows you how to do a search query for online news in malayalam language and the search results are classified into : +Buisiness, entertainment,sports,Kerala,India. +
Here Google is the used search engine.Open the code sample pattern.py and enter a search keyword in malayalam.
+ +
Before running the program, download the folder News from the following link and use it in the same folder as the sample code
+
https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG
+ + + From f21db91d7a45be5b0c5fa671d05126eda4e70146 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:11:25 +0530 Subject: [PATCH 03/12] Update Readme.md --- examples/09-malayalam/malayalam news classification/Readme.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/09-malayalam/malayalam news classification/Readme.md b/examples/09-malayalam/malayalam news classification/Readme.md index 65ddfa6b..4f15f797 100644 --- a/examples/09-malayalam/malayalam news classification/Readme.md +++ b/examples/09-malayalam/malayalam news classification/Readme.md @@ -6,7 +6,9 @@ Buisiness, entertainment,sports,Kerala,India.
Here Google is the used search engine.Open the code sample pattern.py and enter a search keyword in malayalam.

Before running the program, download the folder News from the following link and use it in the same folder as the sample code
-
https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG
+
+ +[Link to the model](https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG)
From 39b2d9bd00658bb7e7dbcaaa90b37e6b454551c8 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:12:16 +0530 Subject: [PATCH 04/12] Update Readme.md --- examples/09-malayalam/malayalam news classification/Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/09-malayalam/malayalam news classification/Readme.md b/examples/09-malayalam/malayalam news classification/Readme.md index 4f15f797..0107b4bb 100644 --- a/examples/09-malayalam/malayalam news classification/Readme.md +++ b/examples/09-malayalam/malayalam news classification/Readme.md @@ -8,7 +8,7 @@ Buisiness, entertainment,sports,Kerala,India.
Before running the program, download the folder News from the following link and use it in the same folder as the sample code

-[Link to the model](https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG)
+[Download link for the News folder](https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG)
From 1c00df716d58a98b744f1790bb15bb72f159b47f Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:20:25 +0530 Subject: [PATCH 05/12] Update pattern.py --- .../09-malayalam/malayalam news classification/pattern.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/09-malayalam/malayalam news classification/pattern.py b/examples/09-malayalam/malayalam news classification/pattern.py index 2e7d8b55..8f629c0d 100644 --- a/examples/09-malayalam/malayalam news classification/pattern.py +++ b/examples/09-malayalam/malayalam news classification/pattern.py @@ -9,7 +9,7 @@ from pattern.web import Bing, SEARCH, plaintext,Google from ulmfit import ULMFiT -engine = Google(license='AIzaSyCND8YQhyxQZU1E4y4gCzg8V61NQ61BYtw') +engine = Google(license=key) searched=[] for result in engine.search('സഞ്ജു സാംസൺ', type=SEARCH, start=1): @@ -22,4 +22,4 @@ x=model.predict(i) print(x['intent']) - \ No newline at end of file + From 5aaab49c0570ef8341e28d5ee0524a9cd2298182 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:21:18 +0530 Subject: [PATCH 06/12] Delete pattern.py --- .../malayalam news classification/pattern.py | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 examples/09-malayalam/malayalam news classification/pattern.py diff --git a/examples/09-malayalam/malayalam news classification/pattern.py b/examples/09-malayalam/malayalam news classification/pattern.py deleted file mode 100644 index 8f629c0d..00000000 --- a/examples/09-malayalam/malayalam news classification/pattern.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Apr 2 02:17:22 2019 - -@author: abhijithneilabraham -""" - - -from pattern.web import Bing, SEARCH, plaintext,Google -from ulmfit import ULMFiT -engine = Google(license=key) -searched=[] - -for result in engine.search('സഞ്ജു സാംസൺ', type=SEARCH, start=1): - print(repr(plaintext(result.text))) - 
searched.append(repr(plaintext(result.text))) -print(len(searched)) - -model = ULMFiT("news/") -for i in searched: - x=model.predict(i) - - print(x['intent']) - From a1512c526ee06e2e17376b7a2f55ccc57dc8ded4 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:21:52 +0530 Subject: [PATCH 07/12] Create pattern_news.py --- .../pattern_news.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 examples/09-malayalam/malayalam news classification/pattern_news.py diff --git a/examples/09-malayalam/malayalam news classification/pattern_news.py b/examples/09-malayalam/malayalam news classification/pattern_news.py new file mode 100644 index 00000000..8f629c0d --- /dev/null +++ b/examples/09-malayalam/malayalam news classification/pattern_news.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 2 02:17:22 2019 + +@author: abhijithneilabraham +""" + + +from pattern.web import Bing, SEARCH, plaintext,Google +from ulmfit import ULMFiT +engine = Google(license=key) +searched=[] + +for result in engine.search('സഞ്ജു സാംസൺ', type=SEARCH, start=1): + print(repr(plaintext(result.text))) + searched.append(repr(plaintext(result.text))) +print(len(searched)) + +model = ULMFiT("news/") +for i in searched: + x=model.predict(i) + + print(x['intent']) + From 4fec119dd79ebad8d652a0a98493ba4d4b9ad5a2 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:22:05 +0530 Subject: [PATCH 08/12] Update Readme.md --- examples/09-malayalam/malayalam news classification/Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/09-malayalam/malayalam news classification/Readme.md b/examples/09-malayalam/malayalam news classification/Readme.md index 0107b4bb..61ce3d75 100644 --- a/examples/09-malayalam/malayalam news classification/Readme.md +++ b/examples/09-malayalam/malayalam news classification/Readme.md @@ -3,7 +3,7 @@ Malayalam News Classification This example 
shows you how to do a search query for online news in malayalam language and the search results are classified into : Buisiness, entertainment,sports,Kerala,India. -
Here Google is the used search engine.Open the code sample pattern.py and enter a search keyword in malayalam.
+
Here, Google is the search engine used. Open the code sample pattern_news.py and enter a search keyword in Malayalam.

Before running the program, download the folder News from the following link and use it in the same folder as the sample code

From 2f4913562f004d89414c99dc14f88ba6e31ebe5f Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Tue, 2 Apr 2019 18:25:50 +0530 Subject: [PATCH 09/12] Update pattern_news.py --- .../09-malayalam/malayalam news classification/pattern_news.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/09-malayalam/malayalam news classification/pattern_news.py b/examples/09-malayalam/malayalam news classification/pattern_news.py index 8f629c0d..516b3877 100644 --- a/examples/09-malayalam/malayalam news classification/pattern_news.py +++ b/examples/09-malayalam/malayalam news classification/pattern_news.py @@ -11,8 +11,9 @@ from ulmfit import ULMFiT engine = Google(license=key) searched=[] +search_key='സഞ്ജു സാംസൺ' -for result in engine.search('സഞ്ജു സാംസൺ', type=SEARCH, start=1): +for result in engine.search(search_key, type=SEARCH, start=1): print(repr(plaintext(result.text))) searched.append(repr(plaintext(result.text))) print(len(searched)) From cfd00716cf631d3f7955b0b2831892c3ed3bc91b Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Wed, 3 Apr 2019 02:38:32 +0530 Subject: [PATCH 10/12] Update pattern/web/__init__.py --- pattern/web/__init__.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pattern/web/__init__.py b/pattern/web/__init__.py index c76cea88..78484a45 100644 --- a/pattern/web/__init__.py +++ b/pattern/web/__init__.py @@ -16,6 +16,7 @@ from builtins import str, bytes, dict, int, chr from builtins import map, filter, zip from builtins import object, range, next +from translate import Translator from .utils import get_url_query, get_form_action, stringify_values, json_iter_parse @@ -2146,6 +2147,22 @@ def f(v): else: self._pagination[k] = id return results + + + + def translated(self,lang,query): + trans_results=self.search(query, start=1, count=10) + translator= Translator(to_lang=lang) + translation = translator.translate(trans_results) + return translation + + ''' + This translated takes the 
results from a search query and translates it to the language + specified by the lang keyword. + usage===>Twitter.translated("German","cat") + + ''' + def profile(self, query, start=1, count=10, **kwargs): """ Returns a list of results for the given author id, alias or search query. From ee557ecb992f67acaa76ef0d2d228b95983624e3 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Wed, 3 Apr 2019 15:39:41 +0530 Subject: [PATCH 11/12] Update .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c6a4d3f7..97a3514d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ install: - pip freeze # Install and compile libsvm and liblinear - sudo apt-get install -y build-essential + - pip install translate - git clone https://github.com/cjlin1/libsvm - cd libsvm; make lib; sudo cp libsvm.so.2 /lib; sudo ln -s /lib/libsvm.so.2 /lib/libsvm.so; cd .. - git clone https://github.com/cjlin1/liblinear From 881cfb08852b98e430a3c4716a3e2dc1cebc0860 Mon Sep 17 00:00:00 2001 From: abhijithneilabraham Date: Sun, 7 Apr 2019 10:51:28 +0530 Subject: [PATCH 12/12] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97a3514d..a4cf4957 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ language: python dist: precise python: - - "3.6" + - "3.6.5" before_install: - export TZ=Europe/Brussels