diff --git a/pattern/text/en/wordnet/pywordnet/concordance.py b/pattern/text/en/wordnet/pywordnet/concordance.py index 34e79fe8..514b587c 100755 --- a/pattern/text/en/wordnet/pywordnet/concordance.py +++ b/pattern/text/en/wordnet/pywordnet/concordance.py @@ -1 +1,131 @@ -# some accessing of the semantic concordance data for wordnet 1.6 # by Des Berry, berry@ais.it import string import os from wordnet import binarySearchFile # Sample entries in the 'taglist' file # ordinary%1:18:01:: 1 br-a01:78,1;86,1;88,4 # ordered%5:00:00:organized:01 2 br-j23:6,14;13,32;66,12 # where the general form is: # lemma%ss_type:lex_filenum:lex_id:head_word:head_id sense_number [location_list] # location_list: filename:sent_num,word_num[;sent_num,word_num...] ss_type = ("NOUN", "VERB", "ADJECTIVE", "ADVERB", "ADJECTIVE SATELLITE") # given a sentence number (and the contents of a semantic concordance file) # return a string of words as the sentence def find_sentence(snum, msg): str = "" % snum s = string.find(msg, str) if s < 0: return "" s = s + len(str) sentence = "" tag = "" while 1: if msg[s] == '\n': s = s + 1 n = string.find(msg, '<', s) if n < 0: break if n - s != 0: if tag == "w" and msg[s] != "'" and len(sentence) > 0: # word form sentence = sentence + " " sentence = sentence + msg[s:n] e = string.find(msg, '>', n) if e < 0: break tag = msg[n + 1] if tag == "/": # check for ending sentence if msg[n + 2] == 's': # end of sentence break s = e + 1 return sentence # given a taglist sense (one line of the tagfile) and where to find the tagfile (root) # return a tuple of # symset type ('1' .. '5') # sense (numeric character string) # list of sentences (constructed from the taglist) def tagsentence(tag, root): s = string.find(tag, '%') sentence = [] type = tag[s + 1] c = s for i in range(0, 4): c = string.find(tag, ':', c + 1) c = string.find(tag, ' ', c + 1) sense = tag[c + 1] c = c + 3 while 1: d = string.find(tag, ' ', c) # file separator if d < 0: loclist = tag[c:] else: loclist = tag[c:d] c = d + 1 e = string.find(loclist, ':') filename = loclist[:e] fh = open(root + filename, "rb") msg = fh.read() fh.close() while 1: e = e + 1 f = string.find(loclist, ';', e) if f < 0: sent_word = loclist[e:] else: sent_word = loclist[e:f] e = f g = string.find(sent_word, ',') sent = sent_word[:g] sentence.append(find_sentence(sent, msg)) if f < 0: break if d < 0: break return (type, sense, sentence) # given a word to search for and where to find the files (root) # displays the information # This could be changed to display in different ways! def sentences(word, root): cache = {} file = open(root + "taglist", "rb") key = word + "%" keylen = len(key) binarySearchFile(file, key + " ", cache, 10) print "Word '%s'" % word while 1: line = file.readline() if line[:keylen] != key: break type, sense, sentence = tagsentence(line, root + "tagfiles/") print ss_type[string.atoi(type) - 1], sense for sent in sentence: print sent def _test(word, corpus, base): print corpus sentences("ordinary", base + corpus + "/") if __name__ == '__main__': base = "C:/win16/dict/semcor/" word = "ordinary" _test(word, "brown1", base) _test(word, "brown2", base) _test(word, "brownv", base) \ No newline at end of file +# some accessing of the semantic concordance data for wordnet 1.6 +# by Des Berry, berry@ais.it + +import string +import os +from wordnet import binarySearchFile + +# Sample entries in the 'taglist' file +# ordinary%1:18:01:: 1 br-a01:78,1;86,1;88,4 +# ordered%5:00:00:organized:01 2 br-j23:6,14;13,32;66,12 +# where the general form is: +# lemma%ss_type:lex_filenum:lex_id:head_word:head_id sense_number +[location_list] +# location_list: filename:sent_num,word_num[;sent_num,word_num...] + +ss_type = ("NOUN", "VERB", "ADJECTIVE", "ADVERB", "ADJECTIVE SATELLITE") + + +# given a sentence number (and the contents of a semantic concordance file) +# return a string of words as the sentence +def find_sentence(snum, msg): + str = "" % snum + s = string.find(msg, str) + if s < 0: + return "" + s = s + len(str) + sentence = "" + tag = "" + while 1: + if msg[s] == '\n': + s = s + 1 + n = string.find(msg, '<', s) + if n < 0: + break + if n - s != 0: + if tag == "w" and msg[s] != "'" and len(sentence) > 0: # word form + sentence = sentence + " " + sentence = sentence + msg[s:n] + e = string.find(msg, '>', n) + if e < 0: + break + tag = msg[n + 1] + if tag == "/": # check for ending sentence + if msg[n + 2] == 's': + # end of sentence + break + s = e + 1 + return sentence + + +# given a taglist sense (one line of the tagfile) and where to find the tagfile (root) +# return a tuple of +# symset type ('1' .. '5') +# sense (numeric character string) +# list of sentences (constructed from the taglist) +def tagsentence(tag, root): + s = string.find(tag, '%') + sentence = [] + type = tag[s + 1] + c = s + for i in range(0, 4): + c = string.find(tag, ':', c + 1) + c = string.find(tag, ' ', c + 1) + sense = tag[c + 1] + c = c + 3 + while 1: + d = string.find(tag, ' ', c) # file separator + if d < 0: + loclist = tag[c:] + else: + loclist = tag[c:d] + c = d + 1 + + e = string.find(loclist, ':') + filename = loclist[:e] + fh = open(root + filename, "rb") + msg = fh.read() + fh.close() + + while 1: + e = e + 1 + f = string.find(loclist, ';', e) + if f < 0: + sent_word = loclist[e:] + else: + sent_word = loclist[e:f] + e = f + + g = string.find(sent_word, ',') + sent = sent_word[:g] + + sentence.append(find_sentence(sent, msg)) + + if f < 0: + break + + if d < 0: + break + return (type, sense, sentence) + + +# given a word to search for and where to find the files (root) +# displays the information +# This could be changed to display in different ways! +def sentences(word, root): + cache = {} + file = open(root + "taglist", "rb") + key = word + "%" + keylen = len(key) + binarySearchFile(file, key + " ", cache, 10) + print("Word '%s'" % word) + while 1: + line = file.readline() + if line[:keylen] != key: + break + type, sense, sentence = tagsentence(line, root + "tagfiles/") + print(ss_type[string.atoi(type) - 1], sense) + for sent in sentence: + print(sent) + + +def _test(word, corpus, base): + print(corpus) + sentences("ordinary", base + corpus + "/") + +if __name__ == '__main__': + base = "C:/win16/dict/semcor/" + word = "ordinary" + _test(word, "brown1", base) + _test(word, "brown2", base) + _test(word, "brownv", base)