Commit 0e788afa authored by Matic Perovšek

chunking and triplet extraction

parent 3ce7377e
......@@ -301,6 +301,7 @@ class WorkflowRunner():
print("TRAIN:", train_indices, "TEST:", test_indices)
output_train, output_test = document_corpus.split(train_indices,test_indices)
print "end of split"
else:
output_train = folds[:i] + folds[i+1:]
output_test = folds[i]
......
......@@ -3,7 +3,7 @@
"fields": {
"name": "Bag of Words",
"parent": null,
"order": 5,
"order": 6,
"uid": "6091637f-b3cd-4559-b525-18357b02dbc2"
}
}
\ No newline at end of file
from itertools import izip
from nltk.corpus import conll2000
from nltk.tag.sequential import DefaultTagger
from nltk.tag.sequential import NgramTagger
from workflows.nltoolkit.lib.part_of_speech_tagging import corpus_reader
from workflows.nltoolkit.lib.tagging_common import universal_sentence_tagger_hub
from workflows.textflows import Annotation
__author__ = 'mperice'
import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
from nltk.tree import Tree
import nltk
class TagChunker(ChunkParserI):
    '''Chunks tagged tokens using Ngram tagging.'''
    def __init__(self, tagger_class, args, kwargs): # e.g. tagger_class=NgramTagger
        '''Train an Ngram tagger on chunked sentences.'''
        self.tagger = tagger_class(*args, **kwargs)
    def parse(self, tagged_sent):
        '''Parse tagged tokens into a parse Tree of chunks.'''
        if not tagged_sent: return None
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        # rebuild (word, pos_tag, chunk_tag) triples and convert them to a chunk tree
        return conlltags2tree([(w,t,c) for (w,(t,c)) in zip(words, chunks)])
def nltk_ngram_chunker(input_dict):
    training_corpus=corpus_reader(input_dict['training_corpus'],'chunked_sents')
    backoff_tagger=input_dict['backoff_chunker']['object'] if input_dict['backoff_chunker'] else DefaultTagger('-None-')
    n=int(input_dict['n']) #default 2
    #cutoff=int(input_dict['cutoff']) #default 0 'backoff': backoff_tagger,
    # NgramTagger learns chunk tags from (pos_tag, chunk_tag) pairs, so convert
    # the chunked training sentences with conll_tag_chunks before training.
    train_sents=conll_tag_chunks(training_corpus)
    return {'chunker': TagChunker(NgramTagger,[1],{'train': train_sents})}
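# Usage sketch for the ngram chunker outside the widget framework (assumes the
# NLTK conll2000 corpus data is installed; the parse shown is indicative only):
#
#   train_sents = conll_tag_chunks(conll2000.chunked_sents('train.txt'))
#   chunker = TagChunker(NgramTagger, [1], {'train': train_sents})
#   print chunker.parse([('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')])
#   # e.g. (S (NP the/DT little/JJ dog/NN) barked/VBD)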
def nltk_regex_chunker(input_dict):
"""
A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
regular expression patterns to specify the behavior of the parser.
The chunking of the text is encoded using a ``ChunkString``, and
each rule acts by modifying the chunking in the ``ChunkString``.
The rules are all implemented using regular expression matching
and substitution.
A grammar contains one or more clauses in the following form::
NP:
{<DT|JJ>} # chunk determiners and adjectives
}<[\.VI].*>+{ # chink any tag beginning with V, I, or .
<.*>}{<DT> # split a chunk at a determiner
<DT|JJ>{}<NN.*> # merge chunk ending with det/adj
# with one starting with a noun
The patterns of a clause are executed in order. An earlier
pattern may introduce a chunk boundary that prevents a later
pattern from executing. Sometimes an individual pattern will
match on multiple, overlapping extents of the input. As with
regular expression substitution more generally, the chunker will
identify the first match possible, then continue looking for matches
after this one has ended.
The clauses of a grammar are also executed in order. A cascaded
chunk parser is one having more than one clause. The maximum depth
of a parse tree created by this chunk parser is the same as the
number of clauses in the grammar.
When tracing is turned on, the comment portion of a line is displayed
each time the corresponding pattern is applied.
:type _start: str
:ivar _start: The start symbol of the grammar (the root node of
resulting trees)
:type _stages: int
:ivar _stages: The list of parsing stages corresponding to the grammar
"""
grammar = r"""
NP:
{<.*>+} # Chunk everything
}<VBD|IN>+{ # Chink sequences of VBD and IN
"""
#grammar=input_dict['grammar']
chunker = nltk.RegexpParser(grammar)
return {'chunker': chunker}
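# Usage sketch for the grammar-based chunker on one POS-tagged sentence
# (illustrative; the exact bracketing depends on the grammar in use):
#
#   grammar = r"""
#   NP:
#       {<.*>+}          # chunk everything
#       }<VBD|IN>+{      # chink sequences of VBD and IN
#   """
#   chunker = nltk.RegexpParser(grammar)
#   tagged = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'),
#             ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
#   print chunker.parse(tagged)
#   # (S (NP the/DT little/JJ cat/NN) sat/VBD on/IN (NP the/DT mat/NN))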
def chunking_hub(input_dict):
chunker=input_dict['chunker']
group_annotation_name = input_dict['group_annotation']
element_annotation_name = input_dict['element_annotation']
element_pos_feature_name = input_dict['element_pos_feature_name']
output_annotation_name = input_dict['output_feature']
adc = input_dict['adc']
for doc in adc.documents:
if doc.features['contentType'] == "Text":
            if not doc.text:
                continue  # skip documents without text
text_grouped,annotations_grouped=doc.get_grouped_annotations_with_texts(element_annotation_name,group_annotation_name)
for element_texts,element_annotations in izip(text_grouped,annotations_grouped):
tagged_sent=zip(element_texts,[ann.features[element_pos_feature_name] for ann in element_annotations])
tree=chunker.parse(tagged_sent) #generate a tree
conll_tags= nltk.chunk.tree2conlltags(tree) #convert to IOB tags
for iob_tag,annotation in izip([a[2] for a in conll_tags],element_annotations):
annotation.features[output_annotation_name]=iob_tag
#for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
# for feature,annotation in izip(sentence_features,sentence_annotations):
# annotation.features[output_annotation_name]=feature[1] #[0:number_of_letters]
return {'adc': adc}
def extract_annotations_from_IOB_tags(input_dict):
group_annotation_name = input_dict['group_annotation']
element_annotation_name = input_dict['element_annotation']
element_iob_feature_name = input_dict['element_iob_feature_name']
element_pos_feature_name = input_dict['element_pos_feature_name']
output_annotation_name = input_dict['output_annotation']
labels=set([l.strip() for l in input_dict['labels'].split(",")])
adc = input_dict['adc']
for doc in adc.documents:
if doc.features['contentType'] == "Text":
            if not doc.text:
                continue  # skip documents without text
_,annotations_grouped=doc.get_grouped_annotations_with_texts(element_annotation_name,group_annotation_name)
for element_annotations in annotations_grouped:
conll_tags=[(ann,ann.features[element_pos_feature_name],ann.features[element_iob_feature_name])
for ann in element_annotations]
tree=nltk.chunk.conlltags2tree(conll_tags)
for label in labels:
for subtree in tree.subtrees(filter=lambda t: t.label()==label):
# print the noun phrase as a list of part-of-speech tagged words
leaves= subtree.leaves()
doc.annotations.append(Annotation(leaves[0][0].span_start,leaves[-1][0].span_end,
output_annotation_name+"_"+label,
features={'Chunk Label': subtree.label()}))
#for iob_tag,annotation in izip([a[2] for a in aaa],element_annotations):
# annotation.features[output_annotation_name]=iob_tag
#for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
# for feature,annotation in izip(sentence_features,sentence_annotations):
# annotation.features[output_annotation_name]=feature[1] #[0:number_of_letters]
return {'adc': adc}
def flatten_deeptree(tree):
'''
>>> flatten_deeptree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
'''
    return Tree(tree.label(), flatten_childtrees([c for c in tree]))
def flatten_childtrees(trees):
children = []
for t in trees:
if t.height() < 3:
children.extend(t.pos())
elif t.height() == 3:
children.append(Tree(t.label(), t.pos()))
else:
children.extend(flatten_childtrees([c for c in t]))
return children
def shallow_tree(tree):
'''
>>> shallow_tree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
'''
children = []
for t in tree:
if t.height() < 3:
children.extend(t.pos())
else:
children.append(Tree(t.label(), t.pos()))
return Tree(tree.label(), children)
#####################
## tree conversion ##
#####################
def chunk_trees2train_chunks(chunk_sents):
tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
def conll_tag_chunks(chunk_sents):
'''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
so the final result is a list of lists of (tag, chunk_tag) tuples.
>>> from nltk.tree import Tree
>>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
>>> conll_tag_chunks([t])
[[('DT', 'B-NP'), ('NN', 'I-NP')]]
'''
tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
# tree.pos() flattens the tree and produces [(word, label)] where label is
# from the word's parent tree label. words in a chunk therefore get the
# chunk tag, while words outside a chunk get the same tag as the tree's
# top label
words, ents = zip(*tree.pos())
iobs = []
prev = None
# construct iob tags from entity names
for ent in ents:
# any entity that is the same as the tree's top label is outside a chunk
if ent == tree.label():
iobs.append('O')
prev = None
# have a previous entity that is equal so this is inside the chunk
elif prev == ent:
iobs.append('I-%s' % ent)
# no previous equal entity in the sequence, so this is the beginning of
# an entity chunk
else:
iobs.append('B-%s' % ent)
prev = ent
# get tags for each word, then construct 3-tuple for conll tags
words, tags = zip(*tag(words))
return zip(words, tags, iobs)
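# Illustrative sketch of the conversion on a hand-built entity tree (POS tags
# come from nltk.tag.pos_tag, so the NLTK tagger model must be available and
# the tags shown are approximate):
#
#   doc = Tree('DOCUMENT', [Tree('PERSON', ['Ada', 'Lovelace']), 'visited',
#                           Tree('LOCATION', ['London']), 'yesterday', '.'])
#   print ieertree2conlltags(doc)
#   # roughly: [('Ada','NNP','B-PERSON'), ('Lovelace','NNP','I-PERSON'),
#   #           ('visited','VBD','O'), ('London','NNP','B-LOCATION'),
#   #           ('yesterday','NN','O'), ('.','.','O')]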
#################
## tag chunker ##
#################
# def chunk_tagger_hub:
# train_sents = conll_tag_chunks(train_chunks)
# self.tagger = None
#
# for cls in tagger_classes:
# self.tagger = cls(train_sents, backoff=self.tagger)
brown = nltk.corpus.brown
########################
## classifier chunker ##
########################
def prev_next_pos_iob(tokens, index, history):
word, pos = tokens[index]
if index == 0:
prevword, prevpos, previob = ('<START>',)*3
else:
prevword, prevpos = tokens[index-1]
previob = history[index-1]
if index == len(tokens) - 1:
nextword, nextpos = ('<END>',)*2
else:
nextword, nextpos = tokens[index+1]
feats = {
'word': word,
'pos': pos,
'nextword': nextword,
'nextpos': nextpos,
'prevword': prevword,
'prevpos': prevpos,
'previob': previob
}
return feats
class ClassifierChunker(ChunkParserI):
def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
if not feature_detector:
feature_detector = self.feature_detector
train_chunks = chunk_trees2train_chunks(train_sents)
self.tagger = ClassifierBasedTagger(train=train_chunks,
feature_detector=feature_detector, **kwargs)
def parse(self, tagged_sent):
if not tagged_sent: return None
chunks = self.tagger.tag(tagged_sent)
return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
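# Usage sketch (assumes NLTK conll2000 data; training the default
# NaiveBayes-backed ClassifierBasedTagger on a small slice keeps it quick,
# and the output shown is indicative only):
#
#   train_sents = conll2000.chunked_sents('train.txt')[:500]
#   chunker = ClassifierChunker(train_sents)
#   print chunker.parse([('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')])
#   # e.g. (S (NP the/DT cat/NN) sat/VBD)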
#############
## pattern ##
#############
# class PatternChunker(ChunkParserI):
# def parse(self, tagged_sent):
# # don't import at top since don't want to fail if not installed
# from pattern.en import parse
# s = ' '.join([word for word, tag in tagged_sent])
# # not tokenizing ensures that the number of tagged tokens returned is
# # the same as the number of input tokens
# sents = parse(s, tokenize=False).split()
# if not sents: return None
# return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
......@@ -53,12 +53,18 @@ def extract_pos_tagger_name(input_dict):
def corpus_reader(corpus):
def corpus_reader(corpus,extract='tagged_sents'):
if type(corpus)==DocumentCorpus:
raise NotImplementedError
elif extract=='chunked_sents':
try:
return corpus.chunked_sents()
except AttributeError:
            raise ValueError("The input corpus doesn't include chunked sentences.")
else:
return corpus.tagged_sents()
def nltk_default_pos_tagger(input_dict):
"""
A tagger that assigns the same tag to every token.
......
......@@ -3,8 +3,8 @@ from workflows.tasks import executeFunction
import nltk
from workflows.textflows import *
#from tagging_common import universal_word_tagger_hub
from tagging_common_parallel import universal_word_tagger_hub
from tagging_common import universal_word_tagger_hub
#from tagging_common_parallel import universal_word_tagger_hub
def stem_lemma_tagger_hub(input_dict):
......
......@@ -35,25 +35,7 @@ def universal_sentence_tagger_hub(input_dict):
if doc.features['contentType'] == "Text":
if not doc.text:
pass
group_annotations=sorted(doc.get_annotations_with_text(group_annotation_name),key=lambda x: x[0].span_start)
element_annotations=sorted(doc.get_annotations_with_text(element_annotation_name),key=lambda x: x[0].span_start)
text_grouped=[] #text_groups= [['First','sentence',['Second','sentance']]
annotations_grouped=[] #annotations_grouped= [[<Annotation span_start:0 span_ned:4>, <Annotation span_start:6 span_ned:11>],[...
i=0
for group_annotation,_ in group_annotations:
elements=[]
sentence_annotations=[]
#find elementary annotations 'contained' in the group_annotation
while i<len(element_annotations) and element_annotations[i][0].span_end<=group_annotation.span_end:
annotation=element_annotations[i][0]
text_block=element_annotations[i][1]
elements.append(text_block)
sentence_annotations.append(annotation)
i+=1
text_grouped.append(elements)
annotations_grouped.append(sentence_annotations)
text_grouped,annotations_grouped=doc.get_grouped_annotations_with_texts(element_annotation_name,group_annotation_name)
new_features=getattr(tagger,tagger_function)(text_grouped,*args,**kwargs)
for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
......
......@@ -44,7 +44,7 @@ def universal_word_tagger_hub(adc,tagger_dict,input_annotation,output_annotation
return {'adc': adc }
def sentance_tag_a_document(doc,tagger,tagger_function,args,kwargs,
def sentence_tag_a_document(doc,tagger,tagger_function,args,kwargs,
element_annotation_name,group_annotation_name,output_annotation_name):
if doc.features['contentType'] == "Text":
if not doc.text:
......@@ -95,7 +95,7 @@ def universal_sentence_tagger_hub(input_dict):
print "here we are!!!"
#parallel for document in adc.documents:
new_documents=pool.map(
partial(sentance_tag_a_document,
partial(sentence_tag_a_document,
tagger=tagger,
tagger_function=tagger_function,
args=args,
......
import requests
import json
from workflows.textflows import Annotation
class TripletClient(object):
def __init__(self,url="http://concreteflows.ijs.si:8080/tripletserver/"):
self.base_url = url
@property
def base_url(self):
return self._base_url
@base_url.setter
def base_url(self, value):
self._base_url = value
def reverb(self,text):
r = requests.post(self.base_url+"api/reverb/extract",data=json.dumps({"text":text}))
return r.json()
def ollie(self,text):
r = requests.post(self.base_url+"api/ollie/extract",data=json.dumps({"text":text}))
return r.json()
def triplet_extraction_hub(input_dict):
input_annotation = input_dict['input_annotation']
output_annotation = input_dict['output_annotation']
adc = input_dict['adc']
t = TripletClient()
all_triplets=[]
for document in adc.documents:
if document.features['contentType'] == "Text":
            if not document.text:
                continue  # skip documents without text
for annotation,subtext in document.get_annotations_with_text(input_annotation): #all annotations of this type
if subtext:
if False:
extractions = t.reverb(subtext)['extractions']
if extractions:
most_confident=max(extractions,key=lambda a: a['conf'])
print "aaa",subtext
subject=most_confident['arg1']
print subject
start=annotation.span_start+subtext.find(subject)
end=start+len(subject)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_subject"))
verb=most_confident['rel']
start=end+subtext[end:].find(verb)
end=start+len(verb)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_verb"))
predicate=most_confident['arg2']
start=end+subtext[end:].find(predicate)
end=start+len(predicate)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_predicate"))
else: #ollie extractor
extractions = t.ollie(subtext)['extractions']
if False:
most_confident=max(extractions,key=lambda a: a['confidence'])
subject=most_confident['arg1']
if subtext.find(subject)==-1:
aaa=3
start=annotation.span_start+subtext.find(subject)
end=start+len(subject)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_subject"))
verb=most_confident['rel'].replace("be ","").replace("Be ","")
if subtext[end:].find(verb)==-1:
aaa=3
start=end+subtext[end:].find(verb)
end=start+len(verb)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_verb"))
predicate=most_confident['arg2']
if subtext[end:].find(predicate)==-1:
aaa=3
start=end+subtext[end:].find(predicate)
end=start+len(predicate)-1
print subtext[start:end]
document.annotations.append(Annotation(start,end,
output_annotation+"_predicate"))
else:
triplets=[(e['arg1'],e['rel'],e['arg2']) for e in extractions]
annotation.features[output_annotation]=triplets
all_triplets.extend(triplets)
return {'adc': adc,'triplets': all_triplets }
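# Usage sketch for the triplet client on its own (assumes the triplet server at
# the default URL is reachable; field names follow the JSON used above):
#
#   client = TripletClient()
#   response = client.ollie("Marie Curie discovered polonium in 1898.")
#   for e in response.get('extractions', []):
#       print e['arg1'], '|', e['rel'], '|', e['arg2']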
\ No newline at end of file
from lib.bag_of_words import *
from lib.classification import *
#from lib.clustering import *
from lib.chunking import *
from lib.dataset import *
from lib.document_corpus import *
#from lib.similarity_matrix import *
......@@ -8,4 +9,5 @@ from lib.part_of_speech_tagging import *
from lib.stop_word_removal import *
from lib.stemming import *
from lib.textual_data_in_out import *
from lib.tokenization import *
\ No newline at end of file
from lib.tokenization import *
from lib.triplet_extraction import *
\ No newline at end of file
......@@ -3,7 +3,7 @@
"fields": {
"name": "Bag of Words",
"parent": null,
"order": 5,
"order": 6,
"uid": "6091637f-b3cd-4559-b525-18357b02dbc2"
}
}
\ No newline at end of file
......@@ -5,7 +5,7 @@
"category": "d41c87c3-c0c9-4073-8328-d4c6d5e8f185",
"treeview_image": "",
"uid": "4d280ddb-4446-484d-8b2d-e1d5c93567c2",
"is_streaming": false,
"windows_queue": false,
"package": "nltoolkit",
"interaction_view": "",
"has_progress_bar": false,
......@@ -19,7 +19,7 @@
"wsdl_method": "",