Commit 0e788afa, authored May 04, 2015 by Matic Perovšek (project: TextFlows)

chunking and triplet extraction

parent 3ce7377e
Changes: 14 files
workflows/engine.py (view file @ 0e788afa)
...
...
@@ -301,6 +301,7 @@ class WorkflowRunner():
                print("TRAIN:", train_indices, "TEST:", test_indices)
                output_train, output_test = document_corpus.split(train_indices, test_indices)
                print "end of split"
            else:
                output_train = folds[:i] + folds[i + 1:]
                output_test = folds[i]
...
...
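Editor's note: the fold arithmetic in the engine.py hunk above is the usual leave-one-fold-out split. A standalone sketch (the values are illustrative, not from the commit):

folds = [[0, 1], [2, 3], [4, 5]]           # three folds of document indices
i = 1                                      # hold out the second fold
output_train = folds[:i] + folds[i + 1:]   # [[0, 1], [4, 5]]
output_test = folds[i]                     # [2, 3]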
workflows/latino/package_data/categories/6091637f-b3cd-4559-b525-18357b02dbc2.json (view file @ 0e788afa)
...
...
@@ -3,7 +3,7 @@
"fields"
:
{
"name"
:
"Bag of Words"
,
"parent"
:
null
,
"order"
:
5
,
"order"
:
6
,
"uid"
:
"6091637f-b3cd-4559-b525-18357b02dbc2"
}
}
\ No newline at end of file
workflows/nltoolkit/lib/chunking.py (new file, mode 100644; view file @ 0e788afa)
from itertools import izip

from nltk.corpus import conll2000
from nltk.tag.sequential import DefaultTagger
from nltk.tag.sequential import NgramTagger

from workflows.nltoolkit.lib.part_of_speech_tagging import corpus_reader
from workflows.nltoolkit.lib.tagging_common import universal_sentence_tagger_hub
from workflows.textflows import Annotation

__author__ = 'mperice'

import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
from nltk.tree import Tree
import nltk
class TagChunker(ChunkParserI):
    '''Chunks tagged tokens using Ngram Tagging.'''

    def __init__(self, tagger_class, args, kargs):  #=[UnigramTagger, BigramTagger]):
        '''Train Ngram taggers on chunked sentences'''
        self.tagger = tagger_class(*args, **kargs)

    def parse(self, tagged_sent):
        '''Parsed tagged tokens into parse Tree of chunks'''
        if not tagged_sent:
            return None
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        # create conll str for tree parsing
        print chunks
        return conlltags2tree([(w, t, c) for (w, (t, c)) in zip(words, chunks)])
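Editor's note: a minimal usage sketch for TagChunker, not part of the commit. It assumes the wrapped tagger should be trained on (POS tag, chunk tag) pairs, which is what conll_tag_chunks (defined further down in this file) produces from the conll2000 trees imported above:

train_data = conll_tag_chunks(conll2000.chunked_sents('train.txt'))
chunker = TagChunker(NgramTagger, [1], {'train': train_data})
# parse() takes a POS-tagged sentence and returns an NLTK chunk Tree
print chunker.parse([('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')])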
def nltk_ngram_chunker(input_dict):
    training_corpus = corpus_reader(input_dict['training_corpus'], 'chunked_sents')
    backoff_tagger = input_dict['backoff_chunker']['object'] if input_dict['backoff_chunker'] else DefaultTagger('-None-')
    n = int(input_dict['n'])  #default 2
    #cutoff=int(input_dict['cutoff']) #default 0 'backoff': backoff_tagger,

    return {'chunker': TagChunker(NgramTagger, [1], {'train': training_corpus})}
def nltk_regex_chunker(input_dict):
    """
    A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

        NP:
          {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _start: str
    :ivar _start: The start symbol of the grammar (the root node of
        resulting trees)
    :type _stages: int
    :ivar _stages: The list of parsing stages corresponding to the grammar
    """
    grammar = r"""
    NP:
      {<.*>+}          # Chunk everything
      }<VBD|IN>+{      # Chink sequences of VBD and IN
    """
    #grammar=input_dict['grammar']

    chunker = nltk.RegexpParser(grammar)
    return {'chunker': chunker}
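Editor's note: a quick illustration of what the hard-coded grammar does, not part of the commit (the sentence is made up):

cp = nltk.RegexpParser(r"""
NP:
  {<.*>+}          # Chunk everything
  }<VBD|IN>+{      # Chink sequences of VBD and IN
""")
sent = [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
print cp.parse(sent)
# (S (NP the/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN))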
def chunking_hub(input_dict):
    chunker = input_dict['chunker']
    group_annotation_name = input_dict['group_annotation']
    element_annotation_name = input_dict['element_annotation']
    element_pos_feature_name = input_dict['element_pos_feature_name']
    output_annotation_name = input_dict['output_feature']
    adc = input_dict['adc']

    for doc in adc.documents:
        if doc.features['contentType'] == "Text":
            if not doc.text:
                pass
            text_grouped, annotations_grouped = doc.get_grouped_annotations_with_texts(element_annotation_name,
                                                                                        group_annotation_name)

            for element_texts, element_annotations in izip(text_grouped, annotations_grouped):
                tagged_sent = zip(element_texts, [ann.features[element_pos_feature_name] for ann in element_annotations])
                tree = chunker.parse(tagged_sent)  #generate a tree
                conll_tags = nltk.chunk.tree2conlltags(tree)  #convert to IOB tags
                for iob_tag, annotation in izip([a[2] for a in conll_tags], element_annotations):
                    annotation.features[output_annotation_name] = iob_tag

            #for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
            #    for feature,annotation in izip(sentence_features,sentence_annotations):
            #        annotation.features[output_annotation_name]=feature[1] #[0:number_of_letters]

    return {'adc': adc}
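Editor's note: the parse-then-flatten round trip at the heart of chunking_hub, reduced to plain NLTK calls (not part of the commit; the real code feeds annotation texts and POS features instead of a hand-written sentence):

tagged_sent = [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
tree = nltk.RegexpParser(r"NP: {<DT><NN>}").parse(tagged_sent)
print nltk.chunk.tree2conlltags(tree)
# [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]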
def extract_annotations_from_IOB_tags(input_dict):
    group_annotation_name = input_dict['group_annotation']
    element_annotation_name = input_dict['element_annotation']
    element_iob_feature_name = input_dict['element_iob_feature_name']
    element_pos_feature_name = input_dict['element_pos_feature_name']
    output_annotation_name = input_dict['output_annotation']
    labels = set([l.strip() for l in input_dict['labels'].split(",")])
    adc = input_dict['adc']

    for doc in adc.documents:
        if doc.features['contentType'] == "Text":
            if not doc.text:
                pass
            _, annotations_grouped = doc.get_grouped_annotations_with_texts(element_annotation_name,
                                                                            group_annotation_name)

            for element_annotations in annotations_grouped:
                conll_tags = [(ann, ann.features[element_pos_feature_name], ann.features[element_iob_feature_name])
                              for ann in element_annotations]
                tree = nltk.chunk.conlltags2tree(conll_tags)
                for label in labels:
                    for subtree in tree.subtrees(filter=lambda t: t.label() == label):
                        # print the noun phrase as a list of part-of-speech tagged words
                        leaves = subtree.leaves()
                        doc.annotations.append(Annotation(leaves[0][0].span_start,
                                                          leaves[-1][0].span_end,
                                                          output_annotation_name + "_" + label,
                                                          features={'Chunk Label': subtree.label()}))

                #for iob_tag,annotation in izip([a[2] for a in aaa],element_annotations):
                #    annotation.features[output_annotation_name]=iob_tag

            #for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
            #    for feature,annotation in izip(sentence_features,sentence_annotations):
            #        annotation.features[output_annotation_name]=feature[1] #[0:number_of_letters]

    return {'adc': adc}
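Editor's note: a self-contained sketch of the NLTK calls this function leans on, with plain words standing in for the Annotation objects used above (not part of the commit):

conll_tags = [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]
tree = nltk.chunk.conlltags2tree(conll_tags)
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
    print subtree.leaves()   # [('the', 'DT'), ('cat', 'NN')]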
def flatten_deeptree(tree):
    '''
    >>> flatten_deeptree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
    Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
    '''
    # editor's note: the commit calls tree.lable(), which raises AttributeError;
    # tree.label() is the intended NLTK Tree accessor
    return Tree(tree.label(), flatten_childtrees([c for c in tree]))
def flatten_childtrees(trees):
    children = []

    for t in trees:
        if t.height() < 3:
            children.extend(t.pos())
        elif t.height() == 3:
            children.append(Tree(t.label(), t.pos()))
        else:
            children.extend(flatten_childtrees([c for c in t]))

    return children
def shallow_tree(tree):
    '''
    >>> shallow_tree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]))
    Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])
    '''
    children = []

    for t in tree:
        if t.height() < 3:
            children.extend(t.pos())
        else:
            children.append(Tree(t.label(), t.pos()))

    return Tree(tree.label(), children)
#####################
## tree conversion ##
#####################

def chunk_trees2train_chunks(chunk_sents):
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.

    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    # tree.pos() flattens the tree and produces [(word, label)] where label is
    # from the word's parent tree label. words in a chunk therefore get the
    # chunk tag, while words outside a chunk get the same tag as the tree's
    # top label
    words, ents = zip(*tree.pos())

    iobs = []
    prev = None
    # construct iob tags from entity names
    for ent in ents:
        # any entity that is the same as the tree's top label is outside a chunk
        if ent == tree.label():
            iobs.append('O')
            prev = None
        # have a previous entity that is equal so this is inside the chunk
        elif prev == ent:
            iobs.append('I-%s' % ent)
        # no previous equal entity in the sequence, so this is the beginning of
        # an entity chunk
        else:
            iobs.append('B-%s' % ent)
            prev = ent

    # get tags for each word, then construct 3-tuple for conll tags
    words, tags = zip(*tag(words))
    return zip(words, tags, iobs)
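Editor's note: a small hand-built illustration of the IOB construction above (not part of the commit; the default tag=nltk.tag.pos_tag also needs a downloaded tagger model to fill in the POS column):

t = Tree('NE', [Tree('PERSON', ['Pierre', 'Vinken']), 'joined', Tree('ORGANIZATION', ['Elsevier'])])
print ieertree2conlltags(t)
# words under PERSON/ORGANIZATION become B-/I- tags; 'joined' (same label as the root) becomes 'O'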
#################
## tag chunker ##
#################

# def chunk_tagger_hub:
#     train_sents = conll_tag_chunks(train_chunks)
#     self.tagger = None
#
#     for cls in tagger_classes:
#         self.tagger = cls(train_sents, backoff=self.tagger)

brown = nltk.corpus.brown
########################
## classifier chunker ##
########################

def prev_next_pos_iob(tokens, index, history):
    word, pos = tokens[index]

    if index == 0:
        prevword, prevpos, previob = ('<START>',) * 3
    else:
        prevword, prevpos = tokens[index - 1]
        previob = history[index - 1]

    if index == len(tokens) - 1:
        nextword, nextpos = ('<END>',) * 2
    else:
        nextword, nextpos = tokens[index + 1]

    feats = {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob
    }
    return feats
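Editor's note: what the feature detector returns for a single token, as a quick illustration (not part of the commit):

prev_next_pos_iob([('the', 'DT'), ('cat', 'NN')], 1, ['B-NP'])
# {'word': 'cat', 'pos': 'NN',
#  'prevword': 'the', 'prevpos': 'DT', 'previob': 'B-NP',
#  'nextword': '<END>', 'nextpos': '<END>'}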
class ClassifierChunker(ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
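Editor's note: a minimal training sketch for ClassifierChunker on conll2000 (not part of the commit; the slice keeps the default naive Bayes training quick):

train_sents = conll2000.chunked_sents('train.txt')[:500]
chunker = ClassifierChunker(train_sents)
print chunker.parse([('he', 'PRP'), ('ate', 'VBD'), ('an', 'DT'), ('apple', 'NN')])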
#############
## pattern ##
#############

# class PatternChunker(ChunkParserI):
#     def parse(self, tagged_sent):
#         # don't import at top since don't want to fail if not installed
#         from pattern.en import parse
#         s = ' '.join([word for word, tag in tagged_sent])
#         # not tokenizing ensures that the number of tagged tokens returned is
#         # the same as the number of input tokens
#         sents = parse(s, tokenize=False).split()
#         if not sents: return None
#         return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
workflows/nltoolkit/lib/part_of_speech_tagging.py (view file @ 0e788afa)
...
...
@@ -53,12 +53,18 @@ def extract_pos_tagger_name(input_dict):
-def corpus_reader(corpus):
+def corpus_reader(corpus, extract='tagged_sents'):
     if type(corpus) == DocumentCorpus:
         raise NotImplementedError
+    elif extract == 'chunked_sents':
+        try:
+            return corpus.chunked_sents()
+        except AttributeError:
+            raise "The inputed corpus doesn't include chunked sentences."
     else:
         return corpus.tagged_sents()


 def nltk_default_pos_tagger(input_dict):
     """
     A tagger that assigns the same tag to every token.
...
...
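Editor's note: with the new extract parameter, corpus_reader can hand chunker-training widgets the chunked sentences instead of the default tagged ones. A minimal call sketch (not part of the commit):

from nltk.corpus import conll2000

chunked = corpus_reader(conll2000, extract='chunked_sents')   # conll2000.chunked_sents()
tagged = corpus_reader(conll2000)                             # default: conll2000.tagged_sents()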
workflows/nltoolkit/lib/stemming.py (view file @ 0e788afa)
...
...
@@ -3,8 +3,8 @@ from workflows.tasks import executeFunction
 import nltk
 from workflows.textflows import *
-#from tagging_common import universal_word_tagger_hub
-from tagging_common_parallel import universal_word_tagger_hub
+from tagging_common import universal_word_tagger_hub
+#from tagging_common_parallel import universal_word_tagger_hub


 def stem_lemma_tagger_hub(input_dict):
...
...
workflows/nltoolkit/lib/tagging_common.py (view file @ 0e788afa)
...
...
@@ -35,25 +35,7 @@ def universal_sentence_tagger_hub(input_dict):
         if doc.features['contentType'] == "Text":
             if not doc.text:
                 pass
-            group_annotations = sorted(doc.get_annotations_with_text(group_annotation_name), key=lambda x: x[0].span_start)
-            element_annotations = sorted(doc.get_annotations_with_text(element_annotation_name), key=lambda x: x[0].span_start)
-
-            text_grouped = []  #text_groups= [['First','sentence',['Second','sentance']]
-            annotations_grouped = []  #annotations_grouped= [[<Annotation span_start:0 span_ned:4>, <Annotation span_start:6 span_ned:11>],[...
-
-            i = 0
-            for group_annotation, _ in group_annotations:
-                elements = []
-                sentence_annotations = []
-                #find elementary annotations 'contained' in the group_annotation
-                while i < len(element_annotations) and element_annotations[i][0].span_end <= group_annotation.span_end:
-                    annotation = element_annotations[i][0]
-                    text_block = element_annotations[i][1]
-                    elements.append(text_block)
-                    sentence_annotations.append(annotation)
-                    i += 1
-                text_grouped.append(elements)
-                annotations_grouped.append(sentence_annotations)
+            text_grouped, annotations_grouped = doc.get_grouped_annotations_with_texts(element_annotation_name, group_annotation_name)

             new_features = getattr(tagger, tagger_function)(text_grouped, *args, **kwargs)
             for sentence_features, sentence_annotations in izip(new_features, annotations_grouped):
...
...
workflows/nltoolkit/lib/tagging_common_parallel.py (view file @ 0e788afa)
...
...
@@ -44,7 +44,7 @@ def universal_word_tagger_hub(adc,tagger_dict,input_annotation,output_annotation
     return {'adc': adc}


-def sentance_tag_a_document(doc, tagger, tagger_function, args, kwargs,
+def sentence_tag_a_document(doc, tagger, tagger_function, args, kwargs,
                             element_annotation_name, group_annotation_name, output_annotation_name):
     if doc.features['contentType'] == "Text":
         if not doc.text:
...
...
@@ -95,7 +95,7 @@ def universal_sentence_tagger_hub(input_dict):
     print "evo nas!!!"
     #parallel for document in adc.documents:
-    new_documents = pool.map(partial(sentance_tag_a_document,
+    new_documents = pool.map(partial(sentence_tag_a_document,
                                      tagger=tagger,
                                      tagger_function=tagger_function,
                                      args=args,
...
...
workflows/nltoolkit/lib/triplet_extraction.py (new file, mode 100644; view file @ 0e788afa)
import requests
import json

from workflows.textflows import Annotation


class TripletClient(object):
    def __init__(self, url="http://concreteflows.ijs.si:8080/tripletserver/"):
        self.base_url = url

    @property
    def base_url(self):
        return self._base_url

    @base_url.setter
    def base_url(self, value):
        self._base_url = value

    def reverb(self, text):
        r = requests.post(self.base_url + "api/reverb/extract",
                          data=json.dumps({"text": text}))
        return r.json()

    def ollie(self, text):
        r = requests.post(self.base_url + "api/ollie/extract",
                          data=json.dumps({"text": text}))
        return r.json()
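Editor's note: a minimal call sketch for TripletClient, not part of the commit. It assumes the triplet server at concreteflows.ijs.si is reachable and returns an 'extractions' list whose items carry 'arg1', 'rel' and 'arg2' keys, which is what the hub below relies on:

client = TripletClient()
response = client.ollie("Pierre Vinken will join the board as a nonexecutive director.")
for e in response['extractions']:
    print (e['arg1'], e['rel'], e['arg2'])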
def triplet_extraction_hub(input_dict):
    input_annotation = input_dict['input_annotation']
    output_annotation = input_dict['output_annotation']
    adc = input_dict['adc']

    t = TripletClient()
    all_triplets = []

    for document in adc.documents:
        if document.features['contentType'] == "Text":
            if not document.text:
                pass
            for annotation, subtext in document.get_annotations_with_text(input_annotation):  #all annotations of this type
                if subtext:
                    if False:
                        extractions = t.reverb(subtext)['extractions']
                        if extractions:
                            most_confident = max(extractions, key=lambda a: a['conf'])
                            print "aaa", subtext

                            subject = most_confident['arg1']
                            print subject
                            start = annotation.span_start + subtext.find(subject)
                            end = start + len(subject) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_subject"))

                            verb = most_confident['rel']
                            start = end + subtext[end:].find(verb)
                            end = start + len(verb) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_verb"))

                            predicate = most_confident['arg2']
                            start = end + subtext[end:].find(predicate)
                            end = start + len(predicate) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_predicate"))
                    else:
                        #ollie extractor
                        extractions = t.ollie(subtext)['extractions']
                        if False:
                            most_confident = max(extractions, key=lambda a: a['confidence'])

                            subject = most_confident['arg1']
                            if subtext.find(subject) == -1:
                                aaa = 3
                            start = annotation.span_start + subtext.find(subject)
                            end = start + len(subject) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_subject"))

                            verb = most_confident['rel'].replace("be ", "").replace("Be ", "")
                            if subtext[end:].find(verb) == -1:
                                aaa = 3
                            start = end + subtext[end:].find(verb)
                            end = start + len(verb) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_verb"))

                            predicate = most_confident['arg2']
                            if subtext[end:].find(predicate) == -1:
                                aaa = 3
                            start = end + subtext[end:].find(predicate)
                            end = start + len(predicate) - 1
                            print subtext[start:end]
                            document.annotations.append(Annotation(start, end, output_annotation + "_predicate"))
                        else:
                            triplets = [(e['arg1'], e['rel'], e['arg2']) for e in extractions]
                            annotation.features[output_annotation] = triplets
                            all_triplets.extend(triplets)

    return {'adc': adc, 'triplets': all_triplets}
# text_grouped=[] #text_groups= [['First','sentence',['Second','sentance']]
# annotations_grouped=[] #annotations_grouped= [[<Annotation span_start:0 span_ned:4>, <Annotation span_start:6 span_ned:11>],[...
#
# i=0
# for group_annotation,_ in group_annotations:
# elements=[]
# sentence_annotations=[]
# #find elementary annotations 'contained' in the group_annotation
# while i<len(element_annotations) and element_annotations[i][0].span_end<=group_annotation.span_end:
# annotation=element_annotations[i][0]
# text_block=element_annotations[i][1]
# elements.append(text_block)
# sentence_annotations.append(annotation)
# i+=1
# text_grouped.append(elements)
# annotations_grouped.append(sentence_annotations)
#
# new_features=getattr(tagger,tagger_function)(text_grouped,*args,**kwargs)
# for sentence_features, sentence_annotations in izip(new_features,annotations_grouped):
# for feature,annotation in izip(sentence_features,sentence_annotations):
# annotation.features[output_annotation_name]=feature[1] #[0:number_of_letters]
\ No newline at end of file
workflows/nltoolkit/library.py (view file @ 0e788afa)
 from lib.bag_of_words import *
 from lib.classification import *
 #from lib.clustering import *
+from lib.chunking import *
 from lib.dataset import *
 from lib.document_corpus import *
 #from lib.similarity_matrix import *
...
...
@@ -8,4 +9,5 @@ from lib.part_of_speech_tagging import *
 from lib.stop_word_removal import *
 from lib.stemming import *
 from lib.textual_data_in_out import *
-from lib.tokenization import *
\ No newline at end of file
+from lib.tokenization import *
+from lib.triplet_extraction import *
\ No newline at end of file
workflows/nltoolkit/package_data/categories/6091637f-b3cd-4559-b525-18357b02dbc2.json (view file @ 0e788afa)
...
...
@@ -3,7 +3,7 @@
"fields"
:
{
"name"
:
"Bag of Words"
,
"parent"
:
null
,
"order"
:
5
,
"order"
:
6
,
"uid"
:
"6091637f-b3cd-4559-b525-18357b02dbc2"
}
}
\ No newline at end of file
workflows/nltoolkit/package_data/widgets/4d280ddb-4446-484d-8b2d-e1d5c93567c2.json (view file @ 0e788afa)
...
...
@@ -5,7 +5,7 @@
"category"
:
"d41c87c3-c0c9-4073-8328-d4c6d5e8f185"
,
"treeview_image"
:
""
,
"uid"
:
"4d280ddb-4446-484d-8b2d-e1d5c93567c2"
,
"
is_streaming
"
:
false
,
"
windows_queue
"
:
false
,
"package"
:
"nltoolkit"
,
"interaction_view"
:
""
,
"has_progress_bar"
:
false
,
...
...
@@ -19,7 +19,7 @@
"wsdl_method"
:
""
,