Darko Aleksovski / cf_noise / Commits / ea1b9fdd

Commit ea1b9fdd, authored Jan 11, 2016 by Darko Aleksovski

Widget modifications

parent 62ede229

Changes 23
cf_noise/library.py
+import cf_noise.utilities as u
+import cf_data_mining.dataset as d
 
 # ===================================================================
 # HARF (HIGH AGREEMENT RANDOM FOREST)
 def harf(input_dict):
     #import orngRF_HARF
-    from cf_base.helpers import UnpicklableObject
+    from cf_core.helpers import UnpicklableObject
     agrLevel = input_dict['agr_level']
     #data = input_dict['data']
-    harfout = UnpicklableObject("orngRF_HARF.HARFLearner(agrLevel =" + agrLevel + ", name='HARF-" + str(agrLevel) + "')")
-    harfout.addimport("import orngRF_HARF")
+    harfout = UnpicklableObject("cf_noise.orngRF_HARF.HARFLearner(agrLevel =" + agrLevel + ", name='HARF-" + str(agrLevel) + "')")
+    harfout.addimport("import cf_noise.orngRF_HARF")
     #harfLearner = orngRF_HARF.HARFLearner(agrLevel = agrLevel, name = "_HARF-"+agrLevel+"_")
     output_dict = {}
     output_dict['harfout'] = harfout
...
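UnpicklableObject itself is not shown in this commit; judging from its use here (a constructor expression stored as a string, plus addimport() and the generate() call that appears later in this diff), it defers building the learner until the stored string is evaluated, so widget outputs stay picklable. A minimal stand-in sketch of that pattern, under those assumptions (DeferredObject is a hypothetical name, not the cf_core class):

    # Hypothetical stand-in for cf_core.helpers.UnpicklableObject, for illustration only.
    class DeferredObject(object):
        """Holds a constructor expression as a string so the stored object stays picklable."""
        def __init__(self, expression):
            self.expression = expression
            self.imports = []

        def addimport(self, import_statement):
            self.imports.append(import_statement)

        def generate(self):
            # Evaluate the stored expression in a namespace with the imports applied.
            namespace = {}
            for statement in self.imports:
                exec(statement, namespace)
            return eval(self.expression, namespace)

    # Usage mirroring harf(): the object is only constructed when generate() is called.
    deferred = DeferredObject("complex(2, 3)")  # stands in for the HARFLearner expression
    deferred.addimport("import math")           # not needed here; shows the addimport hook
    print(deferred.generate())                  # (2+3j)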
@@ -18,23 +21,39 @@ def harf(input_dict):
 def classification_filter(input_dict, widget):
     import cf_noise.noiseAlgorithms4lib as nalg
     output_dict = {}
-    # output_dict['noise_dict']= noiseAlgorithms4lib.cfdecide(input_dict, widget)
-    output_dict['noise_dict'] = nalg.cfdecide(input_dict, widget=None)
+    # output_dict['noise_dict']= noiseAlgorithms4lib.cf_decide(input_dict, widget)
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
+    output_dict['noise_dict'] = nalg.cf_decide(input_dict['learner'], orange_dataset, int(input_dict['k_folds']), widget=None)
     return output_dict
 # SATURATION NOISE FILTER
 def saturation_filter(input_dict, widget):
     import cf_noise.noiseAlgorithms4lib as nalg
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
     if not (input_dict['satur_type'] in ['normal', 'prune']):
         raise Exception("Only 'normal' or 'prune' allowed for 'satur_type'.")
     output_dict = {}
-    output_dict['noise_dict'] = nalg.saturation_type(input_dict['data'], widget)
+    output_dict['noise_dict'] = nalg.saturation_type(orange_dataset, input_dict['satur_type'], widget)
     return output_dict
 # NOISE RANK
 def noiserank(input_dict):
     """Widget NoiseRank
     :param input_dict:
     :return:
     """
     allnoise = {}
-    data = input_dict['data']
+    data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
     for item in input_dict['noise']:
         det_by = item['name']
         for i in item['inds']:
...
@@ -68,12 +87,18 @@ def compareNoisyExamples(item1, item2):
 def noiserank_select(postdata, input_dict, output_dict):
     try:
         output_dict['indices'] = outselection = [int(i) for i in postdata['selected']]
-        data = input_dict['data']
+        # data = input_dict['data']
+        data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
         selection = [0]*len(data)
         for i in outselection:
             selection[i] = 1
         outdata = data.select(selection, 1)
-        output_dict['selection'] = outdata
+        data_scikit = u.convert_dataset_from_orange_to_scikit(outdata)
+        output_dict['selection'] = data_scikit
     except KeyError:
         output_dict['selection'] = None
...
@@ -83,10 +108,22 @@ def noiserank_select(postdata,input_dict, output_dict):
 # EVALUATION OF NOISE DETECTION PERFORMANCE
 def add_class_noise(input_dict):
     """Widget Add Class Noise
     """
+    data_scikit = input_dict['data']
+    if not (d.is_target_nominal(data_scikit)):
+        raise Exception("Widget Add Class Noise accepts only datasets with nominal class!")
+    data = u.convert_dataset_from_scikit_to_orange(data_scikit)
     import cf_noise.noiseAlgorithms4lib as nalg
-    output_dict = nalg.addClassNoise(input_dict['data'], input_dict['noise_level'], input_dict['rnd_seed'])
+    noise_indices, orange_data = nalg.add_class_noise(data, input_dict['noise_level'], input_dict['rnd_seed'])
+    data = u.convert_dataset_from_orange_to_scikit(orange_data)
+    output_dict = {'noise_inds': noise_indices, 'noisy_data': data}
     return output_dict
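For intuition, here is a minimal, self-contained sketch of the class-noise injection that nalg.add_class_noise performs on the Orange dataset. This stand-in works on a plain list of labels rather than an Orange table, and all names in it are illustrative rather than part of cf_noise:

    import random

    def add_class_noise_sketch(labels, class_values, noise_level, rnd_seed):
        """Relabel a random noise_level fraction of examples; return (indices, labels)."""
        rnd = random.Random(rnd_seed)
        labels = list(labels)
        n_noisy = int(round(noise_level * len(labels)))
        noise_indices = rnd.sample(range(len(labels)), n_noisy)
        for i in noise_indices:
            # Pick a class value different from the current one.
            alternatives = [c for c in class_values if c != labels[i]]
            labels[i] = rnd.choice(alternatives)
        return sorted(noise_indices), labels

    inds, noisy = add_class_noise_sketch(['a', 'a', 'b', 'b', 'a'], ['a', 'b'], 0.4, rnd_seed=42)
    print(inds)   # indices whose class label was flipped, analogous to 'noise_inds'
    print(noisy)  # the corrupted labels, analogous to the class column of 'noisy_data'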
 def aggr_results(input_dict):
...
@@ -99,6 +136,7 @@ def aggr_results(input_dict):
     output_dict['aggr_dict'] = {'positives': input_dict['pos_inds'], 'by_alg': input_dict['detected_inds']}
     return output_dict
 
 def eval_batch(input_dict):
     """Widget "Evaluate Repeated Detection"
     """
...
@@ -160,71 +198,65 @@ def eval_noise_detection(input_dict):
     output_dict['nd_eval'] = sorted(performance, key=itemgetter('name'))
     return output_dict
 
-def avrg_std(input_dict):
-    """Widget "Average and Standard Deviation" which for some reason is missing from source.ijs.si
-    -> to be connected on the left using widget "Evaluate Repeated Detection" (eval_batch)
+# ENSEMBLE
+def noise_detect_ensemble(input_dict):
+    """ Noise detection ensemble
+    :param input_dict:
+    :return:
     """
-    perf_results = input_dict['perf_results']
-    stats = {}
-    # Aggregate performance results
-    n = len(perf_results)
-    for i in range(n):
-        for item in perf_results[i]:
-            alg = item['name']
-            if not stats.has_key(alg):
-                stats[alg] = {}
-                stats[alg]['precisions'] = [item['precision']]
-                stats[alg]['recalls'] = [item['recall']]
-                stats[alg]['fscores'] = [item['fscore']]
-                stats[alg]['fbeta'] = item['fbeta']
+    import math
+    ens = {}
+    data_inds = input_dict['data_inds']
+    ens_type = input_dict['ens_type']
+    for item in data_inds:
+        #det_by = item['detected_by']
+        for i in item['inds']:
+            if not ens.has_key(i):
+                ens[i] = 1
             else:
-                stats[alg]['precisions'].append(item['precision'])
-                stats[alg]['recalls'].append(item['recall'])
-                stats[alg]['fscores'].append(item['fscore'])
-            # if last experiment: compute averages
-            if i == n-1:
-                stats[alg]['avrg_pr'] = reduce(lambda x, y: x + y, stats[alg]['precisions'])/n
-                stats[alg]['avrg_re'] = reduce(lambda x, y: x + y, stats[alg]['recalls'])/n
-                stats[alg]['avrg_fs'] = reduce(lambda x, y: x + y, stats[alg]['fscores'])/n
-    # Compute Standard Deviations
-    import numpy
-    avrgstdout = []
-    print stats
-    for alg, stat in stats.items():
-        avrgstdout.append({'name': alg,
-                           'precision': stat['avrg_pr'],
-                           'recall': stat['avrg_re'],
-                           'fscore': stat['avrg_fs'],
-                           'fbeta': stat['fbeta'],
-                           'std_pr': numpy.std(stat['precisions']),
-                           'std_re': numpy.std(stat['recalls']),
-                           'std_fs': numpy.std(stat['fscores'])})
+                ens[i] += 1
+    ens_out = {}
+    ens_out['name'] = input_dict['ens_name']
+    ens_out['inds'] = []
+    n_algs = len(data_inds)
+    print ens_type
+    if ens_type == "consensus":
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] == n_algs])
+    else:
+        # majority
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] >= math.floor(n_algs/2 + 1)])
 
-    from operator import itemgetter
     output_dict = {}
-    output_dict['avrg_w_std'] = sorted(avrgstdout, key=itemgetter('name'))
+    output_dict['ens_out'] = ens_out
     return output_dict
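The ensemble above is plain voting over the index lists that the individual filters report. A self-contained sketch of the same computation, with illustrative stand-alone names:

    import math

    def vote(detections, ens_type):
        """detections: list of {'name': ..., 'inds': [...]} dicts, as the filters produce."""
        counts = {}
        for item in detections:
            for i in item['inds']:
                counts[i] = counts.get(i, 0) + 1
        n_algs = len(detections)
        if ens_type == "consensus":
            threshold = n_algs                       # flagged by every detector
        else:
            threshold = math.floor(n_algs / 2.0 + 1)  # majority: more than half
        return sorted(i for i, c in counts.items() if c >= threshold)

    detections = [{'name': 'cf',   'inds': [1, 4, 7]},
                  {'name': 'sf',   'inds': [4, 7]},
                  {'name': 'harf', 'inds': [2, 4]}]
    print(vote(detections, "consensus"))  # [4]
    print(vote(detections, "majority"))   # [4, 7]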
-# VISUALIZATIONS
-def pr_space(input_dict):
-    return {}
-
-def eval_bar_chart(input_dict):
-    return {}
-
+# VISUALIZATIONS
 def eval_to_table(input_dict):
     """Widget Evaluation Results to Table"""
     return {}
 
-def data_table(input_dict):
-    return {}
-
-def data_info(input_dict):
-    return {}
-
-def definition_sentences(input_dict):
-    return {}
-
-def term_candidates(input_dict):
-    return {}
+# def pr_space(input_dict):
+#     return {}
+#
+# def eval_bar_chart(input_dict):
+#     return {}
+#
+#
+# def data_table(input_dict):
+#     return {}
+#
+# def data_info(input_dict):
+#     return {}
+#
+# def definition_sentences(input_dict):
+#     return {}
+#
+# def term_candidates(input_dict):
+#     return {}
cf_noise/noiseAlgorithms4lib.py
-import orange, orngTree, random
+import random
 
-def addClassNoise(data, noise_level, rnd_seed):
+import cf_data_mining.classifier as c
+import orange
+import orngTree
+from cf_core.helpers import UnpicklableObject
+from cf_noise.utilities import convert_dataset_from_orange_to_scikit
+
+def add_class_noise(data, noise_level, rnd_seed):
     """adds class Noise
     :param data: Orange dataset
...
@@ -43,9 +51,9 @@ def addClassNoise(data, noise_level, rnd_seed):
#print "\t", temp, "changed to:", data[index].getclass(), "(", index, ")"
#print "\n"
noise_indices
.
sort
()
return
{
'noise_inds'
:
noise_indices
,
'noisy_data'
:
data
}
return
noise_indices
,
data
def
add
M
eta
ID
(
data
):
def
add
_m
eta
_id
(
data
):
meta_id
=
orange
.
FloatVariable
(
"meta_id"
)
mid
=
orange
.
newmetaid
()
while
mid
in
data
.
domain
.
getmetas
().
keys
():
...
...
@@ -54,88 +62,39 @@ def addMetaID(data):
     for i in range(len(data)):
         data[i][meta_id] = i
 
-def cfdecide(input_dict, widget):
+def cf_decide(learner, orange_dataset, k_folds, widget):
     """Classification filter decide
-    :param input_dict:
+    :param learner: Classifier object
+    :param orange_dataset:
+    :param k_folds:
     :param widget:
     :return:
     """
-    from pysimplesoap.client import SoapFault
-    somelearner = input_dict['learner']
-    print somelearner
+    # somelearner = input_dict['learner']
+    print learner
     # SWITCH TO PROCESSING WITH WEKA CLASSIFIERS
-    if type(somelearner) == unicode or type(somelearner) == str:
-        # from services.webservice import WebService
-        from cf_base.helpers import WebService
-        wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', float(input_dict['timeout']))
-        name = ""
-        try:
-            name = wsutil.client.print_model(model=somelearner)['model_as_string']
-            print wsutil.client.print_model(model=somelearner), name
-        except SoapFault:
-            print "Soap fault: unicode string is not a Weka classification learner/model."
-            return {}
-        return cfweka(somelearner, input_dict['data'], int(input_dict['k_folds']), float(input_dict['timeout']), name, widget)
+    if isinstance(learner, c.Classifier):
+        name = learner.print_classifier()
+        return cf_run(learner, orange_dataset, k_folds, name, widget)
     else:
-        return cforange(input_dict, widget)
+        return cf_run_harf(learner, orange_dataset, k_folds, widget)
+    # else:
+    #     raise Exception("Provided learner is in an unsupported format", str(learner))
 
-def cforange(input_dict, widget):
-    """Classification filter for Orange learner
-    :param input_dict:
-    :param widget:
-    :return:
-    """
-    # from workflows.helpers import UnpicklableObject
-    from cf_base.helpers import UnpicklableObject
-    somelearner = input_dict['learner']
-    print "Before generate"
-    learner = somelearner if not isinstance(somelearner, UnpicklableObject) else somelearner.generate()
-    print "After generate"
-    data = input_dict['data']
-    print len(data)
-    addMetaID(data)
-    print 'Before for loop'
-    k = int(input_dict['k_folds'])
-    noisyIndices = []
-    selection = orange.MakeRandomIndicesCV(data, folds=k)
-    count_noisy = [0]*k
-    print 'Before for loop'
-    for test_fold in range(k):
-        train_data = data.select(selection, test_fold, negate=1)
-        test_data = data.select(selection, test_fold)
-        #print "\t\t", "Learned on", len(train_data), "examples"
-        #file.flush()
-        print 'Before classifier construction'
-        #print learner.hovername if learner.hovername != None else "ni hovernamea"
-        classifier = learner(train_data)
-        print 'After classifier construction'
-        for example in test_data:
-            exclassified = classifier(example)
-            if exclassified != None and exclassified != example.getclass():
-                # selection_filter[int(example[meta_id].value)] = 0
-                noisyIndices.append(int(example["meta_id"].value))
-                count_noisy[test_fold] += 1
-        # END test_data
-        widget.progress = int((test_fold + 1)*1.0/k*100)
-        widget.save()
-    # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': learner.name}
-## filtered_data = data.select(selection_filter, 1)
-## noisy_data = data.select(selection_filter, 0)
-## return [filtered_data, noisy_data]=======
 
-def cfweka(learner, data, k_folds, timeout, name, widget=None):
-    """Classification filter for a Weka learner
-    :param learner: Weka learner, serialized
+def cf_run(learner, data, k_folds, name, widget=None):
+    """Runs a classification filter
+    :param learner: Weka Classifier
     :param data: Orange dataset
     :param k_folds:
     :param name:
...
@@ -144,10 +103,6 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
     :return:
     """
-    from cf_base.helpers import WebService
-    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl', timeout)
-    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', timeout)
     somelearner = learner
     print somelearner
...
@@ -155,23 +110,32 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
     selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
     count_noisy = [0]*k_folds
     for test_fold in range(k_folds):
-        train_arffstr = toARFFstring(data.select(selection, test_fold, negate=1)).getvalue()
-        train_data = wsutil.client.arff_to_weka_instances(arff=train_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']
+        # train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        train_data = convert_dataset_from_orange_to_scikit(data.select(selection, test_fold, negate=1))
         test_inds = [i for i in range(len(selection)) if selection[i] == test_fold]
-        test_arffstr = toARFFstring(data.select(selection, test_fold)).getvalue()
-        test_data = wsutil.client.arff_to_weka_instances(arff=test_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']
+        # test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        test_data = convert_dataset_from_orange_to_scikit(data.select(selection, test_fold))
         #print "\t\t", "Learned on", len(train_data), "examples"
         #file.flush()
-        print "pred cl build"
-        classifier = wseval.client.build_classifier(learner=somelearner, instances=train_data)['classifier']
-        print "po cl build"
-        eval_test_data = wseval.client.apply_classifier(classifier=classifier, instances=test_data)
-        print "po eval"
-        for i in range(len(eval_test_data)):
+        print "before cl build"
+        # classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
+        learner.build_classifier(train_data)
+        print "after cl build"
+        # eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
+        scikit_dataset_predicted = learner.apply_classifier(test_data)
+        print "after apply"
+        for i in range(len(scikit_dataset_predicted.target)):
             #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
-            print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
-            if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            # print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
+            # if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            if scikit_dataset_predicted.target[i] != scikit_dataset_predicted.targetPredicted[i]:
                 # selection_filter[int(example[meta_id].value)] = 0
                 noisyIndices.append(test_inds[i])
                 count_noisy[test_fold] += 1
...
@@ -180,7 +144,53 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
         widget.progress = int((test_fold + 1)*1.0/k_folds*100)
         widget.save()
     # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}
+    return {'inds': sorted(noisyIndices), 'name': get_weka_name(name)}
+
+def cf_run_harf(learner, data_orange, k_folds, widget=None):
+    """Classification filter for HARF learner
+    :param learner:
+    :param data_orange:
+    :param k_folds:
+    :param widget:
+    :return:
+    """
+    somelearner = learner
+    print "Before generate"
+    learner = somelearner if not isinstance(somelearner, UnpicklableObject) else somelearner.generate()
+    print "After generate"
+    # data_orange = input_dict['data_orange']
+    print len(data_orange)
+    add_meta_id(data_orange)
+    print 'Before for loop'
+    k = k_folds
+    noisyIndices = []
+    selection = orange.MakeRandomIndicesCV(data_orange, folds=k)
+    count_noisy = [0]*k
+    print 'Before for loop'
+    for test_fold in range(k):
+        train_data = data_orange.select(selection, test_fold, negate=1)
+        test_data = data_orange.select(selection, test_fold)
+        #print "\t\t", "Learned on", len(train_data), "examples"
+        #file.flush()
+        print 'Before classifier construction'
+        #print learner.hovername if learner.hovername != None else "ni hovernamea"
+        classifier = learner(train_data)
+        print 'After classifier construction'
+        for example in test_data:
+            exclassified = classifier(example)
+            if exclassified != None and exclassified != example.getclass():
+                # selection_filter[int(example[meta_id].value)] = 0
+                noisyIndices.append(int(example["meta_id"].value))
+                count_noisy[test_fold] += 1
+        # END test_data
+        if not (widget is None):
+            widget.progress = int((test_fold + 1)*1.0/k*100)
+            widget.save()
+    # END test_fold
+    return {'inds': sorted(noisyIndices), 'name': learner.name}
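Both cf_run and cf_run_harf implement the same classification-filter idea: split the data into k folds, train on k-1 folds, and flag every held-out example the classifier gets wrong. A self-contained sketch with a toy 1-nearest-neighbour learner standing in for the Orange/Weka classifiers (all names here are illustrative, not part of cf_noise):

    import random

    def one_nn(train):
        """Toy stand-in learner: 1-nearest-neighbour on 1-D points (x, label)."""
        def classify(x):
            nearest = min(train, key=lambda ex: abs(ex[0] - x))
            return nearest[1]
        return classify

    def classification_filter_sketch(examples, k_folds, rnd_seed=0):
        """Return indices of examples misclassified under k-fold cross-validation."""
        rnd = random.Random(rnd_seed)
        folds = [i % k_folds for i in range(len(examples))]  # balanced folds,
        rnd.shuffle(folds)                                   # like MakeRandomIndicesCV
        noisy = []
        for test_fold in range(k_folds):
            train = [ex for ex, f in zip(examples, folds) if f != test_fold]
            classifier = one_nn(train)
            for i, (ex, f) in enumerate(zip(examples, folds)):
                if f == test_fold and classifier(ex[0]) != ex[1]:
                    noisy.append(i)
        return sorted(noisy)

    # Point 2 carries a label inconsistent with its neighbours, so it is flagged;
    # borderline points may occasionally be flagged as well.
    examples = [(0.0, 'a'), (0.1, 'a'), (0.2, 'b'), (1.0, 'b'), (1.1, 'b'), (1.2, 'b')]
    print(classification_filter_sketch(examples, k_folds=3))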
 def saturation_type(dataset, satur_type='normal', widget=None):
     """Saturation filter
...
@@ -191,7 +201,7 @@ def saturation_type(dataset, satur_type='normal', widget=None):
     :return:
     """
-    addMetaID(dataset)
+    add_meta_id(dataset)
     if not (widget == None):
         widget.progress = 0
         widget.save()
...
@@ -200,7 +210,7 @@ def saturation_type(dataset, satur_type='normal', widget=None):
     progress_steps = (3*data_len**2 + 2*data_len)/8
     # provided max allowed iter steps (k) = data_len/2
     if satur_type == 'prune':
         if not dataset.hasMissingValues():
-            return pruneSF(dataset, 1, progress_steps, widget)
+            return prune_sf(dataset, 1, progress_steps, widget)
         else:
             raise Exception("Pre-pruned saturation filtering requires data WITHOUT missing values!")
     else:
...
@@ -210,7 +220,7 @@ def cmplx(set):
     classifier = orngTree.TreeLearner(set, sameMajorityPruning=1, mForPruning=0)
     return orngTree.countNodes(classifier)
 
-def findNoise(data):
+def find_noise(data):
     n = len(data)
     noisiest = []
     gE = cmplx(data)
...
@@ -264,7 +274,7 @@ def saturation(dataset, widget):
     workSet = orange.ExampleTable(dataset)
     while k != 0:
         n = len(workSet)
-        satfilter = findNoise(workSet)
+        satfilter = find_noise(workSet)
         if satfilter == [1, []]:
             print "\t\t", satfilter
         if not (widget == None):
...
@@ -321,10 +331,10 @@ def findPrunableNoisy(node, minExmplsInLeaf):
     else:
         raise TypeError, "TreeNode expected"
 
-def excludePruned(dataset, classifier, minExmplsInLeaf):
+def exclude_pruned(dataset, classifier, minExmplsInLeaf):
     print "in exclude"
     toPrune = findPrunableNoisy(classifier.tree, minExmplsInLeaf)
-    uniqueItems(toPrune)
+    unique_items(toPrune)
     print "\t\t", "Leaves with", minExmplsInLeaf, "or less examples will be pruned."
     print "\t\t", "IDs of examples excluded by pruning:", toPrune
     #file.flush()
...
@@ -338,7 +348,7 @@ def excludePruned(dataset, classifier, minExmplsInLeaf):
     #return [noisyA, dataset]
     return [toPrune, workSet]
 
-def uniqueItems(list):
+def unique_items(list):
     list.sort()
     k = 0
     while k < len(list) - 1:
...
@@ -347,7 +357,7 @@ def uniqueItems(list):
     else:
         k += 1
 
-def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
+def prune_sf(data, minExmplsInLeaf, progress_steps, widget=None):
     """Prune Saturation Filter
     :param data:
...
@@ -362,8 +372,8 @@ def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
     classifier = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=0, storeExamples=1)
     print "\t\t", "Classifier complexity:\t", orngTree.countNodes(classifier), "nodes."
     #file.flush()
-    ## [noisyA, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
-    [noisePruned, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
+    ## [noisyA, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
+    [noisePruned, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
     print "\t\t", len(noisePruned), "example(s) were excluded by pruning."
     #file.flush()
     classifier2 = orngTree.TreeLearner(dataset, sameMajorityPruning=1, mForPruning=0, storeExamples=1)
...
@@ -395,67 +405,8 @@ def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
     #return noisePruned
 
-# to ARFF String
-def toARFFstring(table, try_numericize=0):  #filename,table,try_numericize=0):
-    import cStringIO, string
-    t = table
-    #if filename[-5:] == ".arff":
-    #    filename = filename[:-5]
-    #print filename
-    f = cStringIO.StringIO()
-    f.write('@relation %s\n' % t.domain.classVar.name)
-    # attributes
-    ats = [i for i in t.domain.attributes]
-    ats.append(t.domain.classVar)
-    for i in ats:
-        real = 1
-        if i.varType == 1:
-            if try_numericize:
-                # try if all values numeric
-                for j in i.values:
-                    try:
-                        x = string.atof(j)
-                    except:
-                        real = 0  # failed
-                        break
-            else:
-                real = 0
-        iname = str(i.name)
-        if string.find(iname, " ") != -1:
-            iname = "'%s'" % iname
-        if real == 1:
-            f.write('@attribute %s real\n' % iname)
-        else:
-            f.write('@attribute %s { ' % iname)
-            x = []
-            for j in i.values:
-                s = str(j)
-                if string.find(s, " ") == -1:
-                    x.append("%s" % s)
-                else:
-                    x.append("'%s'" % s)
-            for j in x[:-1]:
-                f.write('%s,' % j)
-            f.write('%s }\n' % x[-1])
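The removed toARFFstring helper serialized an Orange table into ARFF text. For reference, a condensed, self-contained sketch that produces the same @relation/@attribute shape (illustrative names; it covers only the numeric and nominal cases shown above):

    import io

    def to_arff_string_sketch(relation, attributes, class_attr):
        """attributes/class_attr are (name, values) pairs; values=None means numeric."""
        f = io.StringIO()
        f.write(u'@relation %s\n' % relation)
        for name, values in attributes + [class_attr]:
            quoted = "'%s'" % name if ' ' in name else name  # quote names with spaces
            if values is None:
                f.write(u'@attribute %s real\n' % quoted)
            else:
                f.write(u'@attribute %s { %s }\n' % (quoted, ','.join(values)))
        return f.getvalue()

    print(to_arff_string_sketch('iris',
                                [('petal length', None)],
                                ('class', ['setosa', 'versicolor'])))
    # @relation iris
    # @attribute 'petal length' real
    # @attribute class { setosa,versicolor }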