Commit ea1b9fdd authored by Darko Aleksovski

Widget modifications

parent 62ede229
+import cf_noise.utilities as u
+import cf_data_mining.dataset as d

# ===================================================================
# HARF (HIGH AGREEMENT RANDOM FOREST)

def harf(input_dict):
    #import orngRF_HARF
-    from cf_base.helpers import UnpicklableObject
+    from cf_core.helpers import UnpicklableObject
    agrLevel = input_dict['agr_level']
    #data = input_dict['data']
-    harfout = UnpicklableObject("orngRF_HARF.HARFLearner(agrLevel ="+agrLevel+", name='HARF-"+str(agrLevel)+"')")
-    harfout.addimport("import orngRF_HARF")
+    harfout = UnpicklableObject("cf_noise.orngRF_HARF.HARFLearner(agrLevel ="+agrLevel+", name='HARF-"+str(agrLevel)+"')")
+    harfout.addimport("import cf_noise.orngRF_HARF")
    #harfLearner = orngRF_HARF.HARFLearner(agrLevel = agrLevel, name = "_HARF-"+agrLevel+"_")
    output_dict = {}
    output_dict['harfout']= harfout
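# Illustrative sketch, not part of this commit: how the lazily-built learner
# above is meant to be consumed. It assumes UnpicklableObject stores the
# constructor expression plus its imports and that generate() evaluates them
# on demand (cf_run_harf below relies on exactly this behaviour).
def harf_usage_example():
    from cf_core.helpers import UnpicklableObject
    spec = UnpicklableObject("cf_noise.orngRF_HARF.HARFLearner(agrLevel =70, name='HARF-70')")
    spec.addimport("import cf_noise.orngRF_HARF")
    learner = spec.generate()  # instantiated only here, keeping the widget output picklable
    return learner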
@@ -18,23 +21,39 @@ def harf(input_dict):
def classification_filter(input_dict, widget):
    import cf_noise.noiseAlgorithms4lib as nalg
    output_dict = {}
-    # output_dict['noise_dict']= noiseAlgorithms4lib.cfdecide(input_dict, widget)
-    output_dict['noise_dict']= nalg.cfdecide(input_dict, widget=None)
+    # output_dict['noise_dict']= noiseAlgorithms4lib.cf_decide(input_dict, widget)
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
+    output_dict['noise_dict']= nalg.cf_decide(input_dict['learner'], orange_dataset, int(input_dict['k_folds']), widget=None)
    return output_dict
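# Illustrative only (the variable names are hypothetical placeholders): the
# inputs the rewritten widget now expects -- a scikit-style dataset, a
# classifier object and a fold count, with widget inputs typically arriving
# as strings.
#
#   input_dict = {'data': some_scikit_dataset,
#                 'learner': some_classifier,   # e.g. the 'harfout' output of harf()
#                 'k_folds': "10"}
#   result = classification_filter(input_dict, widget=None)
#   result['noise_dict']['inds']   # indices of examples flagged as noisy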
# SATURATION NOISE FILTER

def saturation_filter(input_dict, widget):
    import cf_noise.noiseAlgorithms4lib as nalg
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
+    if not(input_dict['satur_type'] in ['normal', 'prune']):
+        raise Exception("Only 'normal' or 'prune' allowed for 'satur_type'.")
    output_dict = {}
-    output_dict['noise_dict']= nalg.saturation_type(input_dict['data'], widget)
+    output_dict['noise_dict']= nalg.saturation_type(orange_dataset, input_dict['satur_type'], widget)
    return output_dict
# NOISE RANK

def noiserank(input_dict):
    """Widget NoiseRank

    :param input_dict:
    :return:
    """
    allnoise = {}
-    data = input_dict['data']
+    data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
    for item in input_dict['noise']:
        det_by = item['name']
        for i in item['inds']:
@@ -68,12 +87,18 @@ def compareNoisyExamples(item1, item2):
def noiserank_select(postdata, input_dict, output_dict):
    try:
        output_dict['indices'] = outselection = [int(i) for i in postdata['selected']]
-        data = input_dict['data']
+        # data = input_dict['data']
+        data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
        selection = [0]*len(data)
        for i in outselection:
            selection[i] = 1
        outdata = data.select(selection, 1)
-        output_dict['selection'] = outdata
+        data_scikit = u.convert_dataset_from_orange_to_scikit(outdata)
+        output_dict['selection'] = data_scikit
    except KeyError:
        output_dict['selection'] = None
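# Sketch (assumes Orange 2's ExampleTable.select semantics used throughout
# this file): the 0/1 mask built above keeps exactly the rows whose mask
# value matches the second argument, i.e. it is equivalent to:
#
#   selection = [1 if i in outselection else 0 for i in range(len(data))]
#   outdata = data.select(selection, 1)   # keep examples where mask == 1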
@@ -83,10 +108,22 @@ def noiserank_select(postdata,input_dict, output_dict):
# EVALUATION OF NOISE DETECTION PERFORMANCE

def add_class_noise(input_dict):
    """Widget Add Class Noise
    """
+    data_scikit = input_dict['data']
+    if not(d.is_target_nominal(data_scikit)):
+        raise Exception("Widget Add Class Noise accepts only datasets with a nominal class!")
+    data = u.convert_dataset_from_scikit_to_orange(data_scikit)
    import cf_noise.noiseAlgorithms4lib as nalg
-    output_dict = nalg.addClassNoise(input_dict['data'], input_dict['noise_level'], input_dict['rnd_seed'])
+    noise_indices, orange_data = nalg.add_class_noise(data, input_dict['noise_level'], input_dict['rnd_seed'])
+    data = u.convert_dataset_from_orange_to_scikit(orange_data)
+    output_dict = {'noise_inds': noise_indices, 'noisy_data': data}
    return output_dict
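# Hypothetical walk-through (variable names are placeholders; the exact
# noise_level semantics live in nalg.add_class_noise below): the widget
# checks for a nominal target, converts scikit -> Orange, corrupts labels,
# and converts back.
#
#   out = add_class_noise({'data': iris_scikit,
#                          'noise_level': "10",
#                          'rnd_seed': "42"})
#   out['noise_inds']    # sorted indices whose class label was changed
#   out['noisy_data']    # scikit-style dataset with the corrupted labels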
def aggr_results(input_dict):
@@ -99,6 +136,7 @@ def aggr_results(input_dict):
    output_dict['aggr_dict'] = { 'positives' : input_dict['pos_inds'], 'by_alg': input_dict['detected_inds']}
    return output_dict

def eval_batch(input_dict):
    """Widget "Evaluate Repeated Detection"
    """
@@ -160,71 +198,65 @@ def eval_noise_detection(input_dict):
    output_dict['nd_eval'] = sorted(performance, key=itemgetter('name'))
    return output_dict

-def avrg_std(input_dict):
-    """Widget "Average and Standard Deviation" which for some reason is missing from source.ijs.si
-    -> to be connected on the left using widget "Evaluate Repeated Detection" (eval_batch)
+# ENSEMBLE
+def noise_detect_ensemble(input_dict):
+    """ Noise detection ensemble
+
+    :param input_dict:
+    :return:
    """
-    perf_results = input_dict['perf_results']
-    stats = {}
-    # Aggregate performance results
-    n = len(perf_results)
-    for i in range(n):
-        for item in perf_results[i]:
-            alg = item['name']
-            if not stats.has_key(alg):
-                stats[alg] = {}
-                stats[alg]['precisions'] = [item['precision']]
-                stats[alg]['recalls'] = [item['recall']]
-                stats[alg]['fscores'] = [item['fscore']]
-                stats[alg]['fbeta'] = item['fbeta']
+    import math
+    ens = {}
+    data_inds = input_dict['data_inds']
+    ens_type = input_dict['ens_type']
+    for item in data_inds:
+        #det_by = item['detected_by']
+        for i in item['inds']:
+            if not ens.has_key(i):
+                ens[i] = 1
            else:
-                stats[alg]['precisions'].append(item['precision'])
-                stats[alg]['recalls'].append(item['recall'])
-                stats[alg]['fscores'].append(item['fscore'])
-            # if last experiment: compute averages
-            if i == n-1:
-                stats[alg]['avrg_pr'] = reduce(lambda x,y: x+y, stats[alg]['precisions'])/n
-                stats[alg]['avrg_re'] = reduce(lambda x,y: x+y, stats[alg]['recalls'])/n
-                stats[alg]['avrg_fs'] = reduce(lambda x,y: x+y, stats[alg]['fscores'])/n
-    # Compute Standard Deviations
-    import numpy
-    avrgstdout = []
-    print stats
-    for alg, stat in stats.items():
-        avrgstdout.append({'name': alg, 'precision': stat['avrg_pr'], 'recall': stat['avrg_re'],
-                           'fscore' : stat['avrg_fs'],
-                           'fbeta' : stat['fbeta'],
-                           'std_pr' : numpy.std(stat['precisions']),
-                           'std_re' : numpy.std(stat['recalls']),
-                           'std_fs' : numpy.std(stat['fscores']) })
+                ens[i] += 1
+
+    ens_out = {}
+    ens_out['name'] = input_dict['ens_name']
+    ens_out['inds'] = []
+    n_algs = len(data_inds)
+    print ens_type
+    if ens_type == "consensus":
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] == n_algs])
+    else: # majority
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] >= math.floor(n_algs/2+1)])

-    from operator import itemgetter
    output_dict = {}
-    output_dict['avrg_w_std'] = sorted(avrgstdout, key=itemgetter('name'))
+    output_dict['ens_out'] = ens_out
    return output_dict
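# Worked example (plain Python, mirroring the logic above): three detectors
# vote on noisy indices; "consensus" keeps indices flagged by all three,
# "majority" keeps those flagged by at least floor(3/2 + 1) = 2 detectors.
def ensemble_example():
    import math
    data_inds = [{'name': 'A', 'inds': [1, 4, 7]},
                 {'name': 'B', 'inds': [1, 4]},
                 {'name': 'C', 'inds': [4, 7]}]
    ens = {}
    for item in data_inds:
        for i in item['inds']:
            ens[i] = ens.get(i, 0) + 1          # count votes per index
    n_algs = len(data_inds)
    consensus = sorted([i for i, v in ens.items() if v == n_algs])
    majority = sorted([i for i, v in ens.items() if v >= math.floor(n_algs/2 + 1)])
    return consensus, majority                  # -> ([4], [1, 4, 7])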
-# VISUALIZATIONS
-def pr_space(input_dict):
-    return {}

-def eval_bar_chart(input_dict):
-    return {}

+# VISUALIZATIONS

def eval_to_table(input_dict):
    """Widget Evaluation Results to Table"""
    return {}

-def data_table(input_dict):
-    return {}

-def data_info(input_dict):
-    return {}

-def definition_sentences(input_dict):
-    return {}

-def term_candidates(input_dict):
-    return {}

+# def pr_space(input_dict):
+#     return {}
+#
+# def eval_bar_chart(input_dict):
+#     return {}
+#
+#
+# def data_table(input_dict):
+#     return {}
+#
+# def data_info(input_dict):
+#     return {}
+#
+# def definition_sentences(input_dict):
+#     return {}
+#
+# def term_candidates(input_dict):
+#     return {}
cf_noise/noiseAlgorithms4lib.py

-import orange, orngTree, random
+import random
+import cf_data_mining.classifier as c
+import orange
+import orngTree
+from cf_core.helpers import UnpicklableObject
+from cf_noise.utilities import convert_dataset_from_orange_to_scikit

-def addClassNoise(data, noise_level, rnd_seed):
+def add_class_noise(data, noise_level, rnd_seed):
    """Adds class noise

    :param data: Orange dataset
@@ -43,9 +51,9 @@ def addClassNoise(data, noise_level, rnd_seed):
#print "\t", temp, "changed to:", data[index].getclass(), "(", index, ")"
#print "\n"
noise_indices.sort()
return {'noise_inds':noise_indices, 'noisy_data': data}
return noise_indices, data
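# Sketch of the idea behind add_class_noise (its full body is elided above);
# the names below are illustrative, not the elided implementation: sample a
# seeded random subset of indices and move each sampled label to a
# different class.
def _inject_label_noise(labels, noise_level, rnd_seed, classes):
    import random
    rnd = random.Random(rnd_seed)
    n_changed = int(round(noise_level / 100.0 * len(labels)))  # assuming a percentage
    noise_indices = rnd.sample(range(len(labels)), n_changed)
    for idx in noise_indices:
        # reassign to a class different from the current one
        labels[idx] = rnd.choice([cl for cl in classes if cl != labels[idx]])
    noise_indices.sort()
    return noise_indices, labels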
-def addMetaID(data):
+def add_meta_id(data):
    meta_id = orange.FloatVariable("meta_id")
    mid = orange.newmetaid()
    while mid in data.domain.getmetas().keys():
@@ -54,88 +62,39 @@ def addMetaID(data):
    for i in range(len(data)):
        data[i][meta_id] = i
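# Why this helper matters (illustrative usage): fold selection scrambles row
# order, but after tagging, every example still carries its original position
# as a float meta attribute:
#
#   add_meta_id(data)
#   subset = data.select(selection, fold, negate=1)
#   original_row = int(subset[0]["meta_id"].value)   # index into the full `data`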
-def cfdecide(input_dict, widget):
+def cf_decide(learner, orange_dataset, k_folds, widget):
    """Classification filter decide

-    :param input_dict:
+    :param learner: Classifier object
+    :param orange_dataset:
+    :param k_folds:
    :param widget:
    :return:
    """
-    from pysimplesoap.client import SoapFault
-    somelearner = input_dict['learner']
-    print somelearner
+    # somelearner = input_dict['learner']
+    print learner
    # SWITCH TO PROCESSING WITH WEKA CLASSIFIERS
-    if type(somelearner) == unicode or type(somelearner) == str:
-        # from services.webservice import WebService
-        from cf_base.helpers import WebService
-        wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', float(input_dict['timeout']))
-        name = ""
-        try:
-            name = wsutil.client.print_model(model = somelearner)['model_as_string']
-            print wsutil.client.print_model(model = somelearner), name
-        except SoapFault:
-            print "Soap fault: unicode string is not a Weka classification learner/model."
-            return {}
-        return cfweka(somelearner,
-                      input_dict['data'],
-                      int(input_dict['k_folds']),
-                      float(input_dict['timeout']),
+    if isinstance(learner, c.Classifier):
+        name = learner.print_classifier()
+        return cf_run(learner,
+                      orange_dataset,
+                      k_folds,
                      name,
                      widget)
    else:
-        return cforange(input_dict, widget)
+        return cf_run_harf(learner, orange_dataset, k_folds, widget)
+    # else:
+    #     raise Exception("Provided learner is in an unsupported format", str(learner))
-def cforange(input_dict, widget):
-    """Classification filter for Orange learner
-
-    :param input_dict:
-    :param widget:
-    :return:
-    """
-    # from workflows.helpers import UnpicklableObject
-    from cf_base.helpers import UnpicklableObject
-    somelearner = input_dict['learner']
-    print "Before generate"
-    learner = somelearner if not isinstance(somelearner, UnpicklableObject) else somelearner.generate()
-    print "After generate"
-    data = input_dict['data']
-    print len(data)
-    addMetaID(data)
-    print 'Before for loop'
-    k = int(input_dict['k_folds'])
-    noisyIndices = []
-    selection = orange.MakeRandomIndicesCV(data, folds=k)
-    count_noisy = [0]*k
-    print 'Before for loop'
-    for test_fold in range(k):
-        train_data = data.select(selection, test_fold, negate=1)
-        test_data = data.select(selection, test_fold)
-        #print "\t\t", "Learned on", len(train_data), "examples"
-        #file.flush()
-        print 'Before classifier construction'
-        #print learner.hovername if learner.hovername != None else "ni hovernamea"
-        classifier = learner(train_data)
-        print 'After classifier construction'
-        for example in test_data:
-            exclassified = classifier(example)
-            if exclassified != None and exclassified != example.getclass():
-                # selection_filter[int(example[meta_id].value)] = 0
-                noisyIndices.append(int(example["meta_id"].value))
-                count_noisy[test_fold] += 1
-        # END test_data
-        widget.progress = int((test_fold+1)*1.0/k*100)
-        widget.save()
-    # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': learner.name}
-##    filtered_data = data.select(selection_filter, 1)
-##    noisy_data = data.select(selection_filter, 0)
-##    return [filtered_data, noisy_data]
-def cfweka(learner, data, k_folds, timeout, name, widget=None):
-    """Classification filter for a Weka learner
+def cf_run(learner, data, k_folds, name, widget=None):
+    """Runs a classification filter

-    :param learner: Weka learner, serialized
+    :param learner: WekaClassifier
    :param data: Orange dataset
    :param k_folds:
    :param name:
@@ -144,10 +103,6 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
    :return:
    """
-    from cf_base.helpers import WebService
-    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl', timeout)
-    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', timeout)
    somelearner = learner
    print somelearner
@@ -155,23 +110,32 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
    selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
    count_noisy = [0]*k_folds
    for test_fold in range(k_folds):
-        train_arffstr = toARFFstring(data.select(selection, test_fold, negate=1)).getvalue()
-        train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        # train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        train_data = convert_dataset_from_orange_to_scikit( data.select(selection, test_fold, negate=1) )

        test_inds = [i for i in range(len(selection)) if selection[i] == test_fold ]
-        test_arffstr = toARFFstring(data.select(selection, test_fold)).getvalue()
-        test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        # test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        test_data = convert_dataset_from_orange_to_scikit( data.select(selection, test_fold) )
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
-        print "pred cl build"
-        classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
-        print "po cl build"
-        eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
-        print "po eval"
-        for i in range(len(eval_test_data)):
+        print "before cl build"
+        # classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
+        learner.build_classifier(train_data)
+        print "after cl build"
+        # eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
+        scikit_dataset_predicted = learner.apply_classifier(test_data)
+        print "after apply"
+        for i in range(len(scikit_dataset_predicted.target)):
            #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
-            print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
-            if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            # print i, "in the for loop", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
+            # if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            if scikit_dataset_predicted.target[i] != scikit_dataset_predicted.targetPredicted[i]:
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
@@ -180,7 +144,53 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
        widget.progress = int((test_fold+1)*1.0/k_folds*100)
        widget.save()
    # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}
+    return {'inds': sorted(noisyIndices), 'name': get_weka_name(name)}
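# The same cross-validated misclassification filter in plain scikit-learn
# terms -- a sketch assuming X and y are numpy arrays, not the widget code
# above: train on k-1 folds, and flag every held-out example the model
# misclassifies as potentially noisy.
def classification_filter_sketch(clf, X, y, k_folds=10):
    from sklearn.model_selection import KFold
    noisy = []
    for train_idx, test_idx in KFold(n_splits=k_folds, shuffle=True,
                                     random_state=0).split(X):
        clf.fit(X[train_idx], y[train_idx])
        predictions = clf.predict(X[test_idx])
        # a held-out misclassification marks the example as suspect
        noisy.extend(int(i) for i, p in zip(test_idx, predictions) if p != y[i])
    return sorted(noisy)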
+def cf_run_harf(learner, data_orange, k_folds, widget=None):
+    """Classification filter for a HARF learner
+
+    :param learner:
+    :param data_orange:
+    :param k_folds:
+    :param widget:
+    :return:
+    """
+    somelearner = learner
+    print "Before generate"
+    learner = somelearner if not isinstance(somelearner, UnpicklableObject) else somelearner.generate()
+    print "After generate"
+    # data_orange = input_dict['data_orange']
+    print len(data_orange)
+    add_meta_id(data_orange)
+    print 'Before for loop'
+    k = k_folds
+    noisyIndices = []
+    selection = orange.MakeRandomIndicesCV(data_orange, folds=k)
+    count_noisy = [0]*k
+    print 'Before for loop'
+    for test_fold in range(k):
+        train_data = data_orange.select(selection, test_fold, negate=1)
+        test_data = data_orange.select(selection, test_fold)
+        #print "\t\t", "Learned on", len(train_data), "examples"
+        #file.flush()
+        print 'Before classifier construction'
+        #print learner.hovername if learner.hovername != None else "no hovername"
+        classifier = learner(train_data)
+        print 'After classifier construction'
+        for example in test_data:
+            exclassified = classifier(example)
+            if exclassified != None and exclassified != example.getclass():
+                # selection_filter[int(example[meta_id].value)] = 0
+                noisyIndices.append(int(example["meta_id"].value))
+                count_noisy[test_fold] += 1
+        # END test_data
+        if not(widget is None):
+            widget.progress = int((test_fold+1)*1.0/k*100)
+            widget.save()
+    # END test_fold
+    return {'inds': sorted(noisyIndices), 'name': learner.name}
def saturation_type(dataset, satur_type='normal', widget=None):
    """Saturation filter
@@ -191,7 +201,7 @@ def saturation_type(dataset, satur_type='normal', widget=None):
    :return:
    """
-    addMetaID(dataset)
+    add_meta_id(dataset)
    if not(widget==None):
        widget.progress = 0
        widget.save()
@@ -200,7 +210,7 @@
    progress_steps = (3*data_len**2 + 2*data_len)/8 # provided max allowed iteration steps (k) = data_len/2
    if satur_type == 'prune':
        if not dataset.hasMissingValues():
-            return pruneSF(dataset, 1, progress_steps, widget)
+            return prune_sf(dataset, 1, progress_steps, widget)
        else:
            raise Exception("Pre-pruned saturation filtering requires data WITHOUT missing values!")
    else:
@@ -210,7 +220,7 @@ def cmplx(set):
    classifier = orngTree.TreeLearner(set, sameMajorityPruning=1, mForPruning=0)
    return orngTree.countNodes(classifier)
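# cmplx() above is the complexity measure the saturation filter minimises.
# A sketch of the leave-one-out search find_noise performs (same Orange 2
# select semantics as elsewhere in this file; loop body abridged):
#
#   g_e = cmplx(data)                    # tree size with all examples present
#   for i in range(len(data)):
#       mask = [0 if j == i else 1 for j in range(len(data))]
#       reduced_complexity = cmplx(data.select(mask, 1))
#       # the example whose removal shrinks the tree most is the noisiest candidate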
-def findNoise(data):
+def find_noise(data):
    n = len(data)
    noisiest = []
    gE = cmplx(data)
@@ -264,7 +274,7 @@ def saturation(dataset, widget):
    workSet = orange.ExampleTable(dataset)
    while k != 0:
        n = len(workSet)
-        satfilter = findNoise(workSet)
+        satfilter = find_noise(workSet)
        if satfilter == [1,[]]:
            print "\t\t", satfilter
            if not(widget==None):
@@ -321,10 +331,10 @@ def findPrunableNoisy(node, minExmplsInLeaf):
    else:
        raise TypeError, "TreeNode expected"

-def excludePruned(dataset, classifier, minExmplsInLeaf):
+def exclude_pruned(dataset, classifier, minExmplsInLeaf):
    print "in exclude"
    toPrune = findPrunableNoisy(classifier.tree, minExmplsInLeaf)
-    uniqueItems(toPrune)
+    unique_items(toPrune)
    print "\t\t", "Leaves with", minExmplsInLeaf, "or less examples will be pruned."
    print "\t\t", "IDs of examples excluded by pruning:", toPrune
    #file.flush()
@@ -338,7 +348,7 @@ def excludePruned(dataset, classifier, minExmplsInLeaf):
    #return [noisyA, dataset]
    return [toPrune, workSet]
-def uniqueItems(list):
+def unique_items(list):
    list.sort()
    k = 0
    while k < len(list)-1:
@@ -347,7 +357,7 @@ def uniqueItems(list):
        else:
            k += 1
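# Equivalent one-liner for the in-place deduplication above (kept as a
# function only because callers rely on the passed-in list being mutated):
#
#   to_prune = sorted(set(to_prune))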
-def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
+def prune_sf(data, minExmplsInLeaf, progress_steps, widget=None):
    """Prune Saturation Filter

    :param data:
@@ -362,8 +372,8 @@ def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
    classifier = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=0, storeExamples=1)
    print "\t\t", "Classifier complexity:\t", orngTree.countNodes(classifier), "nodes."
    #file.flush()
-    ## [noisyA, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
-    [noisePruned, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
+    ## [noisyA, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
+    [noisePruned, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
    print "\t\t", len(noisePruned), "example(s) were excluded by pruning."
    #file.flush()
    classifier2 = orngTree.TreeLearner(dataset, sameMajorityPruning=1, mForPruning=0, storeExamples=1)
@@ -395,67 +405,8 @@ def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
    #return noisePruned

-# to ARFF String
-def toARFFstring(table, try_numericize=0):  #filename,table,try_numericize=0):
-    import cStringIO, string
-    t = table
-    #if filename[-5:] == ".arff":
-    #    filename = filename[:-5]
-    #print filename
-    f = cStringIO.StringIO()
-    f.write('@relation %s\n'%t.domain.classVar.name)
-    # attributes
-    ats = [i for i in t.domain.attributes]
-    ats.append(t.domain.classVar)
-    for i in ats:
-        real = 1
-        if i.varType == 1:
-            if try_numericize:
-                # try if all values are numeric
-                for j in i.values:
-                    try:
-                        x = string.atof(j)
-                    except:
-                        real = 0 # failed
-                        break
-            else:
-                real = 0
-        iname = str(i.name)
-        if string.find(iname," ") != -1:
-            iname = "'%s'"%iname
-        if real==1:
-            f.write('@attribute %s real\n'%iname)
-        else:
-            f.write('@attribute %s { '%iname)
-            x = []
-            for j in i.values:
-                s = str(j)
-                if string.find(s," ") == -1:
-                    x.append("%s"%s)
-                else:
-                    x.append("'%s'"%s)
-            for j in x[:-1]:
-                f.write('%s,'%j)
-            f.write('%s }\n'%x[-1])
-    # examples