Commit ea1b9fdd authored by Darko Aleksovski

Widget modifications

parent 62ede229
+import cf_noise.utilities as u
+import cf_data_mining.dataset as d

 # ===================================================================
 # HARF (HIGH AGREEMENT RANDOM FOREST)
 def harf(input_dict):
     #import orngRF_HARF
-    from cf_base.helpers import UnpicklableObject
+    from cf_core.helpers import UnpicklableObject
     agrLevel = input_dict['agr_level']
     #data = input_dict['data']
-    harfout = UnpicklableObject("orngRF_HARF.HARFLearner(agrLevel ="+agrLevel+", name='HARF-"+str(agrLevel)+"')")
-    harfout.addimport("import orngRF_HARF")
+    harfout = UnpicklableObject("cf_noise.orngRF_HARF.HARFLearner(agrLevel ="+agrLevel+", name='HARF-"+str(agrLevel)+"')")
+    harfout.addimport("import cf_noise.orngRF_HARF")
     #harfLearner = orngRF_HARF.HARFLearner(agrLevel = agrLevel, name = "_HARF-"+agrLevel+"_")
     output_dict = {}
     output_dict['harfout']= harfout
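The harf widget never builds the HARF learner directly: it stores the constructor expression and the import it needs as strings inside an UnpicklableObject, so the widget output stays picklable and the learner is only materialized later via generate() (see cf_run_harf further down). A minimal sketch of how such a wrapper could behave; the class below is illustrative, not the actual cf_core.helpers implementation:

class LazyExpression(object):
    """Illustrative stand-in for cf_core.helpers.UnpicklableObject."""
    def __init__(self, expression):
        self.expression = expression    # e.g. "cf_noise.orngRF_HARF.HARFLearner(agrLevel =70, name='HARF-70')"
        self.imports = []
    def addimport(self, import_string):
        self.imports.append(import_string)   # e.g. "import cf_noise.orngRF_HARF"
    def generate(self):
        namespace = {}
        for statement in self.imports:
            exec(statement, namespace)            # make the module available
        return eval(self.expression, namespace)   # build the real learner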
@@ -18,23 +21,39 @@ def harf(input_dict):

 def classification_filter(input_dict, widget):
     import cf_noise.noiseAlgorithms4lib as nalg
     output_dict = {}
-    # output_dict['noise_dict']= noiseAlgorithms4lib.cfdecide(input_dict, widget)
-    output_dict['noise_dict']= nalg.cfdecide(input_dict, widget=None)
+    # output_dict['noise_dict']= noiseAlgorithms4lib.cf_decide(input_dict, widget)
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
+    output_dict['noise_dict']= nalg.cf_decide(input_dict['learner'], orange_dataset, int(input_dict['k_folds']), widget=None)
     return output_dict
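All widget entry points in this file share the same dict-in/dict-out contract: parameters arrive in input_dict (numeric values as strings, hence the int(input_dict['k_folds']) cast), and results are returned under named keys. A toy illustration of the contract; the keys here are made up:

def example_widget(input_dict):
    # numeric widget parameters arrive as strings from the UI
    k_folds = int(input_dict['k_folds'])
    return {'message': '%d-fold filtering requested' % k_folds}

print(example_widget({'k_folds': '10'}))   # {'message': '10-fold filtering requested'}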
 # SATURATION NOISE FILTER
 def saturation_filter(input_dict, widget):
     import cf_noise.noiseAlgorithms4lib as nalg
+    orange_dataset = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
+    if not(input_dict['satur_type'] in ['normal', 'prune']):
+        raise Exception("Only 'normal' or 'prune' allowed for 'satur_type'.")
     output_dict = {}
-    output_dict['noise_dict']= nalg.saturation_type(input_dict['data'], widget)
+    output_dict['noise_dict']= nalg.saturation_type(orange_dataset, input_dict['satur_type'], widget)
     return output_dict

 # NOISE RANK
 def noiserank(input_dict):
+    """Widget NoiseRank
+    :param input_dict:
+    :return:
+    """
     allnoise = {}
-    data = input_dict['data']
+    data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
     for item in input_dict['noise']:
         det_by = item['name']
         for i in item['inds']:
@@ -68,12 +87,18 @@ def compareNoisyExamples(item1, item2):

 def noiserank_select(postdata,input_dict, output_dict):
     try:
         output_dict['indices']= outselection = [int(i) for i in postdata['selected']]
-        data = input_dict['data']
+        # data = input_dict['data']
+        data = u.convert_dataset_from_scikit_to_orange(input_dict['data'])
         selection = [0]*len(data)
         for i in outselection:
             selection[i] = 1
         outdata = data.select(selection, 1)
-        output_dict['selection'] = outdata
+        data_scikit = u.convert_dataset_from_orange_to_scikit(outdata)
+        output_dict['selection'] = data_scikit
     except KeyError:
         output_dict['selection'] = None
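noiserank_select relies on Orange's mask-based row selection: a 0/1 flag list as long as the dataset, where select(mask, 1) keeps exactly the flagged rows. The same idiom sketched with plain Python lists:

rows = ['a', 'b', 'c', 'd', 'e']
outselection = [0, 2, 4]               # indices posted from the UI
mask = [0] * len(rows)
for i in outselection:
    mask[i] = 1
kept = [row for row, flag in zip(rows, mask) if flag == 1]
print(kept)                            # ['a', 'c', 'e']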
@@ -83,10 +108,22 @@ def noiserank_select(postdata,input_dict, output_dict):

 # EVALUATION OF NOISE DETECTION PERFORMANCE
 def add_class_noise(input_dict):
-    """
+    """Widget Add Class Noise
     """
+    data_scikit = input_dict['data']
+    if not(d.is_target_nominal(data_scikit)):
+        raise Exception("Widget Add Class Noise accepts only datasets with nominal class!")
+    data = u.convert_dataset_from_scikit_to_orange(data_scikit)
     import cf_noise.noiseAlgorithms4lib as nalg
-    output_dict = nalg.addClassNoise(input_dict['data'], input_dict['noise_level'], input_dict['rnd_seed'])
+    noise_indices, orange_data = nalg.add_class_noise(data, input_dict['noise_level'], input_dict['rnd_seed'])
+    data = u.convert_dataset_from_orange_to_scikit(orange_data)
+    output_dict = {'noise_inds':noise_indices, 'noisy_data': data}
     return output_dict
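A self-contained sketch of the class-noise injection that nalg.add_class_noise performs on the Orange side (assuming noise_level is a percentage, which the widget passes through as-is): draw a seeded random subset of examples and flip each label to a different class.

import random

def add_class_noise_sketch(labels, noise_level, rnd_seed):
    # flip noise_level percent of the labels, reproducibly
    rnd = random.Random(rnd_seed)
    classes = sorted(set(labels))
    n_noisy = int(round(len(labels) * noise_level / 100.0))
    noise_indices = sorted(rnd.sample(range(len(labels)), n_noisy))
    noisy = list(labels)
    for i in noise_indices:
        noisy[i] = rnd.choice([c for c in classes if c != labels[i]])
    return noise_indices, noisy

print(add_class_noise_sketch(['y', 'n'] * 10, 20, rnd_seed=1))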
 def aggr_results(input_dict):
@@ -99,6 +136,7 @@ def aggr_results(input_dict):
     output_dict['aggr_dict'] = { 'positives' : input_dict['pos_inds'], 'by_alg': input_dict['detected_inds']}
     return output_dict

 def eval_batch(input_dict):
     """Widget "Evaluate Repeated Detection"
     """
@@ -160,71 +198,65 @@ def eval_noise_detection(input_dict):
     output_dict['nd_eval'] = sorted(performance, key=itemgetter('name'))
     return output_dict
-def avrg_std(input_dict):
-    """Widget "Average and Standard Deviation" which for some reason is missing from source.ijs.si
-    -> to be connected on the left using widget "Evaluate Repeated Detection" (eval_batch)
-    """
-    perf_results = input_dict['perf_results']
-    stats = {}
-    # Aggregate performance results
-    n = len(perf_results)
-    for i in range(n):
-        for item in perf_results[i]:
-            alg = item['name']
-            if not stats.has_key(alg):
-                stats[alg] = {}
-                stats[alg]['precisions'] = [item['precision']]
-                stats[alg]['recalls'] = [item['recall']]
-                stats[alg]['fscores'] = [item['fscore']]
-                stats[alg]['fbeta'] = item['fbeta']
-            else:
-                stats[alg]['precisions'].append(item['precision'])
-                stats[alg]['recalls'].append(item['recall'])
-                stats[alg]['fscores'].append(item['fscore'])
-            # if last experiment: compute averages
-            if i == n-1:
-                stats[alg]['avrg_pr'] = reduce(lambda x,y: x+y, stats[alg]['precisions'])/n
-                stats[alg]['avrg_re'] = reduce(lambda x,y: x+y, stats[alg]['recalls'])/n
-                stats[alg]['avrg_fs'] = reduce(lambda x,y: x+y, stats[alg]['fscores'])/n
-    # Compute Standard Deviations
-    import numpy
-    avrgstdout = []
-    print stats
-    for alg, stat in stats.items():
-        avrgstdout.append({'name': alg, 'precision': stat['avrg_pr'], 'recall': stat['avrg_re'],
-                           'fscore' : stat['avrg_fs'],
-                           'fbeta' : stat['fbeta'],
-                           'std_pr' : numpy.std(stat['precisions']),
-                           'std_re' : numpy.std(stat['recalls']),
-                           'std_fs' : numpy.std(stat['fscores']) })
-    from operator import itemgetter
+# ENSEMBLE
+
+def noise_detect_ensemble(input_dict):
+    """ Noise detection ensemble
+    :param input_dict:
+    :return:
+    """
+    import math
+    ens = {}
+    data_inds = input_dict['data_inds']
+    ens_type = input_dict['ens_type']
+    for item in data_inds:
+        #det_by = item['detected_by']
+        for i in item['inds']:
+            if not ens.has_key(i):
+                ens[i] = 1
+            else:
+                ens[i] += 1
+
+    ens_out = {}
+    ens_out['name'] = input_dict['ens_name']
+    ens_out['inds'] = []
+    n_algs = len(data_inds)
+    print ens_type
+    if ens_type == "consensus":
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] == n_algs])
+    else: # majority
+        ens_out['inds'] = sorted([x[0] for x in ens.items() if x[1] >= math.floor(n_algs/2+1)])
+
     output_dict = {}
-    output_dict['avrg_w_std'] = sorted(avrgstdout, key=itemgetter('name'))
+    output_dict['ens_out'] = ens_out
     return output_dict
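A worked example of the two voting rules above: with n_algs = 3 detectors, "consensus" keeps only indices flagged by every detector, while "majority" keeps those flagged by at least floor(3/2 + 1) = 2 of them. The detector names are illustrative:

import math

data_inds = [{'name': 'cf',   'inds': [1, 4, 7]},
             {'name': 'harf', 'inds': [4, 7, 9]},
             {'name': 'sf',   'inds': [4, 9]}]
votes = {}
for item in data_inds:
    for i in item['inds']:
        votes[i] = votes.get(i, 0) + 1     # 1:1, 4:3, 7:2, 9:2

n_algs = len(data_inds)
consensus = sorted([i for i, v in votes.items() if v == n_algs])
majority = sorted([i for i, v in votes.items() if v >= math.floor(n_algs / 2.0 + 1)])
print(consensus)   # [4]
print(majority)    # [4, 7, 9]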
-# VISUALIZATIONS
-
-def pr_space(input_dict):
-    return {}
-
-def eval_bar_chart(input_dict):
-    return {}
+
+# VISUALIZATIONS

 def eval_to_table(input_dict):
+    """Widget Evaluation Results to Table"""
     return {}

-def data_table(input_dict):
-    return {}
-
-def data_info(input_dict):
-    return {}
-
-def definition_sentences(input_dict):
-    return {}
-
-def term_candidates(input_dict):
-    return {}
+# def pr_space(input_dict):
+#     return {}
+#
+# def eval_bar_chart(input_dict):
+#     return {}
+#
+#
+# def data_table(input_dict):
+#     return {}
+#
+# def data_info(input_dict):
+#     return {}
+#
+# def definition_sentences(input_dict):
+#     return {}
+#
+# def term_candidates(input_dict):
+#     return {}
-import orange, orngTree, random
+import random
+import cf_data_mining.classifier as c
+import orange
+import orngTree
+from cf_core.helpers import UnpicklableObject
+from cf_noise.utilities import convert_dataset_from_orange_to_scikit

-def addClassNoise(data, noise_level, rnd_seed):
+def add_class_noise(data, noise_level, rnd_seed):
     """adds class Noise
     :param data: Orange dataset
@@ -43,9 +51,9 @@ def addClassNoise(data, noise_level, rnd_seed):
     #print "\t", temp, "changed to:", data[index].getclass(), "(", index, ")"
     #print "\n"
     noise_indices.sort()
-    return {'noise_inds':noise_indices, 'noisy_data': data}
+    return noise_indices, data

-def addMetaID(data):
+def add_meta_id(data):
     meta_id = orange.FloatVariable("meta_id")
     mid = orange.newmetaid()
     while mid in data.domain.getmetas().keys():
@@ -54,88 +62,39 @@ def addMetaID(data):
     for i in range(len(data)):
         data[i][meta_id] = i
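add_meta_id stamps each example with its original position because data.select(...) renumbers rows within every fold, while the noisy indices reported by the filters must refer to positions in the full dataset. The bookkeeping, sketched with plain dicts:

rows = [{'meta_id': i, 'value': v} for i, v in enumerate('abcdef')]
fold = [0, 1, 0, 1, 0, 1]                       # CV fold assignment per row
test_data = [r for r, f in zip(rows, fold) if f == 1]
# fold-local positions are 0..2, but the original indices survive:
print([int(r['meta_id']) for r in test_data])   # [1, 3, 5]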
-def cfdecide(input_dict, widget):
+def cf_decide(learner, orange_dataset, k_folds, widget):
     """Classification filter decide
-    :param input_dict:
+    :param learner: Classifier object
+    :param orange_dataset:
+    :param k_folds:
     :param widget:
     :return:
     """
-    from pysimplesoap.client import SoapFault
-    somelearner = input_dict['learner']
-    print somelearner
+    # somelearner = input_dict['learner']
+    print learner
     # SWITCH TO PROCESSING WITH WEKA CLASSIFIERS
-    if type(somelearner) == unicode or type(somelearner) == str:
-        # from services.webservice import WebService
-        from cf_base.helpers import WebService
-        wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', float(input_dict['timeout']))
-        name = ""
-        try:
-            name = wsutil.client.print_model(model = somelearner)['model_as_string']
-            print wsutil.client.print_model(model = somelearner), name
-        except SoapFault:
-            print "Soap fault: unicode string is not a Weka classification learner/model."
-            return {}
-        return cfweka(somelearner,
-                      input_dict['data'],
-                      int(input_dict['k_folds']),
-                      float(input_dict['timeout']),
+    if isinstance(learner, c.Classifier):
+        name = learner.print_classifier()
+        return cf_run(learner,
+                      orange_dataset,
+                      k_folds,
                       name,
                       widget)
     else:
-        return cforange(input_dict, widget)
+        return cf_run_harf(learner, orange_dataset, k_folds, widget)
+    # else:
+    #     raise Exception("Provided learner is in an unsupported format", str(learner))
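The new branch replaces the old string/SOAP check: anything deriving from cf_data_mining.classifier.Classifier takes the cf_run path, and everything else is assumed to be an Orange-callable learner handled by cf_run_harf. The dispatch in isolation, with stand-in classes (illustrative names only):

class Classifier(object):              # stand-in for cf_data_mining.classifier.Classifier
    pass

class WrappedTree(Classifier):
    pass

class OrangeStyleLearner(object):      # callable learner, handled by the HARF path
    pass

def decide(learner):
    if isinstance(learner, Classifier):
        return 'cf_run'
    return 'cf_run_harf'

print(decide(WrappedTree()))           # cf_run
print(decide(OrangeStyleLearner()))    # cf_run_harf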
-def cforange(input_dict, widget):
-    """Classification filter for Orange learner
-    :param input_dict:
-    :param widget:
-    :return:
-    """
-    # from workflows.helpers import UnpicklableObject
-    from cf_base.helpers import UnpicklableObject
-    somelearner = input_dict['learner']
-    print "Before generate"
-    learner = somelearner if not isinstance(somelearner,UnpicklableObject) else somelearner.generate()
-    print "After generate"
-    data = input_dict['data']
-    print len(data)
-    addMetaID(data)
-    print 'Before for loop'
-    k = int(input_dict['k_folds'])
-    noisyIndices = []
-    selection = orange.MakeRandomIndicesCV(data, folds=k)
-    count_noisy = [0]*k
-    print 'Before for loop'
-    for test_fold in range(k):
-        train_data = data.select(selection, test_fold, negate=1)
-        test_data = data.select(selection, test_fold)
-        #print "\t\t", "Learned on", len(train_data), "examples"
-        #file.flush()
-        print 'Before classifier construction'
-        #print learner.hovername if learner.hovername != None else "ni hovernamea"
-        classifier = learner(train_data)
-        print 'After classifier construction'
-        for example in test_data:
-            exclassified = classifier(example)
-            if exclassified != None and exclassified != example.getclass():
-                # selection_filter[int(example[meta_id].value)] = 0
-                noisyIndices.append(int(example["meta_id"].value))
-                count_noisy[test_fold] += 1
-        # END test_data
-        widget.progress = int((test_fold+1)*1.0/k*100)
-        widget.save()
-    # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': learner.name}
-##    filtered_data = data.select(selection_filter, 1)
-##    noisy_data = data.select(selection_filter, 0)
-##    return [filtered_data, noisy_data]
-def cfweka(learner, data, k_folds, timeout, name, widget=None):
-    """Classification filter for a Weka learner
-    :param learner: Weka learner, serialized
+def cf_run(learner, data, k_folds, name, widget=None):
+    """Runs a classification filter
+    :param learner: WekaClassifier
     :param data: Orange dataset
     :param k_folds:
     :param name:
@@ -144,10 +103,6 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
     :return:
     """
-    from cf_base.helpers import WebService
-    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl', timeout)
-    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', timeout)
     somelearner = learner
     print somelearner
@@ -155,23 +110,32 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
     selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
     count_noisy = [0]*k_folds
     for test_fold in range(k_folds):
-        train_arffstr = toARFFstring(data.select(selection, test_fold, negate=1)).getvalue()
-        train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        # train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        train_data = convert_dataset_from_orange_to_scikit( data.select(selection, test_fold, negate=1) )
         test_inds = [i for i in range(len(selection)) if selection[i] == test_fold ]
-        test_arffstr = toARFFstring(data.select(selection, test_fold)).getvalue()
-        test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        # test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
+        test_data = convert_dataset_from_orange_to_scikit( data.select(selection, test_fold) )
         #print "\t\t", "Learned on", len(train_data), "examples"
         #file.flush()
-        print "pred cl build"
-        classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
-        print "po cl build"
-        eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
-        print "po eval"
-        for i in range(len(eval_test_data)):
+        print "before cl build"
+        # classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
+        learner.build_classifier(train_data)
+        print "after cl build"
+        # eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
+        scikit_dataset_predicted = learner.apply_classifier(test_data)
+        print "after apply"
+        for i in range(len(scikit_dataset_predicted.target)):
            #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
-            print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
-            if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            # print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
+            # if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
+            if scikit_dataset_predicted.target[i] != scikit_dataset_predicted.targetPredicted[i]:
                 # selection_filter[int(example[meta_id].value)] = 0
                 noisyIndices.append(test_inds[i])
                 count_noisy[test_fold] += 1
@@ -180,7 +144,53 @@ def cfweka(learner, data, k_folds, timeout, name, widget=None):
         widget.progress = int((test_fold+1)*1.0/k_folds*100)
         widget.save()
     # END test_fold
-    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}
+    return {'inds': sorted(noisyIndices), 'name': get_weka_name(name)}
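For reference, the loop in cf_run is the classic classification-filter recipe: flag every example that a model trained on the remaining folds misclassifies. A self-contained scikit-learn sketch of the same idea (DecisionTreeClassifier and the iris data are illustrative choices, not something the widget mandates):

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

def classification_filter_sketch(X, y, k_folds=10, seed=0):
    noisy = []
    folds = KFold(n_splits=k_folds, shuffle=True, random_state=seed)
    for train_inds, test_inds in folds.split(X):
        model = DecisionTreeClassifier(random_state=seed)
        model.fit(X[train_inds], y[train_inds])
        predicted = model.predict(X[test_inds])
        # keep dataset-level indices of the misclassified test examples
        noisy.extend(test_inds[predicted != y[test_inds]].tolist())
    return sorted(noisy)

iris = load_iris()
print(classification_filter_sketch(iris.data, iris.target))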
+def cf_run_harf(learner, data_orange, k_folds, widget=None):
+    """Classification filter for HARF learner
+    :param learner:
+    :param data_orange:
+    :param k_folds:
+    :param widget:
+    :return:
+    """
+    somelearner = learner
+    print "Before generate"
+    learner = somelearner if not isinstance(somelearner,UnpicklableObject) else somelearner.generate()
+    print "After generate"
+    # data_orange = input_dict['data_orange']
+    print len(data_orange)
+    add_meta_id(data_orange)
+    print 'Before for loop'
+    k = k_folds
+    noisyIndices = []
+    selection = orange.MakeRandomIndicesCV(data_orange, folds=k)
+    count_noisy = [0]*k
+    print 'Before for loop'
+    for test_fold in range(k):
+        train_data = data_orange.select(selection, test_fold, negate=1)
+        test_data = data_orange.select(selection, test_fold)
+        #print "\t\t", "Learned on", len(train_data), "examples"
+        #file.flush()
+        print 'Before classifier construction'
+        #print learner.hovername if learner.hovername != None else "ni hovernamea"
+        classifier = learner(train_data)
+        print 'After classifier construction'
+        for example in test_data:
+            exclassified = classifier(example)
+            if exclassified != None and exclassified != example.getclass():
+                # selection_filter[int(example[meta_id].value)] = 0
+                noisyIndices.append(int(example["meta_id"].value))
+                count_noisy[test_fold] += 1
+        # END test_data
+        if not(widget is None):
+            widget.progress = int((test_fold+1)*1.0/k*100)
+            widget.save()
+    # END test_fold
+    return {'inds': sorted(noisyIndices), 'name': learner.name}
 def saturation_type(dataset, satur_type='normal', widget=None):
     """Saturation filter
@@ -191,7 +201,7 @@ def saturation_type(dataset, satur_type='normal', widget=None):
     :return:
     """
-    addMetaID(dataset)
+    add_meta_id(dataset)
     if not(widget==None):
         widget.progress = 0
         widget.save()
@@ -200,7 +210,7 @@ def saturation_type(dataset, satur_type='normal', widget=None):
     progress_steps = (3*data_len**2 + 2*data_len)/8 # provided max allowed iter steps (k) = data_len/2
     if satur_type == 'prune':
         if not dataset.hasMissingValues():
-            return pruneSF(dataset, 1, progress_steps, widget)
+            return prune_sf(dataset, 1, progress_steps, widget)
         else:
             raise Exception("Pre-pruned saturation filtering requires data WITHOUT missing values!")
     else:
@@ -210,7 +220,7 @@ def cmplx(set):
     classifier = orngTree.TreeLearner(set, sameMajorityPruning=1, mForPruning=0)
     return orngTree.countNodes(classifier)

-def findNoise(data):
+def find_noise(data):
     n = len(data)
     noisiest = []
     gE = cmplx(data)
@@ -264,7 +274,7 @@ def saturation(dataset, widget):
     workSet = orange.ExampleTable(dataset)
     while k != 0: