Commit b54e348d authored by Darko Aleksovski

Implementation and tests for 8 widgets

parent 0ce46ce8
# ===================================================================
# HARF (HIGH AGREEMENT RANDOM FOREST)
def harf(input_dict):
    #import orngRF_HARF
    from cf_base.helpers import UnpicklableObject
    agrLevel = input_dict['agr_level']
    #data = input_dict['data']
    harfout = UnpicklableObject("orngRF_HARF.HARFLearner(agrLevel =" + agrLevel + ", name='HARF-" + str(agrLevel) + "')")
    harfout.addimport("import orngRF_HARF")
    #harfLearner = orngRF_HARF.HARFLearner(agrLevel = agrLevel, name = "_HARF-"+agrLevel+"_")
    output_dict = {}
    output_dict['harfout'] = harfout
    return output_dict

# CLASSIFICATION NOISE FILTER

def classification_filter(input_dict, widget):
    import cf_noise.noiseAlgorithms4lib as nalg
    output_dict = {}
    # output_dict['noise_dict'] = noiseAlgorithms4lib.cfdecide(input_dict, widget)
    output_dict['noise_dict'] = nalg.cfdecide(input_dict, widget=None)
    return output_dict

# SATURATION NOISE FILTER

def saturation_filter(input_dict, widget):
    import cf_noise.noiseAlgorithms4lib as nalg
    output_dict = {}
    output_dict['noise_dict'] = nalg.saturation_type(input_dict['data'], widget=widget)
    return output_dict

# NOISE RANK

def noiserank(input_dict):
    allnoise = {}
    data = input_dict['data']
    for item in input_dict['noise']:
        det_by = item['name']
        for i in item['inds']:
            if not allnoise.has_key(i):
                allnoise[i] = {}
                allnoise[i]['id'] = i
                allnoise[i]['class'] = data[int(i)].getclass().value
                allnoise[i]['by'] = []
            allnoise[i]['by'].append(det_by)
            print allnoise[i]['by']

    from operator import itemgetter
    outallnoise = sorted(allnoise.values(), key=itemgetter('id'))
    outallnoise.sort(compareNoisyExamples)

    output_dict = {}
    output_dict['allnoise'] = outallnoise
    output_dict['selection'] = {}
    return output_dict

def compareNoisyExamples(item1, item2):
    len1 = len(item1["by"])
    len2 = len(item2["by"])
    if len1 > len2:    # reversed, we want decreasing order
        return -1
    elif len1 < len2:  # reversed, we want decreasing order
        return 1
    else:
        return 0

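# Illustrative sketch (hypothetical values, not part of the widget code): sorting
# with compareNoisyExamples puts examples detected by more algorithms first.
#
#   items = [{'id': 3, 'by': ['SF']},
#            {'id': 7, 'by': ['SF', 'CF', 'HARF-70']},
#            {'id': 1, 'by': ['CF', 'SF']}]
#   items.sort(compareNoisyExamples)
#   # -> ids now appear in the order 7, 1, 3 (decreasing number of detectors)
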
def noiserank_select(postdata, input_dict, output_dict):
    try:
        output_dict['indices'] = outselection = [int(i) for i in postdata['selected']]
        data = input_dict['data']
        selection = [0] * len(data)
        for i in outselection:
            selection[i] = 1
        outdata = data.select(selection, 1)
        output_dict['selection'] = outdata
    except KeyError:
        output_dict['selection'] = None
    return output_dict

# EVALUATION OF NOISE DETECTION PERFORMANCE

def add_class_noise(input_dict):
    """Widget "Add Class Noise": delegates to noiseAlgorithms4lib.addClassNoise
    """
    import cf_noise.noiseAlgorithms4lib as nalg
    output_dict = nalg.addClassNoise(input_dict['data'], input_dict['noise_level'], input_dict['rnd_seed'])
    return output_dict

def aggr_results(input_dict):
    """Widget "Aggregate Detection Results"
    :param input_dict:
    :return:
    """
    output_dict = {}
    output_dict['aggr_dict'] = {'positives': input_dict['pos_inds'], 'by_alg': input_dict['detected_inds']}
    return output_dict

def eval_batch(input_dict):
    """Widget "Evaluate Repeated Detection"
    """
    alg_perfs = input_dict['perfs']
    beta = float(input_dict['beta'])
    performances = []
    for exper in alg_perfs:
        noise = exper['positives']
        nds = exper['by_alg']
        performance = []
        for nd in nds:
            nd_alg = nd['name']
            det_noise = nd['inds']
            inboth = set(noise).intersection(set(det_noise))
            recall = len(inboth) * 1.0 / len(noise) if len(noise) > 0 else 0
            precision = len(inboth) * 1.0 / len(det_noise) if len(det_noise) > 0 else 0
            print beta, recall, precision
            if precision == 0 and recall == 0:
                fscore = 0
            else:
                fscore = (1 + beta ** 2) * precision * recall / ((beta ** 2) * precision + recall)
            performance.append({'name': nd_alg, 'recall': recall, 'precision': precision, 'fscore': fscore, 'fbeta': beta})
        performances.append(performance)

    output_dict = {}
    output_dict['perf_results'] = performances
    return output_dict

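# Worked example of the F-beta computation above (illustrative numbers only):
# with beta = 1.0, precision = 0.5 and recall = 0.25,
# fscore = (1 + 1**2) * 0.5 * 0.25 / ((1**2) * 0.5 + 0.25) = 0.25 / 0.75 = 1/3.
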
def eval_noise_detection(input_dict):
    """Widget "Evaluate Detection Algorithms"
    :param input_dict:
    :return:
    """
    noise = input_dict['noisy_inds']
    nds = input_dict['detected_noise']

    performance = []
    for nd in nds:
        nd_alg = nd['name']
        det_noise = nd['inds']
        inboth = set(noise).intersection(set(det_noise))
        recall = len(inboth) * 1.0 / len(noise) if len(noise) > 0 else 0
        precision = len(inboth) * 1.0 / len(det_noise) if len(det_noise) > 0 else 0

        beta = float(input_dict['f_beta'])
        print beta, recall, precision
        if precision == 0 and recall == 0:
            fscore = 0
        else:
            fscore = (1 + beta ** 2) * precision * recall / ((beta ** 2) * precision + recall)
        performance.append({'name': nd_alg, 'recall': recall, 'precision': precision, 'fscore': fscore, 'fbeta': beta})

    from operator import itemgetter
    output_dict = {}
    output_dict['nd_eval'] = sorted(performance, key=itemgetter('name'))
    return output_dict

def avrg_std(input_dict):
    """Widget "Average and Standard Deviation", which for some reason is missing from source.ijs.si
    -> to be connected on the left to the widget "Evaluate Repeated Detection" (eval_batch)
    """
    perf_results = input_dict['perf_results']
    stats = {}
    # Aggregate performance results
    n = len(perf_results)
    for i in range(n):
        for item in perf_results[i]:
            alg = item['name']
            if not stats.has_key(alg):
                stats[alg] = {}
                stats[alg]['precisions'] = [item['precision']]
                stats[alg]['recalls'] = [item['recall']]
                stats[alg]['fscores'] = [item['fscore']]
                stats[alg]['fbeta'] = item['fbeta']
            else:
                stats[alg]['precisions'].append(item['precision'])
                stats[alg]['recalls'].append(item['recall'])
                stats[alg]['fscores'].append(item['fscore'])
            # if last experiment: compute averages
            if i == n - 1:
                stats[alg]['avrg_pr'] = reduce(lambda x, y: x + y, stats[alg]['precisions']) / n
                stats[alg]['avrg_re'] = reduce(lambda x, y: x + y, stats[alg]['recalls']) / n
                stats[alg]['avrg_fs'] = reduce(lambda x, y: x + y, stats[alg]['fscores']) / n

    # Compute standard deviations
    import numpy
    avrgstdout = []
    print stats
    for alg, stat in stats.items():
        avrgstdout.append({'name': alg, 'precision': stat['avrg_pr'], 'recall': stat['avrg_re'],
                           'fscore': stat['avrg_fs'],
                           'fbeta': stat['fbeta'],
                           'std_pr': numpy.std(stat['precisions']),
                           'std_re': numpy.std(stat['recalls']),
                           'std_fs': numpy.std(stat['fscores'])})

    from operator import itemgetter
    output_dict = {}
    output_dict['avrg_w_std'] = sorted(avrgstdout, key=itemgetter('name'))
    return output_dict

# VISUALIZATIONS

def pr_space(input_dict):
    return {}


def eval_bar_chart(input_dict):
    return {}


def eval_to_table(input_dict):
    return {}


def data_table(input_dict):
    return {}


def data_info(input_dict):
    return {}


def definition_sentences(input_dict):
    return {}


def term_candidates(input_dict):
    return {}

# ===================================================================
# NOISE ALGORITHMS LIBRARY (cf_noise.noiseAlgorithms4lib)
import orange, orngTree, random


def addClassNoise(data, noise_level, rnd_seed):
    """Adds class noise to the given dataset.
    :param data: Orange dataset
    :param noise_level: percentage of examples whose class value is changed
    :param rnd_seed: random seed
    :return: dict with the indices of the noisy examples and the noisy dataset
    """
    meta_noisy = orange.EnumVariable("noise", values=["no", "yes"])
    mid = orange.newmetaid()
    while mid in data.domain.getmetas().keys():
        mid = orange.newmetaid()
    data.domain.addmeta(mid, meta_noisy)
    data.addMetaAttribute("noise", "no")

    # Generate random indices for noise insertion
    percent = float(noise_level) / 100
    try:
        rnds = int(rnd_seed)
    except:
        rnds = 0
    print "Random Seed:", rnds
    orange.setrandseed(rnds)
    noise_indices = random.sample(range(len(data)), int(round(percent * len(data))))
    #print "Amount of added noise:", percent*100, "percent (", len(noise_indices), "examples ):"
    #print "Random indices for added noise:", noise_indices
    className = data.domain.classVar.name
    #print "Class name:", className
    for index in noise_indices:
        data[index]["noise"] = "yes"
        temp = data[index][className]
##        if len(data.domain.classVar.values) > 2:
        # random value + check that it differs from the current one
        new_label = data.domain.classVar.randomvalue()
        while new_label == temp:
            new_label = data.domain.classVar.randomvalue()
        data[index][className] = new_label
##        else:
##            # switch the class value
##            data[index][className] = data.domain.classVar.nextvalue(data[index][className])
        #print "\t", temp, "changed to:", data[index].getclass(), "(", index, ")"
    #print "\n"
    noise_indices.sort()
    return {'noise_inds': noise_indices, 'noisy_data': data}

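# Minimal usage sketch, assuming Orange 2.x with its bundled "iris" dataset is
# available (both are assumptions, not part of this module):
#
#   data = orange.ExampleTable("iris")
#   res = addClassNoise(data, noise_level=10, rnd_seed=42)
#   print res['noise_inds']        # indices whose class value was flipped
#   print len(res['noisy_data'])   # same table, now carrying a "noise" meta attribute
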
def addMetaID(data):
    meta_id = orange.FloatVariable("meta_id")
    mid = orange.newmetaid()
    while mid in data.domain.getmetas().keys():
        mid = orange.newmetaid()
    data.domain.addmeta(mid, meta_id)
    for i in range(len(data)):
        data[i][meta_id] = i

def cfdecide(input_dict, widget):
    """Classification filter decide
    :param input_dict:
    :param widget:
    :return:
    """
    from pysimplesoap.client import SoapFault

    somelearner = input_dict['learner']
    print somelearner
    # SWITCH TO PROCESSING WITH WEKA CLASSIFIERS
    if type(somelearner) == unicode or type(somelearner) == str:
        # from services.webservice import WebService
        from cf_base.helpers import WebService
        wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', float(input_dict['timeout']))
        name = ""
        try:
            name = wsutil.client.print_model(model=somelearner)['model_as_string']
            print wsutil.client.print_model(model=somelearner), name
        except SoapFault:
            print "Soap fault: unicode string is not a Weka classification learner/model."
            return {}
        return cfweka(somelearner,
                      input_dict['data'],
                      int(input_dict['k_folds']),
                      float(input_dict['timeout']),
                      name,
                      widget)
    else:
        return cforange(input_dict, widget)

def cforange(input_dict, widget):
    """Classification filter for an Orange learner
    :param input_dict:
    :param widget:
    :return:
    """
    # from workflows.helpers import UnpicklableObject
    from cf_base.helpers import UnpicklableObject

    somelearner = input_dict['learner']
    print "Before generate"
    learner = somelearner if not isinstance(somelearner, UnpicklableObject) else somelearner.generate()
    print "After generate"
    data = input_dict['data']
    print len(data)
    addMetaID(data)

    print 'Before for loop'
    k = int(input_dict['k_folds'])
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    print 'Before for loop'
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
        print 'Before classifier construction'
        #print learner.hovername if learner.hovername != None else "no hovername"
        classifier = learner(train_data)
        print 'After classifier construction'
        for example in test_data:
            exclassified = classifier(example)
            if exclassified != None and exclassified != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
        widget.progress = int((test_fold + 1) * 1.0 / k * 100)
        widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': learner.name}
##    filtered_data = data.select(selection_filter, 1)
##    noisy_data = data.select(selection_filter, 0)
##    return [filtered_data, noisy_data]

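# Illustrative input for cforange (a sketch under assumed inputs, not a verified
# call): 'learner' may be any Orange learner, 'k_folds' is cast with int() so a
# string is fine, and widget must expose .progress and .save().
#
#   input_dict = {'learner': orange.kNNLearner(),
#                 'data': orange.ExampleTable("iris"),
#                 'k_folds': '10'}
#   result = cforange(input_dict, widget)
#   # result == {'inds': [...misclassified example ids...], 'name': learner.name}
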
def cfweka(learner, data, k_folds, timeout, name, widget=None):
    """Classification filter for a Weka learner
    :param learner: Weka learner, serialized
    :param data: Orange dataset
    :param k_folds:
    :param timeout:
    :param name:
    :param widget:
    :return:
    """
    from cf_base.helpers import WebService

    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl', timeout)
    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl', timeout)

    somelearner = learner
    print somelearner

    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
    count_noisy = [0] * k_folds
    for test_fold in range(k_folds):
        train_arffstr = toARFFstring(data.select(selection, test_fold, negate=1)).getvalue()
        train_data = wsutil.client.arff_to_weka_instances(arff=train_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']

        test_inds = [i for i in range(len(selection)) if selection[i] == test_fold]
        test_arffstr = toARFFstring(data.select(selection, test_fold)).getvalue()
        test_data = wsutil.client.arff_to_weka_instances(arff=test_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
        print "before classifier build"
        classifier = wseval.client.build_classifier(learner=somelearner, instances=train_data)['classifier']
        print "after classifier build"
        eval_test_data = wseval.client.apply_classifier(classifier=classifier, instances=test_data)
        print "after eval"
        for i in range(len(eval_test_data)):
            #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
            print i, "in the for loop", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
            if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
        # END test_data
        if not(widget is None):
            widget.progress = int((test_fold + 1) * 1.0 / k_folds * 100)
            widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}

def saturation_type(dataset, satur_type='normal', widget=None):
    """Saturation filter
    :param dataset: Orange dataset
    :param satur_type: 'normal' or 'prune'
    :param widget:
    :return:
    """
    addMetaID(dataset)
    if not(widget == None):
        widget.progress = 0
        widget.save()

    data_len = len(dataset)
    #k = data_len/2
    progress_steps = (3 * data_len ** 2 + 2 * data_len) / 8  # provided max allowed iter steps (k) = data_len/2
    if satur_type == 'prune':
        if not dataset.hasMissingValues():
            return pruneSF(dataset, 1, progress_steps, widget)
        else:
            raise Exception("Pre-pruned saturation filtering requires data WITHOUT missing values!")
    else:
        return saturation(dataset, widget)

def cmplx(set):
    classifier = orngTree.TreeLearner(set, sameMajorityPruning=1, mForPruning=0)
    return orngTree.countNodes(classifier)

def findNoise(data):
    n = len(data)
    noisiest = []
    gE = cmplx(data)
    print "\t\t", "Classifier complexity:", gE, "nodes"
    #file.flush()
    min = gE
    for i in range(n):
        selection = [1] * n
        selection[i] = 0
        Ex = data.select(selection)
        if len(Ex) == 0:
            print "\t\t", "Saturation Filtering FAILED!"
            #file.flush()
            return [0, []]
        else:
            gEx = cmplx(Ex)
            if gEx < min:
                noisiest = [i]
                min = gEx
                print "\t\t", "(%s." % int(data[i]["meta_id"]), "example excluded) Subset complexity:", gEx, "nodes"  #, "(%s)" % data[i]["noise"].value
                #file.flush()
                #print data[i]
            elif gEx != gE and gEx == min:
                noisiest.append(i)
                print "\t\t", "(%s." % int(data[i]["meta_id"]), "example excluded) Subset complexity:", gEx, "nodes"  #, "(%s)" % data[i]["noise"].value
                #file.flush()
                #print data[i]
    if noisiest != []:
        return [0, noisiest]
    else:
        return [1, []]

def saturation(dataset, widget):
    """Saturation
    :param dataset: Orange dataset
    :param widget:
    :return:
    """
    #dataset = input_dict['data']
    print "\t", "Saturation Filtering:"
    #file.flush()
    noisyA = orange.ExampleTable(dataset.domain)
    data_len = len(dataset)
    k = data_len / 2
    progress_steps = (3 * data_len ** 2 + 2 * data_len) / 8  # provided max allowed iter steps (k) = data_len/2
    if not(widget == None):
        prog_sum = widget.progress
    workSet = orange.ExampleTable(dataset)

    while k != 0:
        n = len(workSet)
        satfilter = findNoise(workSet)
        if satfilter == [1, []]:
            print "\t\t", satfilter
            if not(widget == None):
                widget.progress = 100
                widget.save()
            break
        else:
            noisyExmpls = satfilter[1]
            #print noisyExmpls
            selection = [0] * n
            choose = random.choice(noisyExmpls)
            print "\t\t", "Randomly choose one noisy example among:", len(noisyExmpls)
            # "(%s. is added noise: %s)" % (int(workSet[choose]["meta_id"]), workSet[choose]["noise"].value)
            #file.flush()
            selection[choose] = 1
            noisyA.extend(workSet.select(selection))
            workSet = workSet.select(selection, negate=1)
            if not(widget == None):
                prog_sum += n * 1.0 / progress_steps * 100
                widget.progress = int(prog_sum)
                widget.save()
                print "widget prog: ", widget.progress, "n: ", n, "progress_steps:", progress_steps, "prog_sum:", prog_sum
        k -= 1
    print "\t\t", "Found:", len(noisyA), "examples.\n"
    #file.flush()
    noisyIndices = []
    for ex in noisyA:
        noisyIndices.append(int(ex["meta_id"].value))
    #return [noisyA, workSet]
    #return [noisyIndices, workSet]
    return {"inds": sorted(noisyIndices), "name": "SF"}

def findPrunableNoisy(node, minExmplsInLeaf):
    toPrune = []
    print "in find, toPrune:", toPrune
    if isinstance(node, orange.TreeNode):
        #print "Bu!"
        if node and node.branchSelector:
            #print "Bu111!"
            for branch in node.branches:
                if branch == None:
                    continue
                else:
                    if len(branch.examples) > minExmplsInLeaf + 0.5:
                        bla = findPrunableNoisy(branch, minExmplsInLeaf)
                        toPrune.extend(bla)
                    else:
                        print "Marked for removal"
                        for ex in branch.examples:
                            toPrune.append(int(ex["meta_id"].value))
            return toPrune
        return []
    else:
        raise TypeError, "TreeNode expected"

def excludePruned(dataset, classifier, minExmplsInLeaf):
    print "in exclude"
    toPrune = findPrunableNoisy(classifier.tree, minExmplsInLeaf)
    uniqueItems(toPrune)
    print "\t\t", "Leaves with", minExmplsInLeaf, "or less examples will be pruned."
    print "\t\t", "IDs of examples excluded by pruning:", toPrune
    #file.flush()
    #noisyA = orange.ExampleTable(dataset.domain)
    n = len(dataset)
    selection = [0] * n
    for index in toPrune:
        selection[index] = 1
    #noisyA.extend(dataset.select(selection))
    workSet = dataset.select(selection, negate=1)
    #return [noisyA, dataset]
    return [toPrune, workSet]

def uniqueItems(list):
    list.sort()
    k = 0
    while k < len(list) - 1:
        if list[k+1] == list[k]:
            del list[k+1]
        else:
            k += 1

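# Illustrative usage of uniqueItems (in-place sort and de-duplication):
#
#   ids = [4, 1, 4, 2, 1]
#   uniqueItems(ids)
#   # ids is now [1, 2, 4]
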
def pruneSF(data, minExmplsInLeaf, progress_steps, widget=None):
    """Prune Saturation Filter
    :param data:
    :param minExmplsInLeaf:
    :param progress_steps:
    :param widget:
    :return:
    """
    print "\t", "Pruning + Saturation Filter:"
    #file.flush()
    classifier = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=0, storeExamples=1)
    print "\t\t", "Classifier complexity:\t", orngTree.countNodes(classifier), "nodes."
    #file.flush()
##    [noisyA, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
    [noisePruned, dataset] = excludePruned(data, classifier, minExmplsInLeaf)
    print "\t\t", len(noisePruned), "example(s) were excluded by pruning."
    #file.flush()