Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Petra
DM_course
Commits
f67f720c
Commit
f67f720c
authored
Nov 14, 2019
by
Petra
Browse files
classification
parent
8cab96ea
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
2_classification-overfitting.py
0 → 100644
View file @
f67f720c
import
pandas
as
pd
from
sklearn
import
tree
from
sklearn.model_selection
import
train_test_split
from
sklearn
import
metrics
# Overfitting demo: fit decision trees with a growing leaf budget and compare
# the accuracy on the training split with the accuracy on the held-out split.
print("_______________________________________________________________________________")
print(" Classification, fitting & overfitting. ")
print("_______________________________________________________________________________")

print(""" --- Load the data ---""")
# Portable relative path: the previous Windows-style backslash path with a
# trailing blank could not be opened on POSIX systems. Forward slashes are
# accepted on Windows as well.
csvFileName = "Datasets/A-greater-then-B.csv"
df = pd.read_csv(csvFileName)
print(df.head())
print("data shape: ", df.shape)

print(""" --- Set the features (independednt variables, attributes) and target ---""")
feature_cols = ['A', 'B', 'C']
target_var = 'A>B'
X = df[feature_cols].values
y = df[target_var].values
print("Features: ", feature_cols, "\nTarget:", target_var)

print(""" --- Train-test split ---""")
# 10% of the rows are held out for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print("train set X shape: ", X_train.shape, "train set y shape: ", y_train.shape)
print("test set X shape: ", X_test.shape, "test set y shape: ", y_test.shape)

print("decision_tree.tree_.node_count\tacc(train)\tacc(test)")
# max_leaf_nodes runs over 2..99; one tree is fitted and scored per value.
for i in range(2, 100):
    decision_tree = tree.DecisionTreeClassifier(max_leaf_nodes=i)
    decision_tree.fit(X_train, y_train)

    y_pred_test = decision_tree.predict(X_test)
    y_pred_train = decision_tree.predict(X_train)

    acc_train = metrics.accuracy_score(y_train, y_pred_train)
    acc_test = metrics.accuracy_score(y_test, y_pred_test)

    # node_count is an integer, so it is printed as one (not as "3.00").
    print(
        "\t{0:5d}\t{1:5.2f}\t{2:5.2f}".format(
            decision_tree.tree_.node_count, acc_train, acc_test
        )
    )
2_classification.py
0 → 100644
View file @
f67f720c
import
pandas
as
pd
from
sklearn
import
tree
from
sklearn.model_selection
import
train_test_split
from
sklearn
import
metrics
# Classification demo: load the dataset, fit a single decision tree, print a
# few predictions next to the actual labels, report the test accuracy and
# export the fitted tree in GraphViz DOT format.
print("_______________________________________________________________________________")
print(" Classification, fitting, predicting ... printing. ")
print("_______________________________________________________________________________")

print(""" --- Load the data ---""")
# Portable relative path: the previous Windows-style backslash path with a
# trailing blank could not be opened on POSIX systems. Forward slashes are
# accepted on Windows as well.
csvFileName = "Datasets/A-greater-then-B.csv"
df = pd.read_csv(csvFileName)
print(df.head())
print("data shape: ", df.shape)

print(""" --- Set the features (independednt variables, attributes) and target ---""")
feature_cols = ['A', 'B', 'C']
target_var = 'A>B'
X = df[feature_cols].values
y = df[target_var].values
print("Features: ", feature_cols, "\nTarget:", target_var)

print(""" --- Train-test split ---""")
# 10% of the rows are held out for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print("train set X shape: ", X_train.shape, "train set y shape: ", y_train.shape)
print("test set X shape: ", X_test.shape, "test set y shape: ", y_test.shape)

print(""" --- Initialize the learner(s) --- """)
# documentation: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
decision_tree = tree.DecisionTreeClassifier()
print("Parameters of the decision tree: ", decision_tree.get_params())

print(""" --- Fit --- """)
decision_tree.fit(X_train, y_train)
print("Depth of the decision tree: ", decision_tree.tree_.max_depth)
print("Number of nodes of the decision tree: ", decision_tree.tree_.node_count)

print(""" --- Predict --- """)
y_pred = decision_tree.predict(X_test)

print("\nActual Predicted")
# Show at most the first 10 test rows. Slicing (instead of indexing with a
# fixed range(10)) avoids an IndexError when the test split is smaller.
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print("{0:6.2f} {1:8.2f}".format(actual, predicted))

print(""" -- Performance ---""")
print("Accuracy\t{0:5.2f}".format(metrics.accuracy_score(y_test, y_pred)))

# Visualize the tree
# https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py
treeFileName = 'decision_tree.dot'
tree.export_graphviz(decision_tree, out_file=treeFileName)

# To render the exported file, install GraphViz and run one of:
#   $ dot -Tps decision_tree.dot -o decision_tree.ps    (PostScript format)
#   $ dot -Tpng decision_tree.dot -o decision_tree.png  (PNG format)
Datasets/A-greater-then-B.csv
0 → 100644
View file @
f67f720c
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment