about | summary | refs | log | tree | commit | diff
path: root/evaluate.py
diff options
context:
space:
mode:
Diffstat (limited to 'evaluate.py')
-rwxr-xr-x evaluate.py 68
1 file changed, 37 insertions, 31 deletions
diff --git a/evaluate.py b/evaluate.py
index df0c79b..d4d0b07 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -11,6 +11,7 @@ import logging
from logging import debug
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import accuracy_score
import time
@@ -25,6 +26,8 @@ parser.add_argument("-D", "--treedepth", help="depth of trees", type=int, defaul
parser.add_argument("-v", "--verbose", help="Use verbose output", action='store_true')
parser.add_argument("-t", "--timer", help="Display execution time", action='store_true')
parser.add_argument("-T", "--testmode", help="Testmode", action='store_true')
+parser.add_argument("-E", "--embest", help="RandomTreesEmbedding estimators", type=int, default=256)
+parser.add_argument("-r", "--randomness", help="Randomness parameter", type=int, default=0)
parser.add_argument("-s", "--seed", help="Seed to use for random_state when creating trees", type=int, default=0)
args = parser.parse_args()
@@ -35,7 +38,7 @@ def make_histogram(data, model, args):
if args.kmean:
hist_size = args.kmean
else:
- hist_size = args.estimators*args.leaves
+ hist_size = args.embest*args.leaves
histogram = np.zeros((data.shape[0], data.shape[1],hist_size))
for i in range(data.shape[0]):
@@ -54,20 +57,22 @@ def run_model (data, train, test, train_part, args):
if (args.kmean):
logging.debug("Computing KMeans with", train_part.shape[0], "keywords")
- kmeans = KMeans(n_clusters=args.kmean, n_init=args.estimators, random_state=args.seed).fit(train_part)
+ kmeans = KMeans(n_clusters=args.kmean, n_init=1, random_state=args.seed).fit(train_part)
hist_train = make_histogram(train, kmeans, args)
hist_test = make_histogram(test, kmeans, args)
else:
- trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.estimators, random_state=args.seed).fit(train_part)
+ trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.embest, random_state=args.seed).fit(train_part)
hist_train = make_histogram(train, trees, args)
hist_test = make_histogram(test, trees, args)
logging.debug("Generating histograms")
-
logging.debug("Keywords shape", hist_train.shape, "\n")
logging.debug("Planting trees...")
- clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
+ if args.randomness:
+ clf = RandomForestClassifier(max_features=args.randomness, n_estimators=args.estimators, max_depth=args.treedepth, random_state=0)
+ else:
+ clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
clf.fit(
hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))
@@ -76,6 +81,8 @@ def run_model (data, train, test, train_part, args):
test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])
+ train_pred = clf.predict(hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])))
+ train_label = np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1])
if args.timer:
end = time.time()
@@ -84,7 +91,7 @@ def run_model (data, train, test, train_part, args):
skplt.metrics.plot_confusion_matrix(test_pred, test_label, normalize=True)
plt.show()
- return accuracy_score(test_pred, test_label)
+ return accuracy_score(test_pred, test_label)#, accuracy_score(train_pred, train_label), end-start
@@ -96,35 +103,34 @@ def main():
logging.debug("Verbose is on")
if args.testmode:
- cnt = 0
- acc = np.zeros((5,5))
- for i in range(5):
- args.estimators = (i+1)*200
- cnt+=1
- for j in range(5):
- args.treedepth = j + 1
- print("Step ", cnt)
- acc[i][j] = run_model (data, train, test, train_part, args)
- print("Accuracy ",acc[i][j])
- cnt+=1
- fig, ax = plt.subplots()
- im = ax.imshow(acc)
- ax.set_xticks(np.arange(5))
- ax.set_yticks(np.arange(5))
- ax.set_xlabel('Number of trees')
- ax.set_ylabel('Tree depth')
+ acc = np.zeros((3,100))
+ a = np.zeros(100)
+ for i in range(100):
+ if i <= 10:
+ args.kmean = (i+1)
+ else:
+ args.kmean = 15*i
+ a[i] = args.kmean
+ print("Kmeans: ",args.kmean)
+ acc[0][i], acc[1][i], acc[2][i] = run_model (data, train, test, train_part, args)
+ print("Accuracy test:",acc[0][i], "Accuracy train:", acc[1][i])
+
+ plt.plot(a,1-acc[0])
+ plt.plot(a,1-acc[1])
+ plt.ylabel('Normalized Classification Error')
+ plt.xlabel('Vocabulary size')
+ plt.title('Classification error varying vocabulary size')
+ plt.legend(('Test','Train'),loc='upper right')
+ plt.show()
+ plt.plot(a,acc[2])
+ plt.ylabel('Time (s)')
+ plt.xlabel('Vocabulary size')
+ plt.title('Time complexity varying vocabulary size')
+ plt.show()
- # Loop over data dimensions and create text annotations.
- for i in range(5):
- for j in range(5):
- text = ax.text(j, i, acc[i, j], ha="center", va="center", color="w")
- ax.set_title("Accuracy varying hyper-parameters")
- fig.tight_layout()
- plt.show()
else:
acc = run_model (data, train, test, train_part, args)
print(acc)
if __name__ == "__main__":
main()
-