Diffstat (limited to 'evaluate.py')
-rwxr-xr-x  evaluate.py  68
1 file changed, 37 insertions(+), 31 deletions(-)
diff --git a/evaluate.py b/evaluate.py
index df0c79b..d4d0b07 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -11,6 +11,7 @@ import logging
 from logging import debug
 from sklearn.cluster import KMeans
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import RandomTreesEmbedding
 from sklearn.metrics import accuracy_score
 import time
@@ -25,6 +26,8 @@ parser.add_argument("-D", "--treedepth", help="depth of trees", type=int, defaul
 parser.add_argument("-v", "--verbose", help="Use verbose output", action='store_true')
 parser.add_argument("-t", "--timer", help="Display execution time", action='store_true')
 parser.add_argument("-T", "--testmode", help="Testmode", action='store_true')
+parser.add_argument("-E", "--embest", help="RandomTreesEmbedding estimators", type=int, default=256)
+parser.add_argument("-r", "--randomness", help="Randomness parameter", type=int, default=0)
 parser.add_argument("-s", "--seed", help="Seed to use for random_state when creating trees", type=int, default=0)
 
 args = parser.parse_args()
@@ -35,7 +38,7 @@ def make_histogram(data, model, args):
     if args.kmean:
         hist_size = args.kmean
     else:
-        hist_size = args.estimators*args.leaves
+        hist_size = args.embest*args.leaves
 
     histogram = np.zeros((data.shape[0], data.shape[1],hist_size))
     for i in range(data.shape[0]):
@@ -54,20 +57,22 @@ def run_model (data, train, test, train_part, args):
 
     if (args.kmean):
         logging.debug("Computing KMeans with", train_part.shape[0], "keywords")
-        kmeans = KMeans(n_clusters=args.kmean, n_init=args.estimators, random_state=args.seed).fit(train_part)
+        kmeans = KMeans(n_clusters=args.kmean, n_init=1, random_state=args.seed).fit(train_part)
         hist_train = make_histogram(train, kmeans, args)
         hist_test = make_histogram(test, kmeans, args)
 
     else:
-        trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.estimators, random_state=args.seed).fit(train_part)
+        trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.embest, random_state=args.seed).fit(train_part)
         hist_train = make_histogram(train, trees, args)
         hist_test = make_histogram(test, trees, args)
 
     logging.debug("Generating histograms")
-    logging.debug("Keywords shape", hist_train.shape, "\n")
 
     logging.debug("Planting trees...")
-    clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
+    if args.randomness:
+        clf = RandomForestClassifier(max_features=args.randomness, n_estimators=args.estimators, max_depth=args.treedepth, random_state=0)
+    else:
+        clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
     clf.fit(
         hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
         np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))
@@ -76,6 +81,8 @@ def run_model (data, train, test, train_part, args):
     test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
     test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])
+    train_pred = clf.predict(hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])))
+    train_label = np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1])
 
     if args.timer:
         end = time.time()
 
@@ -84,7 +91,7 @@ def run_model (data, train, test, train_part, args):
         skplt.metrics.plot_confusion_matrix(test_pred, test_label, normalize=True)
         plt.show()
 
-    return accuracy_score(test_pred, test_label)
+    return accuracy_score(test_pred, test_label), accuracy_score(train_pred, train_label), (end-start if args.timer else 0.0)
 
 
 def main():
@@ -96,35 +103,34 @@ def main():
         logging.debug("Verbose is on")
 
     if args.testmode:
-        cnt = 0
-        acc = np.zeros((5,5))
-        for i in range(5):
-            args.estimators = (i+1)*200
-            cnt+=1
-            for j in range(5):
-                args.treedepth = j + 1
-                print("Step ", cnt)
-                acc[i][j] = run_model (data, train, test, train_part, args)
-                print("Accuracy ",acc[i][j])
-                cnt+=1
-        fig, ax = plt.subplots()
-        im = ax.imshow(acc)
-        ax.set_xticks(np.arange(5))
-        ax.set_yticks(np.arange(5))
-        ax.set_xlabel('Number of trees')
-        ax.set_ylabel('Tree depth')
+        acc = np.zeros((3,100))
+        a = np.zeros(100)
+        for i in range(100):
+            if i <= 10:
+                args.kmean = (i+1)
+            else:
+                args.kmean = 15*i
+            a[i] = args.kmean
+            print("Kmeans: ",args.kmean)
+            acc[0][i], acc[1][i], acc[2][i] = run_model (data, train, test, train_part, args)
+            print("Accuracy test:",acc[0][i], "Accuracy train:", acc[1][i])
+
+        plt.plot(a,1-acc[0])
+        plt.plot(a,1-acc[1])
+        plt.ylabel('Normalized Classification Error')
+        plt.xlabel('Vocabulary size')
+        plt.title('Classification error varying vocabulary size')
+        plt.legend(('Test','Train'),loc='upper right')
+        plt.show()
+        plt.plot(a,acc[2])
+        plt.ylabel('Time (s)')
+        plt.xlabel('Vocabulary size')
+        plt.title('Time complexity varying vocabulary size')
+        plt.show()
-
-        # Loop over data dimensions and create text annotations.
-        for i in range(5):
-            for j in range(5):
-                text = ax.text(j, i, acc[i, j], ha="center", va="center", color="w")
-        ax.set_title("Accuracy varying hyper-parameters")
-        fig.tight_layout()
-        plt.show()
     else:
         acc = run_model (data, train, test, train_part, args)
         print(acc)
 
 
 if __name__ == "__main__":
     main()
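The -E/--embest flag decouples the RandomTreesEmbedding vocabulary from the classifier's --estimators, which is why make_histogram now sizes its bins as embest*leaves. Below is a minimal sketch of that bag-of-visual-words encoding, not the script's make_histogram itself: the synthetic descriptors and the leaves/embest/seed stand-ins are assumptions for illustration.

    import numpy as np
    from sklearn.ensemble import RandomTreesEmbedding

    leaves, embest, seed = 4, 8, 0           # stand-ins for --leaves, --embest, --seed
    descriptors = np.random.rand(200, 32)    # synthetic 32-D local descriptors

    trees = RandomTreesEmbedding(max_leaf_nodes=leaves, n_estimators=embest,
                                 random_state=seed).fit(descriptors)

    # transform() one-hot encodes the leaf each descriptor lands in, one column
    # block per tree, so the encoding is at most embest*leaves wide -- the
    # hist_size that make_histogram allocates when --kmean is not given.
    encoded = trees.transform(descriptors)   # scipy sparse, (200, <= embest*leaves)
    histogram = np.asarray(encoded.sum(axis=0)).ravel()
    print(encoded.shape, int(histogram.sum()))  # sum == 200*embest: one leaf per tree

Summing the one-hot rows over an image's descriptors yields exactly the kind of fixed-length histogram the patched hist_size allocates.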
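In the KMeans branch, n_init drops from args.estimators to 1. In scikit-learn, n_init is the number of independent k-means restarts (the best run is kept), not a vocabulary parameter, so the old value re-clustered hundreds of times without adding histogram capacity. A sketch of the branch under that reading, with synthetic arrays standing in for train_part and one image's keywords:

    import numpy as np
    from sklearn.cluster import KMeans

    kmean, seed = 50, 0                      # stand-ins for --kmean and --seed
    train_part = np.random.rand(1000, 32)    # pooled training descriptors (synthetic)
    image_desc = np.random.rand(150, 32)     # one image's descriptors (synthetic)

    # A single restart is far cheaper; the only risk is settling for a worse
    # local optimum than the best of many restarts.
    kmeans = KMeans(n_clusters=kmean, n_init=1, random_state=seed).fit(train_part)

    words = kmeans.predict(image_desc)               # nearest-centroid visual words
    histogram = np.bincount(words, minlength=kmean)  # hist_size == args.kmean here
    print(histogram.shape)                           # (50,)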
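The new -r/--randomness flag forwards an integer to RandomForestClassifier's max_features, which caps how many features each split may consider; a smaller cap injects more randomness and decorrelates the trees at the cost of weaker individual splits. (Note this path also pins random_state=0 instead of honoring --seed.) A self-contained sketch with made-up data and parameter values:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    randomness, estimators, treedepth = 8, 100, 5   # stand-ins for -r, --estimators, -D
    X, y = make_classification(n_samples=300, n_features=64, random_state=0)

    # Each split inspects at most `randomness` of the 64 features.
    clf = RandomForestClassifier(max_features=randomness, n_estimators=estimators,
                                 max_depth=treedepth, random_state=0).fit(X, y)
    print(clf.score(X, y))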
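The clf.fit/clf.predict calls flatten 3-D histogram arrays before training. Assuming, as the reshape and np.repeat pattern suggests, that axis 0 indexes classes and axis 1 the samples per class, the rows and labels line up like this (shapes are illustrative only):

    import numpy as np

    n_classes, per_class, hist_size = 10, 20, 64   # assumed shapes
    hist_train = np.random.rand(n_classes, per_class, hist_size)

    # Flatten (class, sample, bin) to (row, bin); each row keeps its class id.
    X = hist_train.reshape((n_classes * per_class, hist_size))
    y = np.repeat(np.arange(n_classes), per_class)  # 20 zeros, 20 ones, ...
    print(X.shape, y.shape)                         # (200, 64) (200,)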