#!/usr/bin/python
# EE4 Selected Topics From Computer Vision Coursework
# Vasil Zlatanov, Nunzio Pucci
"""Classify images from descriptor histograms with a random forest.

A codebook is learned from ``desc_sel`` either with K-Means (``--kmean N``)
or with a ``RandomTreesEmbedding``.  Every sample in ``desc_tr`` / ``desc_te``
is turned into a histogram over that codebook and a
``RandomForestClassifier`` is trained on the histograms.
"""

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scikitplot as skplt
import argparse
import logging
from logging import debug
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import accuracy_score
import time

parser = argparse.ArgumentParser()
# The data path takes a value; it used to be a ``store_true`` flag, which
# turned ``-d`` into ``True`` and made ``np.load`` fail.
parser.add_argument("-d", "--data", help="Data path", type=str,
                    default='data.npz')
parser.add_argument("-c", "--conf_mat", help="Show visual confusion matrix",
                    action='store_true')
parser.add_argument("-k", "--kmean",
                    help="Perform kmean clustering with --kmean cluster centers",
                    type=int, default=0)
parser.add_argument("-l", "--leaves", help="Maximum leaf nodes for RF classifier",
                    type=int, default=256)
parser.add_argument("-e", "--estimators", help="number of estimators to be used",
                    type=int, default=100)
parser.add_argument("-D", "--treedepth", help="depth of trees", type=int,
                    default=5)
parser.add_argument("-v", "--verbose", help="Use verbose output",
                    action='store_true')
parser.add_argument("-t", "--timer", help="Display execution time",
                    action='store_true')
parser.add_argument("-T", "--testmode", help="Testmode", action='store_true')
parser.add_argument("-E", "--embest", help="RandomTreesEmbedding estimators",
                    type=int, default=256)
parser.add_argument("-r", "--randomness", help="Randomness parameter", type=int,
                    default=0)
args = parser.parse_args()

if args.verbose:
    logging.basicConfig(level=logging.DEBUG)


def make_histogram(data, model, args):
    """Build one codeword histogram per sample.

    Parameters:
        data: array indexed as ``data[i][j]``; each entry is transposed before
            being handed to the model (presumably (features, descriptors) per
            sample -- confirm against the data file).
        model: a fitted ``KMeans`` when ``args.kmean`` is non-zero, otherwise
            a fitted ``RandomTreesEmbedding``.
        args: parsed command-line namespace (reads ``kmean``, ``embest`` and
            ``leaves``).

    Returns:
        Float array of shape ``(data.shape[0], data.shape[1], hist_size)``,
        where ``hist_size`` is ``args.kmean`` for K-Means and
        ``args.embest * args.leaves`` for the tree embedding.
    """
    if args.kmean:
        hist_size = args.kmean
    else:
        hist_size = args.embest * args.leaves

    histogram = np.zeros((data.shape[0], data.shape[1], hist_size))

    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            descriptors = data[i][j].T
            if args.kmean:
                # Count how many descriptors fall into each cluster.
                histogram[i][j] = np.bincount(model.predict(descriptors),
                                              minlength=args.kmean)
            else:
                # One column of leaf indices per tree; count per tree and
                # concatenate the per-tree counts into a single vector.
                # NOTE(review): this relies on every index returned by
                # ``apply`` being < args.leaves, otherwise the reshape below
                # fails -- confirm for small --leaves values.
                leaves = model.apply(descriptors)
                leaves = np.apply_along_axis(np.bincount, axis=0, arr=leaves,
                                             minlength=args.leaves)
                histogram[i][j] = leaves.reshape(hist_size)
    return histogram


def _flatten_histograms(hist):
    """Collapse the two leading axes so each row is one sample's histogram."""
    return hist.reshape((hist.shape[0] * hist.shape[1], hist.shape[2]))


def _class_labels(hist):
    """Return labels matching ``_flatten_histograms``: the first-axis index."""
    return np.repeat(np.arange(hist.shape[0]), hist.shape[1])


def run_model(data, train, test, train_part, args, return_details=False):
    """Learn a codebook, train the random forest and evaluate it.

    Parameters:
        data: the loaded data archive (unused here, kept for the callers).
        train: training descriptors, passed to ``make_histogram``.
        test: test descriptors, passed to ``make_histogram``.
        train_part: descriptors the codebook model is fitted on.
        args: parsed command-line namespace.
        return_details: when true, return ``(test_accuracy, train_accuracy,
            elapsed_seconds)`` instead of the test accuracy alone.

    Returns:
        The test accuracy as a float, or the 3-tuple described above when
        ``return_details`` is set.
    """
    # Always take the start time: it is cheap and the detailed return
    # value needs it even when --timer is off.
    start = time.time()

    if args.kmean:
        logging.debug("Computing KMeans with %d keywords", train_part.shape[0])
        codebook = KMeans(n_clusters=args.kmean, n_init=1,
                          random_state=0).fit(train_part)
    else:
        codebook = RandomTreesEmbedding(max_leaf_nodes=args.leaves,
                                        n_estimators=args.embest,
                                        random_state=0).fit(train_part)

    logging.debug("Generating histograms")
    hist_train = make_histogram(train, codebook, args)
    hist_test = make_histogram(test, codebook, args)
    logging.debug("Keywords shape %s", hist_train.shape)

    logging.debug("Planting trees...")
    if args.randomness:
        clf = RandomForestClassifier(max_features=args.randomness,
                                     n_estimators=args.estimators,
                                     max_depth=args.treedepth,
                                     random_state=0)
    else:
        clf = RandomForestClassifier(n_estimators=args.estimators,
                                     max_depth=args.treedepth,
                                     random_state=0)

    train_features = _flatten_histograms(hist_train)
    train_label = _class_labels(hist_train)
    clf.fit(train_features, train_label)
    logging.debug("Random forests created")

    test_pred = clf.predict(_flatten_histograms(hist_test))
    test_label = _class_labels(hist_test)

    train_acc = None
    if return_details:
        # Only needed for the train-error curve drawn in test mode.
        train_acc = accuracy_score(train_label, clf.predict(train_features))

    end = time.time()
    if args.timer:
        print("Execution time: ", end - start)

    if args.conf_mat:
        # plot_confusion_matrix expects (y_true, y_pred).
        skplt.metrics.plot_confusion_matrix(test_label, test_pred,
                                            normalize=True)
        plt.show()

    test_acc = accuracy_score(test_label, test_pred)
    if return_details:
        return test_acc, train_acc, end - start
    return test_acc


def main():
    """Load the descriptor archive and run one evaluation or the sweep.

    With ``--testmode`` the vocabulary size (``args.kmean``) is swept over
    100 values and the test/train error and run time are plotted; otherwise
    a single model is evaluated and its test accuracy printed.
    """
    data = np.load(args.data)

    train = data['desc_tr']
    test = data['desc_te']
    train_part = data['desc_sel'].T

    logging.debug("Verbose is on")
    if args.testmode:
        steps = 100
        # Rows: test accuracy, train accuracy, elapsed seconds.
        acc = np.zeros((3, steps))
        a = np.zeros(steps)
        for i in range(steps):
            # Fine-grained sizes first (1..11), then steps of 15.
            if i <= 10:
                args.kmean = (i + 1)
            else:
                args.kmean = 15 * i
            a[i] = args.kmean
            print("Kmeans: ", args.kmean)
            acc[0][i], acc[1][i], acc[2][i] = run_model(
                data, train, test, train_part, args, return_details=True)
            print("Accuracy test:", acc[0][i], "Accuracy train:", acc[1][i])

        plt.plot(a, 1 - acc[0])
        plt.plot(a, 1 - acc[1])
        plt.ylabel('Normalized Classification Error')
        plt.xlabel('Vocabulary size')
        plt.title('Classification error varying vocabulary size')
        plt.legend(('Test', 'Train'), loc='upper right')
        plt.show()

        plt.plot(a, acc[2])
        plt.ylabel('Time (s)')
        plt.xlabel('Vocabulary size')
        plt.title('Time complexity varying vocabulary size')
        plt.show()
    else:
        acc = run_model(data, train, test, train_part, args)
        print(acc)


if __name__ == "__main__":
    main()