-rwxr-xr-x  evaluate.py                                   |  68 +++++++++++++++++++++++++++++++++++++-------------------------------
-rw-r--r--  report/fig/256t1_e200D5_cm.pdf                | bin 0 -> 16811 bytes
-rw-r--r--  report/fig/Depth_Trees_P2.pdf                 | bin 0 -> 48237 bytes
-rw-r--r--  report/fig/Depth_Trees_P3.pdf                 | bin 0 -> 50791 bytes
-rw-r--r--  report/fig/Depth_Trees_P3_fixedestimators.pdf | bin 0 -> 47172 bytes
-rw-r--r--  report/fig/e100k256d5_cm.pdf                  | bin 0 -> 16780 bytes
-rw-r--r--  report/fig/est_vocsize.pdf                    | bin 0 -> 13935 bytes
-rw-r--r--  report/fig/leaves_vocsize.pdf                 | bin 0 -> 12960 bytes
8 files changed, 37 insertions(+), 31 deletions(-)
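
Two of the knobs this commit touches interact through a simple size formula: the histogram built by make_histogram has one bin per KMeans cluster, or one bin per (tree, leaf) pair when the RandomTreesEmbedding codebook is used, so the new --embest flag directly scales the vocabulary. A minimal sketch of that arithmetic follows; the default values are illustrative assumptions, only the two formulas come from the diff below.

    # Histogram length implied by make_histogram after this change (see the
    # diff below). Default values here are illustrative; only the formulas
    # (kmean, or embest * leaves) are taken from the code.
    def hist_size(kmean=0, embest=256, leaves=4):
        # KMeans codebook: one bin per cluster.
        # RandomTreesEmbedding codebook: one bin per (tree, leaf) pair.
        return kmean if kmean else embest * leaves

    print(hist_size(kmean=256))             # 256
    print(hist_size(embest=256, leaves=4))  # 1024
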
diff --git a/evaluate.py b/evaluate.py
index df0c79b..d4d0b07 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -11,6 +11,7 @@ import logging
 from logging import debug
 from sklearn.cluster import KMeans
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import RandomTreesEmbedding
 from sklearn.metrics import accuracy_score
 import time
@@ -25,6 +26,8 @@ parser.add_argument("-D", "--treedepth", help="depth of trees", type=int, defaul
 parser.add_argument("-v", "--verbose", help="Use verbose output", action='store_true')
 parser.add_argument("-t", "--timer", help="Display execution time", action='store_true')
 parser.add_argument("-T", "--testmode", help="Testmode", action='store_true')
+parser.add_argument("-E", "--embest", help="RandomTreesEmbedding estimators", type=int, default=256)
+parser.add_argument("-r", "--randomness", help="Randomness parameter", type=int, default=0)
 parser.add_argument("-s", "--seed", help="Seed to use for random_state when creating trees", type=int, default=0)
 
 args = parser.parse_args()
@@ -35,7 +38,7 @@ def make_histogram(data, model, args):
     if args.kmean:
         hist_size = args.kmean
     else:
-        hist_size = args.estimators*args.leaves
+        hist_size = args.embest*args.leaves
 
     histogram = np.zeros((data.shape[0], data.shape[1],hist_size))
     for i in range(data.shape[0]):
@@ -54,20 +57,22 @@ def run_model (data, train, test, train_part, args):
     if (args.kmean):
         logging.debug("Computing KMeans with", train_part.shape[0], "keywords")
-        kmeans = KMeans(n_clusters=args.kmean, n_init=args.estimators, random_state=args.seed).fit(train_part)
+        kmeans = KMeans(n_clusters=args.kmean, n_init=1, random_state=args.seed).fit(train_part)
         hist_train = make_histogram(train, kmeans, args)
         hist_test = make_histogram(test, kmeans, args)
     else:
-        trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.estimators, random_state=args.seed).fit(train_part)
+        trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.embest, random_state=args.seed).fit(train_part)
         hist_train = make_histogram(train, trees, args)
         hist_test = make_histogram(test, trees, args)
         logging.debug("Generating histograms")
-    logging.debug("Keywords shape", hist_train.shape, "\n")
     logging.debug("Planting trees...")
-    clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
+    if args.randomness:
+        clf = RandomForestClassifier(max_features=args.randomness, n_estimators=args.estimators, max_depth=args.treedepth, random_state=0)
+    else:
+        clf = RandomForestClassifier(n_estimators=args.estimators, max_depth=args.treedepth, random_state=args.seed)
     clf.fit(
         hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
         np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))
@@ -76,6 +81,8 @@ def run_model (data, train, test, train_part, args):
     test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
     test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])
+    train_pred = clf.predict(hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])))
+    train_label = np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1])
 
     if args.timer:
         end = time.time()
@@ -84,7 +91,7 @@ def run_model (data, train, test, train_part, args):
         skplt.metrics.plot_confusion_matrix(test_pred, test_label, normalize=True)
         plt.show()
 
-    return accuracy_score(test_pred, test_label)
+    return accuracy_score(test_pred, test_label)#, accuracy_score(train_pred, train_label), end-start
@@ -96,35 +103,34 @@ def main():
     logging.debug("Verbose is on")
 
     if args.testmode:
-        cnt = 0
-        acc = np.zeros((5,5))
-        for i in range(5):
-            args.estimators = (i+1)*200
-            cnt+=1
-            for j in range(5):
-                args.treedepth = j + 1
-                print("Step ", cnt)
-                acc[i][j] = run_model (data, train, test, train_part, args)
-                print("Accuracy ",acc[i][j])
-                cnt+=1
-        fig, ax = plt.subplots()
-        im = ax.imshow(acc)
-        ax.set_xticks(np.arange(5))
-        ax.set_yticks(np.arange(5))
-        ax.set_xlabel('Number of trees')
-        ax.set_ylabel('Tree depth')
+        acc = np.zeros((3,100))
+        a = np.zeros(100)
+        for i in range(100):
+            if i <= 10:
+                args.kmean = (i+1)
+            else:
+                args.kmean = 15*i
+            a[i] = args.kmean
+            print("Kmeans: ",args.kmean)
+            acc[0][i], acc[1][i], acc[2][i] = run_model (data, train, test, train_part, args)
+            print("Accuracy test:",acc[0][i], "Accuracy train:", acc[1][i])
+
+        plt.plot(a,1-acc[0])
+        plt.plot(a,1-acc[1])
+        plt.ylabel('Normalized Classification Error')
+        plt.xlabel('Vocabulary size')
+        plt.title('Classification error varying vocabulary size')
+        plt.legend(('Test','Train'),loc='upper right')
+        plt.show()
+        plt.plot(a,acc[2])
+        plt.ylabel('Time (s)')
+        plt.xlabel('Vocabulary size')
+        plt.title('Time complexity varying vocabulary size')
+        plt.show()
-        # Loop over data dimensions and create text annotations.
-        for i in range(5):
-            for j in range(5):
-                text = ax.text(j, i, acc[i, j], ha="center", va="center", color="w")
-        ax.set_title("Accuracy varying hyper-parameters")
-        fig.tight_layout()
-        plt.show()
     else:
         acc = run_model (data, train, test, train_part, args)
         print(acc)
 
 if __name__ == "__main__":
     main()
-
diff --git a/report/fig/256t1_e200D5_cm.pdf b/report/fig/256t1_e200D5_cm.pdf
new file mode 100644
index 0000000..66c33c3
--- /dev/null
+++ b/report/fig/256t1_e200D5_cm.pdf
Binary files differ
diff --git a/report/fig/Depth_Trees_P2.pdf b/report/fig/Depth_Trees_P2.pdf
new file mode 100644
index 0000000..18c85f4
--- /dev/null
+++ b/report/fig/Depth_Trees_P2.pdf
Binary files differ
diff --git a/report/fig/Depth_Trees_P3.pdf b/report/fig/Depth_Trees_P3.pdf
new file mode 100644
index 0000000..8d9f7f2
--- /dev/null
+++ b/report/fig/Depth_Trees_P3.pdf
Binary files differ
diff --git a/report/fig/Depth_Trees_P3_fixedestimators.pdf b/report/fig/Depth_Trees_P3_fixedestimators.pdf
new file mode 100644
index 0000000..c042d89
--- /dev/null
+++ b/report/fig/Depth_Trees_P3_fixedestimators.pdf
Binary files differ
diff --git a/report/fig/e100k256d5_cm.pdf b/report/fig/e100k256d5_cm.pdf
new file mode 100644
index 0000000..649f597
--- /dev/null
+++ b/report/fig/e100k256d5_cm.pdf
Binary files differ
diff --git a/report/fig/est_vocsize.pdf b/report/fig/est_vocsize.pdf
new file mode 100644
index 0000000..998d853
--- /dev/null
+++ b/report/fig/est_vocsize.pdf
Binary files differ
diff --git a/report/fig/leaves_vocsize.pdf b/report/fig/leaves_vocsize.pdf
new file mode 100644
index 0000000..395077b
--- /dev/null
+++ b/report/fig/leaves_vocsize.pdf
Binary files differ
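
Taken together, the changes above separate the codebook size (the new -E/--embest count for RandomTreesEmbedding) from the classifier size (args.estimators for RandomForestClassifier) and route the new -r/--randomness value into max_features. The body of make_histogram is not shown in this diff, so how the bins are filled is an assumption in the sketch below; a bincount over KMeans assignments and a sum over RandomTreesEmbedding's one-hot leaf encoding are one plausible reading. Every size and hyper-parameter value here is chosen purely for illustration on synthetic descriptors.

    # Sketch of the histogram + random-forest pipeline evaluate.py implements,
    # on synthetic data. make_histogram's internals are not in this diff, so
    # the bin-filling below (bincount / summed one-hot leaves) is an assumption.
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding

    rng = np.random.default_rng(0)
    n_classes, imgs_per_class, descs_per_img, dim = 3, 10, 50, 16
    # Fake descriptors, shaped (class, image, descriptor, feature), nudged per class.
    data = rng.normal(size=(n_classes, imgs_per_class, descs_per_img, dim))
    data += np.arange(n_classes)[:, None, None, None]
    train_part = data.reshape(-1, dim)          # pooled descriptors for the codebook

    use_kmeans = False                          # args.kmean truthy -> KMeans path
    if use_kmeans:
        # KMeans codebook (n_init=1 as in the change above): one bin per cluster.
        codebook = KMeans(n_clusters=64, n_init=1, random_state=0).fit(train_part)
        def to_hist(img):                       # img: (descs_per_img, dim)
            return np.bincount(codebook.predict(img), minlength=64)
    else:
        # RandomTreesEmbedding codebook sized by the new --embest flag.
        codebook = RandomTreesEmbedding(n_estimators=256, max_leaf_nodes=4,
                                        random_state=0).fit(train_part)
        def to_hist(img):                       # sum one-hot leaf rows per image
            return np.asarray(codebook.transform(img).sum(axis=0)).ravel()

    hists = np.array([to_hist(img) for cls in data for img in cls])
    labels = np.repeat(np.arange(n_classes), imgs_per_class)

    # The new -r/--randomness value becomes max_features on the classifier.
    clf = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=3,
                                 random_state=0).fit(hists, labels)
    print("train accuracy:", clf.score(hists, labels))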