diff options
-rw-r--r-- | lenet.py | 31 | ||||
-rw-r--r-- | report/paper.md | 87 |
2 files changed, 65 insertions, 53 deletions
@@ -16,6 +16,7 @@ from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA from classifier_metrics_impl import classifier_score_from_logits from sklearn.utils import shuffle +from sklearn.manifold import TSNE def import_mnist(): from tensorflow.examples.tutorials.mnist import input_data @@ -141,12 +142,12 @@ def train_classifier(x_train, y_train, x_val, y_val, batch_size=128, epochs=100, model.save_weights('./weights.h5') return model -def test_classifier(model, x_test, y_true, conf_mat=False, pca=False): +def test_classifier(model, x_test, y_true, conf_mat=False, pca=False, tsne=False): x_test = np.pad(x_test, ((0,0),(2,2),(2,2),(0,0)), 'constant') - y_pred = model.predict(x_test) - logits = tf.convert_to_tensor(y_pred, dtype=tf.float32) - inception_score = tf.keras.backend.eval(classifier_score_from_logits(logits)) - y_pred = np.argmax(y_pred, axis=1) + logits = model.predict(x_test) + tf_logits = tf.convert_to_tensor(logits, dtype=tf.float32) + inception_score = tf.keras.backend.eval(classifier_score_from_logits(tf_logits)) + y_pred = np.argmax(logits, axis=1) y_true = np.argmax(y_true, axis=1) plot_example_errors(y_pred, y_true, x_test) cm = confusion_matrix(y_true, y_pred) @@ -158,16 +159,24 @@ def test_classifier(model, x_test, y_true, conf_mat=False, pca=False): plt.show() if pca: set_pca = PCA(n_components=2) - pca_rep = np.reshape(x_test, (x_test.shape[0], x_test.shape[1]*x_test.shape[2])) - print(pca_rep.shape) - pca_rep = set_pca.fit_transform(pca_rep) - print(pca_rep.shape) + pca_rep = set_pca.fit_transform(logits) pca_rep, y_tmp = shuffle(pca_rep, y_true, random_state=0) - plt.scatter(pca_rep[:100, 0], pca_rep[:100, 1], c=y_true[:100], edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('Paired', 10)) + plt.scatter(pca_rep[:1000, 0], pca_rep[:1000, 1], c=y_true[:1000], edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('Paired', 10)) plt.xlabel('component 1') plt.ylabel('component 2') plt.colorbar(); plt.show() + if tsne: + tsne = TSNE(n_components=2, random_state=0) + components = tsne.fit_transform(logits) + print(components.shape) + components, y_tmp = shuffle(components, y_true, random_state=0) + plt.scatter(components[:1000, 0], components[:1000, 1], c=y_true[:1000], edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('Paired', 10)) + plt.xlabel('component 1') + plt.ylabel('component 2') + plt.colorbar(); + plt.show() + return accuracy_score(y_true, y_pred), inception_score @@ -202,4 +211,4 @@ if __name__ == '__main__': x_train, y_train, x_val, y_val, x_t, y_t = import_mnist() print(y_t.shape) model = train_classifier(x_train[:100], y_train[:100], x_val, y_val, epochs=3) - print(test_classifier(model, x_t, y_t, pca=True)) + print(test_classifier(model, x_t, y_t, pca=False, tsne=True)) diff --git a/report/paper.md b/report/paper.md index 35d8ba7..fbb54f3 100644 --- a/report/paper.md +++ b/report/paper.md @@ -1,17 +1,25 @@ # Introduction -In this coursework we present two variants of the GAN architecture - DCGAN and CGAN, applied to the MNIST dataaset and evaluate performance metrics across various optimisations techniques. The MNIST dataset contains 60,000 training images and 10,000 testing images of size 28x28, spread across ten classes representing the ten handwritten digits. +In this coursework we present two variants of the GAN architecture - DCGAN and CGAN, applied to the MNIST dataset and evaluate performance metrics across various optimisations techniques. The MNIST dataset contains 60,000 training images and 10,000 testing images of size 28x28, spread across ten classes representing the ten handwritten digits. ## GAN -Generative Adversarial Networks present a system of models which learn to output data, similar to training data. A trained GAN takes noise as an input and is able to provide an output with the same dimensions and ideally features as the samples it has been trained with. +Generative Adversarial Networks present a system of models which learn to output data, similar to training data. A trained GAN takes noise as an input and is able to provide an output with the same dimensions and relevant features as the samples it has been trained with. GAN's employ two neural networks - a *discriminator* and a *generator* which contest in a zero-sum game. The task of the *discriminator* is to distinguish generated images from real images, while the task of the generator is to produce realistic images which are able to fool the discriminator. -Training a shallow GAN with no convolutional layers poses multiple problems: mode collapse and generating low quality images due to unbalanced G-D losses. +Training a shallow GAN with no convolutional layers poses problems such as mode collapse and unbalanced G-D losses which lead to low quality image output. + +\begin{figure} +\begin{center} +\includegraphics[width=24em]{fig/generic_gan_mode_collapse.pdf} +\caption{Vanilla GAN mode collapse} +\label{fig:mode_collapse} +\end{center} +\end{figure} -Mode collapse can be observed in figure \ref{fig:mode_collapse}, after 200.000 iterations of the GAN network presented in appendix, figure \ref{fig:vanilla_gan} . The output of the generator only represents few of the labels originally fed. At that point the loss function of the generator stops -improving as shown in figure \ref{fig:vanilla_loss}. We observe, the discriminator loss tentding to zero as it learns ti classify the fake 1's, while the generator is stuck producing 1's. + +Mode collapse is achieved with our naive *vanilla GAN* (Appendix-\ref{fig:vanilla_gan}) implementation after 200,000 epochs. The generated images observed during a mode collapse can be seen on figure \ref{fig:mode_collapse}. The output of the generator only represents few of the labels originally fed. When mode collapse is reached loss function of the generator stops improving as shown in figure \ref{fig:vanilla_loss}. We observe, the discriminator loss tends to zero as the discriminator learns to assume and classify the fake 1's, while the generator is stuck producing 1 and hence not able to improve. A significant improvement to this vanilla architecture is Deep Convolutional Generative Adversarial Networks (DCGAN). @@ -39,7 +47,7 @@ The main architecture used can be observed in figure \ref{fig:dcganarc}. ## Tests on MNIST -We propose 3 different architectures, varying the size of convolutional layers in the generator, while retaining the structure proposed in figure \ref{fig:dcganarc}: +We evaluate three different GAN architectures, varying the size of convolutional layers in the generator, while retaining the structure presented in figure \ref{fig:dcganarc}: \begin{itemize} \item Shallow: Conv128-Conv64 @@ -56,17 +64,13 @@ We propose 3 different architectures, varying the size of convolutional layers i \end{center} \end{figure} -It is possible to notice that using deeper architectures it is possible to balance G-D losses more easilly. Medium DCGAN achieves a very good performance, -balancing both binary cross entropy losses ar around 1 after 5.000 epochs, showing significantly lower oscillation for longer training even when compared to -Deep DCGAN. +We observed that the deep architectures result in a more easily achievable equilibria of G-D losses. +Our medium depth DCGAN achieves very good performance, balancing both binary cross entropy losses at approximately 0.9 after 5.000 epochs, reaching equilibrium quicker and with less oscillation that the Deepest DCGAN tested. -Since we are training with no labels, the generator will simply try to output images that fool the discriminator, but do not directly map to one specific class. -Examples of this can be observed for all the output groups reported above as some of the shapes look very odd (but smooth enough to be labelled as real). This -specific issue is solved by training the network for more epochs or introducing a deeper architecture, as it can be deducted from a qualitative comparison +As DCGAN is trained with no labels, the generator primary objective is to output images that fool the discriminator, but does not intrinsically separate the classes form one another. Therefore we sometimes observe oddly shape fused digits which may temporarily full be labeled real by the discriminator. This issue is solved by training the network for more epochs or introducing a deeper architecture, as it can be deducted from a qualitative comparison between figures \ref{fig:dcmed}, \ref{fig:dcshort} and \ref{fig:dclong}. -Applying Virtual Batch Normalization on Medium DCGAN does not provide observable changes in G-D balancing, but reduces within-batch correlation. Although it -is difficult to qualitatively assess the improvements, figure \ref{fig:vbn_dc} shows results of the introduction of this technique. +Applying Virtual Batch Normalization our Medium DCGAN does not provide observable changes in G-D balancing, but reduces within-batch correlation. Although it is difficult to qualitatively assess the improvements, figure \ref{fig:vbn_dc} shows results of the introduction of this technique. \begin{figure} \begin{center} @@ -76,11 +80,10 @@ is difficult to qualitatively assess the improvements, figure \ref{fig:vbn_dc} s \end{center} \end{figure} -We evaluated the effect of different dropout rates (results in appendix, figures \ref{fig:dcdrop1_1}, \ref{fig:dcdrop1_2}, \ref{fig:dcdrop2_1}, \ref{fig:dcdrop2_2}) and concluded that the optimization -of this parameter is essential to obtain good performance: a high dropout rate would result in DCGAN producing only artifacts that do not really match any specific class due to the generator performing better than the discriminator. Conversely a low dropout rate would lead to an initial stabilisation of G-D losses, but it would result into oscillation when training for a large number of epochs. +We evaluated the effect of different dropout rates (results in appendix figures \ref{fig:dcdrop1_1}, \ref{fig:dcdrop1_2}, \ref{fig:dcdrop2_1}, \ref{fig:dcdrop2_2}) and concluded that the optimisation +of the droupout hyper-parameter is essential for maximising performance. A high dropout rate results in DCGAN producing only artifacts that do not match any specific class due to the generator performing better than the discriminator. Conversely a low dropout rate leads to an initial stabilisation of G-D losses, but ultimately results in instability under the form of oscillation when training for a large number of epochs. -While training the different proposed DCGAN architectures, we did not observe mode collapse, confirming that the architecture used performed better than -the simple GAN presented in the introduction. +While training the different proposed DCGAN architectures, we did not observe mode collapse, indicating the DCGAN is less prone to a collapse compared to our *vanilla GAN*. # CGAN @@ -148,24 +151,28 @@ architectures in Q2.** We measure the performance of the considered GAN's using the Inecption score [-inception], as calculated with L2-Net logits. -$$ \textrm{IS}(x) = \exp(\mathcal{E}_x \left( \textrm{KL} ( p(y\|x) \|\| p(y) ) \right) ) $$ +$$ \textrm{IS}(x) = \exp(\mathbb{E}_x \left( \textrm{KL} ( p(y\mid x) \| p(y) ) \right) ) $$ +``` \begin{table}[] \begin{tabular}{llll} - & \begin{tabular}[c]{@{}l@{}}Test \\ Accuracy \\ (L2-Net)\end{tabular} & \begin{tabular}[c]{@{}l@{}}Inception \\ Score \\ (L2-Net)\end{tabular} & \begin{tabular}[c]{@{}l@{}}Execution \\ time\\ (Training \\ GAN)\end{tabular} \\ \hline - Shallow CGAN & 0.645 & 3.57 & 8:14 \\ - Medium CGAN & 0.715 & 3.79 & 10:23 \\ - Deep CGAN & 0.739 & 3.85 & 16:27 \\ - Convolutional CGAN & 0.737 & 4 & 25:27 \\ - \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ One-sided label \\ smoothing\end{tabular} & 0.749 & 3.643 & 10:42 \\ - \begin{tabular}[c]{@{}l@{}}Convolutional CGAN\\ One-sided label \\ smoothing\end{tabular} & 0.601 & 2.494 & 27:36 \\ - \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Dropout 0.1\end{tabular} & 0.761 & 3.836 & 10:36 \\ - \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Dropout 0.5\end{tabular} & 0.725 & 3.677 & 10:36 \\ - \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Virtual Batch \\ Normalization\end{tabular} & ? & ? & ? \\ - \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Virtual Batch \\ Normalization\\ One-sided label \\ smoothing\end{tabular} & ? & ? & ? \\ - *MNIST original & 0.9846 & 9.685 & N/A - \end{tabular} - \end{table} +& \begin{tabular}[c]{@{}l@{}}Test \\ Accuracy \\ (L2-Net)\end{tabular} & \begin{tabular}[c]{@{}l@{}}Inception \\ Score \\ (L2-Net)\end{tabular} & \begin{tabular}[c]{@{}l@{}}Execution \\ time\\ (Training \\ GAN)\end{tabular} \\ \hline + Shallow CGAN & 0.645 & 3.57 & 8:14 \\ + Medium CGAN & 0.715 & 3.79 & 10:23 \\ + Deep CGAN & 0.739 & 3.85 & 16:27 \\ + Convolutional CGAN & 0.737 & 4 & 25:27 \\ + + \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ One-sided label \\ smoothing\end{tabular} & 0.749 & 3.643 & 10:42 \\ + \begin{tabular}[c]{@{}l@{}}Convolutional CGAN\\ One-sided label \\ smoothing\end{tabular} & 0.601 & 2.494 & 27:36 \\ + \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Dropout 0.1\end{tabular} & 0.761 & 3.836 & 10:36 \\ + \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Dropout 0.5\end{tabular} & 0.725 & 3.677 & 10:36 \\ + \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Virtual Batch \\ Normalization\end{tabular} & ? & ? & ? \\ + \begin{tabular}[c]{@{}l@{}}Medium CGAN\\ Virtual Batch \\ Normalization\\ One-sided label \\ smoothing\end{tabular} & ? & ? & ? \\ + *MNIST original & 0.9846 & 9.685 & N/A + +\end{tabular} +\end{table} +``` # Re-training the handwritten digit classifier @@ -245,7 +252,11 @@ as most of the testing images that got misclassified (mainly nines and fours) sh # Bonus -This is an open question. Do you have any other ideas to improve GANs or +## Relation to PCA + +Similarly to GAN's, PCA can be used to formulate **generative** models of a system. While GAN's are trained neural networks, PCA is a definite statistical procedure which perform orthogonal transformations of the data. While both attempt to identify the most important or *variant* features of the data (which we may then use to generate new data), PCA by itself is only able to extract linearly related features. In a purely linear system, a GAN would be converging to PCA. In a more complicated system, we would ndeed to identify relevant kernels in order to extract relevant features with PCA, while a GAN is able to leverage dense and convolutional neural network layers which may be trained to perform relevant transformations. + +* This is an open question. Do you have any other ideas to improve GANs or have more insightful and comparative evaluations of GANs? Ideas are not limited. For instance, \begin{itemize} @@ -295,14 +306,6 @@ architecture and loss function? \begin{figure} \begin{center} -\includegraphics[width=24em]{fig/generic_gan_mode_collapse.pdf} -\caption{Shallow GAN mode collapse} -\label{fig:mode_collapse} -\end{center} -\end{figure} - -\begin{figure} -\begin{center} \includegraphics[width=24em]{fig/short_dcgan_ex.pdf} \includegraphics[width=24em]{fig/short_dcgan.png} \caption{Shallow DCGAN} |