-rw-r--r--   classifier_metrics_impl.py | 1114
-rw-r--r--   lenet.py                   |    8
2 files changed, 1120 insertions(+), 2 deletions(-)
| diff --git a/classifier_metrics_impl.py b/classifier_metrics_impl.py new file mode 100644 index 0000000..2334d29 --- /dev/null +++ b/classifier_metrics_impl.py @@ -0,0 +1,1114 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#     http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model evaluation tools for TFGAN. + +These methods come from https://arxiv.org/abs/1606.03498, +https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401. + +NOTE: This implementation uses the same weights as in +https://github.com/openai/improved-gan/blob/master/inception_score/model.py, +but is more numerically stable and is an unbiased estimator of the true +Inception score even when splitting the inputs into batches. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os +import sys +import tarfile + +from six.moves import urllib + +from tensorflow.contrib.layers.python.layers import layers +from tensorflow.core.framework import graph_pb2 +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops +from tensorflow.python.ops import image_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_impl +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import gfile +from tensorflow.python.platform import resource_loader + + +__all__ = [ +    'get_graph_def_from_disk', +    'get_graph_def_from_resource', +    'get_graph_def_from_url_tarball', +    'preprocess_image', +    'run_image_classifier', +    'run_inception', +    'inception_score', +    'classifier_score', +    'classifier_score_from_logits', +    'frechet_inception_distance', +    'frechet_classifier_distance', +    'frechet_classifier_distance_from_activations', +    'mean_only_frechet_classifier_distance_from_activations', +    'diagonal_only_frechet_classifier_distance_from_activations', +    'kernel_inception_distance', +    'kernel_inception_distance_and_std', +    'kernel_classifier_distance', +    'kernel_classifier_distance_and_std', +    'kernel_classifier_distance_from_activations', +    'kernel_classifier_distance_and_std_from_activations', +    'INCEPTION_DEFAULT_IMAGE_SIZE', +] + +INCEPTION_URL = 'http://download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz' +INCEPTION_FROZEN_GRAPH = 'inceptionv1_for_inception_score.pb' +INCEPTION_INPUT = 'Mul:0' +INCEPTION_OUTPUT = 'logits:0' +INCEPTION_FINAL_POOL = 'pool_3:0' +INCEPTION_DEFAULT_IMAGE_SIZE = 299 + + +def _validate_images(images, image_size): +  images = ops.convert_to_tensor(images) +  images.shape.with_rank(4) +  
images.shape.assert_is_compatible_with([None, image_size, image_size, None])
+  return images
+
+
+def _symmetric_matrix_square_root(mat, eps=1e-10):
+  """Compute square root of a symmetric matrix.
+
+  Note that this is different from an elementwise square root. We want to
+  compute M' where M' = sqrt(mat) such that M' * M' = mat.
+
+  Also note that this method **only** works for symmetric matrices.
+
+  Args:
+    mat: Matrix to take the square root of.
+    eps: Small epsilon such that any element less than eps will not be square
+      rooted to guard against numerical instability.
+
+  Returns:
+    Matrix square root of mat.
+  """
+  # Unlike numpy, tensorflow's return order is (s, u, v)
+  s, u, v = linalg_ops.svd(mat)
+  # sqrt is unstable around 0; below eps, keep the (tiny) singular value as-is.
+  si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s))
+  # Note that the v returned by Tensorflow is v = V
+  # (when referencing the equation A = U S V^T)
+  # This is unlike Numpy which returns v = V^T
+  return math_ops.matmul(
+      math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
+
+
+def preprocess_image(images,
+                     height=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     width=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     scope=None):
+  """Prepare a batch of images for evaluation.
+
+  This is the preprocessing portion of the graph from
+  http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz.
+
+  Note that it expects Tensors in [0, 255]. This function maps pixel values to
+  [-1, 1] and resizes to match the InceptionV1 network.
+
+  Args:
+    images: 3-D or 4-D Tensor of images. Values are in [0, 255].
+    height: Integer. Height of resized output image.
+    width: Integer. Width of resized output image.
+    scope: Optional scope for name_scope.
+
+  Returns:
+    3-D or 4-D float Tensor of prepared image(s). Values are in [-1, 1].
+  """
+  is_single = images.shape.ndims == 3
+  with ops.name_scope(scope, 'preprocess', [images, height, width]):
+    if not images.dtype.is_floating:
+      images = math_ops.to_float(images)
+    if is_single:
+      images = array_ops.expand_dims(images, axis=0)
+    resized = image_ops.resize_bilinear(images, [height, width])
+    resized = (resized - 128.0) / 128.0
+    if is_single:
+      resized = array_ops.squeeze(resized, axis=0)
+    return resized
+
+
+def _kl_divergence(p, p_logits, q):
+  """Computes the Kullback-Leibler divergence between p and q.
+
+  This function uses p's logits in some places to improve numerical stability.
+
+  Specifically:
+
+  KL(p || q) = sum[ p * log(p / q) ]
+    = sum[ p * ( log(p)                - log(q) ) ]
+    = sum[ p * ( log_softmax(p_logits) - log(q) ) ]
+
+  Args:
+    p: A 2-D floating-point Tensor p_ij, where `i` corresponds to the minibatch
+      example and `j` corresponds to the probability of being in class `j`.
+    p_logits: A 2-D floating-point Tensor corresponding to logits for `p`.
+    q: A 1-D floating-point Tensor, where q_j corresponds to the probability
+      of class `j`.
+
+  Returns:
+    KL divergence between two distributions. Output dimension is 1D, one entry
+    per distribution in `p`.
+
+  Raises:
+    ValueError: If any of the inputs aren't floating-point.
+    ValueError: If p or p_logits aren't 2D.
+    ValueError: If q isn't 1D.
+  """ +  for tensor in [p, p_logits, q]: +    if not tensor.dtype.is_floating: +      raise ValueError('Input %s must be floating type.', tensor.name) +  p.shape.assert_has_rank(2) +  p_logits.shape.assert_has_rank(2) +  q.shape.assert_has_rank(1) +  return math_ops.reduce_sum( +      p * (nn_ops.log_softmax(p_logits) - math_ops.log(q)), axis=1) + + +def get_graph_def_from_disk(filename): +  """Get a GraphDef proto from a disk location.""" +  with gfile.FastGFile(filename, 'rb') as f: +    return graph_pb2.GraphDef.FromString(f.read()) + + +def get_graph_def_from_resource(filename): +  """Get a GraphDef proto from within a .par file.""" +  return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename)) + + +def get_graph_def_from_url_tarball(url, filename, tar_filename=None): +  """Get a GraphDef proto from a tarball on the web. + +  Args: +    url: Web address of tarball +    filename: Filename of graph definition within tarball +    tar_filename: Temporary download filename (None = always download) + +  Returns: +    A GraphDef loaded from a file in the downloaded tarball. +  """ +  if not (tar_filename and os.path.exists(tar_filename)): + +    def _progress(count, block_size, total_size): +      sys.stdout.write('\r>> Downloading %s %.1f%%' % +                       (url, +                        float(count * block_size) / float(total_size) * 100.0)) +      sys.stdout.flush() + +    tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress) +  with tarfile.open(tar_filename, 'r:gz') as tar: +    proto_str = tar.extractfile(filename).read() +  return graph_pb2.GraphDef.FromString(proto_str) + + +def _default_graph_def_fn(): +  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH, +                                        os.path.basename(INCEPTION_URL)) + + +def run_inception(images, +                  graph_def=None, +                  default_graph_def_fn=_default_graph_def_fn, +                  image_size=INCEPTION_DEFAULT_IMAGE_SIZE, +                  input_tensor=INCEPTION_INPUT, +                  output_tensor=INCEPTION_OUTPUT): +  """Run images through a pretrained Inception classifier. + +  Args: +    images: Input tensors. Must be [batch, height, width, channels]. Input shape +      and values must be in [-1, 1], which can be achieved using +      `preprocess_image`. +    graph_def: A GraphDef proto of a pretrained Inception graph. If `None`, +      call `default_graph_def_fn` to get GraphDef. +    default_graph_def_fn: A function that returns a GraphDef. Used if +      `graph_def` is `None. By default, returns a pretrained InceptionV3 graph. +    image_size: Required image width and height. See unit tests for the default +      values. +    input_tensor: Name of input Tensor. +    output_tensor: Name or list of output Tensors. This function will compute +      activations at the specified layer. Examples include INCEPTION_V3_OUTPUT +      and INCEPTION_V3_FINAL_POOL which would result in this function computing +      the final logits or the penultimate pooling layer. + +  Returns: +    Tensor or Tensors corresponding to computed `output_tensor`. + +  Raises: +    ValueError: If images are not the correct size. +    ValueError: If neither `graph_def` nor `default_graph_def_fn` are provided. 
+  """ +  images = _validate_images(images, image_size) + +  if graph_def is None: +    if default_graph_def_fn is None: +      raise ValueError('If `graph_def` is `None`, must provide ' +                       '`default_graph_def_fn`.') +    graph_def = default_graph_def_fn() + +  activations = run_image_classifier(images, graph_def, input_tensor, +                                     output_tensor) +  if isinstance(activations, list): +    for i, activation in enumerate(activations): +      if array_ops.rank(activation) != 2: +        activations[i] = layers.flatten(activation) +  else: +    if array_ops.rank(activations) != 2: +      activations = layers.flatten(activations) + +  return activations + + +def run_image_classifier(tensor, +                         graph_def, +                         input_tensor, +                         output_tensor, +                         scope='RunClassifier'): +  """Runs a network from a frozen graph. + +  Args: +    tensor: An Input tensor. +    graph_def: A GraphDef proto. +    input_tensor: Name of input tensor in graph def. +    output_tensor: A tensor name or list of tensor names in graph def. +    scope: Name scope for classifier. + +  Returns: +    Classifier output if `output_tensor` is a string, or a list of outputs if +    `output_tensor` is a list. + +  Raises: +    ValueError: If `input_tensor` or `output_tensor` aren't in the graph_def. +  """ +  input_map = {input_tensor: tensor} +  is_singleton = isinstance(output_tensor, str) +  if is_singleton: +    output_tensor = [output_tensor] +  classifier_outputs = importer.import_graph_def( +      graph_def, input_map, output_tensor, name=scope) +  if is_singleton: +    classifier_outputs = classifier_outputs[0] + +  return classifier_outputs + + +def classifier_score(images, classifier_fn, num_batches=1): +  """Classifier score for evaluating a conditional generative model. + +  This is based on the Inception Score, but for an arbitrary classifier. + +  This technique is described in detail in https://arxiv.org/abs/1606.03498. In +  summary, this function calculates + +  exp( E[ KL(p(y|x) || p(y)) ] ) + +  which captures how different the network's classification prediction is from +  the prior distribution over classes. + +  NOTE: This function consumes images, computes their logits, and then +  computes the classifier score. If you would like to precompute many logits for +  large batches, use classifier_score_from_logits(), which this method also +  uses. + +  Args: +    images: Images to calculate the classifier score for. +    classifier_fn: A function that takes images and produces logits based on a +      classifier. +    num_batches: Number of batches to split `generated_images` in to in order to +      efficiently run them through the classifier network. + +  Returns: +    The classifier score. A floating-point scalar of the same type as the output +    of `classifier_fn`. +  """ +  generated_images_list = array_ops.split( +      images, num_or_size_splits=num_batches) + +  # Compute the classifier splits using the memory-efficient `map_fn`. +  logits = functional_ops.map_fn( +      fn=classifier_fn, +      elems=array_ops.stack(generated_images_list), +      parallel_iterations=1, +      back_prop=False, +      swap_memory=True, +      name='RunClassifier') +  logits = array_ops.concat(array_ops.unstack(logits), 0) + +  return classifier_score_from_logits(logits) + + +def classifier_score_from_logits(logits): +  """Classifier score for evaluating a generative model from logits. 
+ +  This method computes the classifier score for a set of logits. This can be +  used independently of the classifier_score() method, especially in the case +  of using large batches during evaluation where we would like precompute all +  of the logits before computing the classifier score. + +  This technique is described in detail in https://arxiv.org/abs/1606.03498. In +  summary, this function calculates: + +  exp( E[ KL(p(y|x) || p(y)) ] ) + +  which captures how different the network's classification prediction is from +  the prior distribution over classes. + +  Args: +    logits: Precomputed 2D tensor of logits that will be used to +      compute the classifier score. + +  Returns: +    The classifier score. A floating-point scalar of the same type as the output +    of `logits`. +  """ +  logits.shape.assert_has_rank(2) + +  # Use maximum precision for best results. +  logits_dtype = logits.dtype +  if logits_dtype != dtypes.float64: +    logits = math_ops.to_double(logits) + +  p = nn_ops.softmax(logits) +  q = math_ops.reduce_mean(p, axis=0) +  kl = _kl_divergence(p, logits, q) +  kl.shape.assert_has_rank(1) +  log_score = math_ops.reduce_mean(kl) +  final_score = math_ops.exp(log_score) + +  if logits_dtype != dtypes.float64: +    final_score = math_ops.cast(final_score, logits_dtype) + +  return final_score + + +inception_score = functools.partial( +    classifier_score, +    classifier_fn=functools.partial( +        run_inception, output_tensor=INCEPTION_OUTPUT)) + + +def trace_sqrt_product(sigma, sigma_v): +  """Find the trace of the positive sqrt of product of covariance matrices. + +  '_symmetric_matrix_square_root' only works for symmetric matrices, so we +  cannot just take _symmetric_matrix_square_root(sigma * sigma_v). +  ('sigma' and 'sigma_v' are symmetric, but their product is not necessarily). + +  Let sigma = A A so A = sqrt(sigma), and sigma_v = B B. +  We want to find trace(sqrt(sigma sigma_v)) = trace(sqrt(A A B B)) +  Note the following properties: +  (i) forall M1, M2: eigenvalues(M1 M2) = eigenvalues(M2 M1) +     => eigenvalues(A A B B) = eigenvalues (A B B A) +  (ii) if M1 = sqrt(M2), then eigenvalues(M1) = sqrt(eigenvalues(M2)) +     => eigenvalues(sqrt(sigma sigma_v)) = sqrt(eigenvalues(A B B A)) +  (iii) forall M: trace(M) = sum(eigenvalues(M)) +     => trace(sqrt(sigma sigma_v)) = sum(eigenvalues(sqrt(sigma sigma_v))) +                                   = sum(sqrt(eigenvalues(A B B A))) +                                   = sum(eigenvalues(sqrt(A B B A))) +                                   = trace(sqrt(A B B A)) +                                   = trace(sqrt(A sigma_v A)) +  A = sqrt(sigma). Both sigma and A sigma_v A are symmetric, so we **can** +  use the _symmetric_matrix_square_root function to find the roots of these +  matrices. 
+ +  Args: +    sigma: a square, symmetric, real, positive semi-definite covariance matrix +    sigma_v: same as sigma + +  Returns: +    The trace of the positive square root of sigma*sigma_v +  """ + +  # Note sqrt_sigma is called "A" in the proof above +  sqrt_sigma = _symmetric_matrix_square_root(sigma) + +  # This is sqrt(A sigma_v A) above +  sqrt_a_sigmav_a = math_ops.matmul(sqrt_sigma, +                                    math_ops.matmul(sigma_v, sqrt_sigma)) + +  return math_ops.trace(_symmetric_matrix_square_root(sqrt_a_sigmav_a)) + + +def frechet_classifier_distance(real_images, +                                generated_images, +                                classifier_fn, +                                num_batches=1): +  """Classifier distance for evaluating a generative model. + +  This is based on the Frechet Inception distance, but for an arbitrary +  classifier. + +  This technique is described in detail in https://arxiv.org/abs/1706.08500. +  Given two Gaussian distribution with means m and m_w and covariance matrices +  C and C_w, this function calculates + +              |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2)) + +  which captures how different the distributions of real images and generated +  images (or more accurately, their visual features) are. Note that unlike the +  Inception score, this is a true distance and utilizes information about real +  world images. + +  Note that when computed using sample means and sample covariance matrices, +  Frechet distance is biased. It is more biased for small sample sizes. (e.g. +  even if the two distributions are the same, for a small sample size, the +  expected Frechet distance is large). It is important to use the same +  sample size to compute Frechet classifier distance when comparing two +  generative models. + +  NOTE: This function consumes images, computes their activations, and then +  computes the classifier score. If you would like to precompute many +  activations for real and generated images for large batches, please use +  frechet_clasifier_distance_from_activations(), which this method also uses. + +  Args: +    real_images: Real images to use to compute Frechet Inception distance. +    generated_images: Generated images to use to compute Frechet Inception +      distance. +    classifier_fn: A function that takes images and produces activations +      based on a classifier. +    num_batches: Number of batches to split images in to in order to +      efficiently run them through the classifier network. + +  Returns: +    The Frechet Inception distance. A floating-point scalar of the same type +    as the output of `classifier_fn`. +  """ +  real_images_list = array_ops.split( +      real_images, num_or_size_splits=num_batches) +  generated_images_list = array_ops.split( +      generated_images, num_or_size_splits=num_batches) + +  real_imgs = array_ops.stack(real_images_list) +  generated_imgs = array_ops.stack(generated_images_list) + +  # Compute the activations using the memory-efficient `map_fn`. +  def compute_activations(elems): +    return functional_ops.map_fn(fn=classifier_fn, +                                 elems=elems, +                                 parallel_iterations=1, +                                 back_prop=False, +                                 swap_memory=True, +                                 name='RunClassifier') + +  real_a = compute_activations(real_imgs) +  gen_a = compute_activations(generated_imgs) + +  # Ensure the activations have the right shapes. 
+  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
+  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
+
+  return frechet_classifier_distance_from_activations(real_a, gen_a)
+
+
+def mean_only_frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model from activations.
+
+  Given two Gaussian distributions with means m and m_w and covariance matrices
+  C and C_w, this function calculates
+
+                                |m - m_w|^2
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute Frechet classifier distance when comparing two
+  generative models.
+
+  In this variant, we only compute the difference between the means of the
+  fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet
+  still retains much of the same information as FID.
+
+  Args:
+    real_activations: 2D array of activations of real images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+    generated_activations: 2D array of activations of generated images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+
+  Returns:
+    The mean-only Frechet Inception distance. A floating-point scalar of the
+    same type as the output of the activations.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute means of activations.
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_w = math_ops.reduce_mean(generated_activations, 0)
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  mofid = mean
+  if activations_dtype != dtypes.float64:
+    mofid = math_ops.cast(mofid, activations_dtype)
+
+  return mofid
+
+
+def diagonal_only_frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This is based on the Frechet Inception distance, but for an arbitrary
+  classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distributions with means m and m_w and covariance matrices
+  C and C_w, this function calculates
+
+          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images. In this variant, we compute diagonal-only covariance matrices.
+  As a result, instead of computing an expensive matrix square root, we can do
+  something much simpler that has O(n) vs. O(n^2) space complexity.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute Frechet classifier distance when comparing two
+  generative models.
+
+  Args:
+    real_activations: 2D array of activations of real images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+    generated_activations: 2D array of activations of generated images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+
+  Returns:
+    The diagonal-only Frechet Inception distance. A floating-point scalar of
+    the same type as the output of the activations.
+
+  Raises:
+    ValueError: If the shapes of the variance and mean vectors are not equal.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute mean and covariance matrices of activations.
+  m, var = nn_impl.moments(real_activations, axes=[0])
+  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])
+
+  actual_shape = var.get_shape()
+  expected_shape = m.get_shape()
+
+  if actual_shape != expected_shape:
+    raise ValueError('shape: {} must match expected shape: {}'.format(
+        actual_shape, expected_shape))
+
+  # Compute the two components of FID.
+
+  # First the covariance component.
+  # Here, note that trace(A + B) = trace(A) + trace(B)
+  trace = math_ops.reduce_sum(
+      (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w)))
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  dofid = trace + mean
+  if activations_dtype != dtypes.float64:
+    dofid = math_ops.cast(dofid, activations_dtype)
+
+  return dofid
+
+
+def frechet_classifier_distance_from_activations(real_activations,
+                                                 generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This method computes the Frechet classifier distance from activations of
+  real images and generated images. This can be used independently of the
+  frechet_classifier_distance() method, especially in the case of using large
+  batches during evaluation where we would like to precompute all of the
+  activations before computing the classifier distance.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distributions with means m and m_w and covariance matrices
+  C and C_w, this function calculates
+
+                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the +  expected Frechet distance is large). It is important to use the same +  sample size to compute frechet classifier distance when comparing two +  generative models. + +  Args: +    real_activations: 2D Tensor containing activations of real data. Shape is +      [batch_size, activation_size]. +    generated_activations: 2D Tensor containing activations of generated data. +      Shape is [batch_size, activation_size]. + +  Returns: +   The Frechet Inception distance. A floating-point scalar of the same type +   as the output of the activations. + +  """ +  real_activations.shape.assert_has_rank(2) +  generated_activations.shape.assert_has_rank(2) + +  activations_dtype = real_activations.dtype +  if activations_dtype != dtypes.float64: +    real_activations = math_ops.to_double(real_activations) +    generated_activations = math_ops.to_double(generated_activations) + +  # Compute mean and covariance matrices of activations. +  m = math_ops.reduce_mean(real_activations, 0) +  m_w = math_ops.reduce_mean(generated_activations, 0) +  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0]) +  num_examples_generated = math_ops.to_double( +      array_ops.shape(generated_activations)[0]) + +  # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T +  real_centered = real_activations - m +  sigma = math_ops.matmul( +      real_centered, real_centered, transpose_a=True) / ( +          num_examples_real - 1) + +  gen_centered = generated_activations - m_w +  sigma_w = math_ops.matmul( +      gen_centered, gen_centered, transpose_a=True) / ( +          num_examples_generated - 1) + +  # Find the Tr(sqrt(sigma sigma_w)) component of FID +  sqrt_trace_component = trace_sqrt_product(sigma, sigma_w) + +  # Compute the two components of FID. + +  # First the covariance component. +  # Here, note that trace(A + B) = trace(A) + trace(B) +  trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component + +  # Next the distance between means. +  mean = math_ops.reduce_sum( +      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable. +  fid = trace + mean +  if activations_dtype != dtypes.float64: +    fid = math_ops.cast(fid, activations_dtype) + +  return fid + +frechet_inception_distance = functools.partial( +    frechet_classifier_distance, +    classifier_fn=functools.partial( +        run_inception, output_tensor=INCEPTION_FINAL_POOL)) + + +def kernel_classifier_distance(real_images, +                               generated_images, +                               classifier_fn, +                               num_classifier_batches=1, +                               max_block_size=1024, +                               dtype=None): +  """Kernel "classifier" distance for evaluating a generative model. + +  This is based on the Kernel Inception distance, but for an arbitrary +  embedding. + +  This technique is described in detail in https://arxiv.org/abs/1801.01401. +  Given two distributions P and Q of activations, this function calculates + +      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] +        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] + +  where k is the polynomial kernel + +      k(x, y) = ( x^T y / dimension + 1 )^3. + +  This captures how different the distributions of real and generated images' +  visual features are. Like the Frechet distance (and unlike the Inception +  score), this is a true distance and incorporates information about the +  target images. 
Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  NOTE: This function consumes images, computes their activations, and then
+  computes the classifier distance. If you would like to precompute many
+  activations for real and generated images for large batches, or to compute
+  multiple scores based on the same images, please use
+  kernel_classifier_distance_from_activations(), which this method also uses.
+
+  Args:
+    real_images: Real images to use to compute Kernel Inception distance.
+    generated_images: Generated images to use to compute Kernel Inception
+      distance.
+    classifier_fn: A function that takes images and produces activations based
+      on a classifier.
+    num_classifier_batches: Number of batches to split images into in order to
+      efficiently run them through the classifier network.
+    max_block_size: integer, default 1024. The distance estimator splits
+      samples into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
+      estimate.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+   as the output of the activations.
+  """
+  return kernel_classifier_distance_and_std(
+      real_images,
+      generated_images,
+      classifier_fn,
+      num_classifier_batches=num_classifier_batches,
+      max_block_size=max_block_size,
+      dtype=dtype)[0]
+
+
+kernel_inception_distance = functools.partial(
+    kernel_classifier_distance,
+    classifier_fn=functools.partial(
+        run_inception, output_tensor=INCEPTION_FINAL_POOL))
+
+
+def kernel_classifier_distance_and_std(real_images,
+                                       generated_images,
+                                       classifier_fn,
+                                       num_classifier_batches=1,
+                                       max_block_size=1024,
+                                       dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This is based on the Kernel Inception distance, but for an arbitrary
+  embedding. Also returns an estimate of the standard error of the distance
+  estimator.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  NOTE: This function consumes images, computes their activations, and then
+  computes the classifier distance. If you would like to precompute many
+  activations for real and generated images for large batches, or to compute
+  multiple scores based on the same images, please use
+  kernel_classifier_distance_and_std_from_activations(), which this method
+  also uses.
+
+  Args:
+    real_images: Real images to use to compute Kernel Inception distance.
+    generated_images: Generated images to use to compute Kernel Inception
+      distance.
+    classifier_fn: A function that takes images and produces activations based
+      on a classifier.
+    num_classifier_batches: Number of batches to split images into in order to
+      efficiently run them through the classifier network.
+    max_block_size: integer, default 1024. The distance estimator splits
+      samples into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
+      estimate. Having a smaller block size also gives a better estimate of the
+      standard error.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+     as the output of the activations.
+   An estimate of the standard error of the distance estimator (a scalar of
+     the same type).
+  """
+  real_images_list = array_ops.split(
+      real_images, num_or_size_splits=num_classifier_batches)
+  generated_images_list = array_ops.split(
+      generated_images, num_or_size_splits=num_classifier_batches)
+
+  real_imgs = array_ops.stack(real_images_list)
+  generated_imgs = array_ops.stack(generated_images_list)
+
+  # Compute the activations using the memory-efficient `map_fn`.
+  def compute_activations(elems):
+    return functional_ops.map_fn(
+        fn=classifier_fn,
+        elems=elems,
+        parallel_iterations=1,
+        back_prop=False,
+        swap_memory=True,
+        name='RunClassifier')
+
+  real_a = compute_activations(real_imgs)
+  gen_a = compute_activations(generated_imgs)
+
+  # Ensure the activations have the right shapes.
+  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
+  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
+
+  # Forward `dtype` so the documented argument takes effect.
+  return kernel_classifier_distance_and_std_from_activations(
+      real_a, gen_a, max_block_size=max_block_size, dtype=dtype)
+
+
+kernel_inception_distance_and_std = functools.partial(
+    kernel_classifier_distance_and_std,
+    classifier_fn=functools.partial(
+        run_inception, output_tensor=INCEPTION_FINAL_POOL))
+
+
+def kernel_classifier_distance_from_activations(real_activations,
+                                                generated_activations,
+                                                max_block_size=1024,
+                                                dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This method computes the kernel classifier distance from activations of
+  real images and generated images. This can be used independently of the
+  kernel_classifier_distance() method, especially in the case of using large
+  batches during evaluation where we would like to precompute all of the
+  activations before computing the classifier distance, or if we want to
+  compute multiple metrics based on the same images.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  Args:
+    real_activations: 2D Tensor containing activations of real data. Shape is
+      [batch_size, activation_size].
+    generated_activations: 2D Tensor containing activations of generated data.
+      Shape is [batch_size, activation_size].
+    max_block_size: integer, default 1024. The distance estimator splits
+      samples into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
+      estimate.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+   as the output of the activations.
+  """ +  return kernel_classifier_distance_and_std_from_activations( +      real_activations, generated_activations, max_block_size=max_block_size)[0] + + +def kernel_classifier_distance_and_std_from_activations(real_activations, +                                                        generated_activations, +                                                        max_block_size=1024, +                                                        dtype=None): +  """Kernel "classifier" distance for evaluating a generative model. + +  This methods computes the kernel classifier distance from activations of +  real images and generated images. This can be used independently of the +  kernel_classifier_distance() method, especially in the case of using large +  batches during evaluation where we would like to precompute all of the +  activations before computing the classifier distance, or if we want to +  compute multiple metrics based on the same images. It also returns a rough +  estimate of the standard error of the estimator. + +  This technique is described in detail in https://arxiv.org/abs/1801.01401. +  Given two distributions P and Q of activations, this function calculates + +      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] +        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] + +  where k is the polynomial kernel + +      k(x, y) = ( x^T y / dimension + 1 )^3. + +  This captures how different the distributions of real and generated images' +  visual features are. Like the Frechet distance (and unlike the Inception +  score), this is a true distance and incorporates information about the +  target images. Unlike the Frechet score, this function computes an +  *unbiased* and asymptotically normal estimator, which makes comparing +  estimates across models much more intuitive. + +  The estimator used takes time quadratic in max_block_size. Larger values of +  max_block_size will decrease the variance of the estimator but increase the +  computational cost. This differs slightly from the estimator used by the +  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. +  The estimate of the standard error will also be more reliable when there are +  more blocks, i.e. when max_block_size is smaller. + +  NOTE: the blocking code assumes that real_activations and +  generated_activations are both in random order. If either is sorted in a +  meaningful order, the estimator will behave poorly. + +  Args: +    real_activations: 2D Tensor containing activations of real data. Shape is +      [batch_size, activation_size]. +    generated_activations: 2D Tensor containing activations of generated data. +      Shape is [batch_size, activation_size]. +    max_block_size: integer, default 1024. The distance estimator splits samples +      into blocks for computational efficiency. Larger values are more +      computationally expensive but decrease the variance of the distance +      estimate. Having a smaller block size also gives a better estimate of the +      standard error. +    dtype: if not None, coerce activations to this dtype before computations. + +  Returns: +   The Kernel Inception Distance. A floating-point scalar of the same type +     as the output of the activations. +   An estimate of the standard error of the distance estimator (a scalar of +     the same type). 
+  """ + +  real_activations.shape.assert_has_rank(2) +  generated_activations.shape.assert_has_rank(2) +  real_activations.shape[1].assert_is_compatible_with( +      generated_activations.shape[1]) + +  if dtype is None: +    dtype = real_activations.dtype +    assert generated_activations.dtype == dtype +  else: +    real_activations = math_ops.cast(real_activations, dtype) +    generated_activations = math_ops.cast(generated_activations, dtype) + +  # Figure out how to split the activations into blocks of approximately +  # equal size, with none larger than max_block_size. +  n_r = array_ops.shape(real_activations)[0] +  n_g = array_ops.shape(generated_activations)[0] + +  n_bigger = math_ops.maximum(n_r, n_g) +  n_blocks = math_ops.to_int32(math_ops.ceil(n_bigger / max_block_size)) + +  v_r = n_r // n_blocks +  v_g = n_g // n_blocks + +  n_plusone_r = n_r - v_r * n_blocks +  n_plusone_g = n_g - v_g * n_blocks + +  sizes_r = array_ops.concat([ +      array_ops.fill([n_blocks - n_plusone_r], v_r), +      array_ops.fill([n_plusone_r], v_r + 1), +  ], 0) +  sizes_g = array_ops.concat([ +      array_ops.fill([n_blocks - n_plusone_g], v_g), +      array_ops.fill([n_plusone_g], v_g + 1), +  ], 0) + +  zero = array_ops.zeros([1], dtype=dtypes.int32) +  inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0) +  inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0) + +  dim = math_ops.cast(real_activations.shape[1], dtype) + +  def compute_kid_block(i): +    'Compute the ith block of the KID estimate.' +    r_s = inds_r[i] +    r_e = inds_r[i + 1] +    r = real_activations[r_s:r_e] +    m = math_ops.cast(r_e - r_s, dtype) + +    g_s = inds_g[i] +    g_e = inds_g[i + 1] +    g = generated_activations[g_s:g_e] +    n = math_ops.cast(g_e - g_s, dtype) + +    k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3 +    k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3 +    k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3 +    return (-2 * math_ops.reduce_mean(k_rg) + +            (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) + +            (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1))) + +  ests = functional_ops.map_fn( +      compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False) + +  mn = math_ops.reduce_mean(ests) + +  # nn_impl.moments doesn't use the Bessel correction, which we want here +  n_blocks_ = math_ops.cast(n_blocks, dtype) +  var = control_flow_ops.cond( +      math_ops.less_equal(n_blocks, 1), +      lambda: array_ops.constant(float('nan'), dtype=dtype), +      lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) / (n_blocks_ - 1)) + +  return mn, math_ops.sqrt(var / n_blocks_) @@ -13,6 +13,8 @@ import random  from sklearn.metrics import accuracy_score  from sklearn.model_selection import train_test_split +from classifier_metrics_impl import classifier_score_from_logits +  def import_mnist():    from tensorflow.examples.tutorials.mnist import input_data    mnist = input_data.read_data_sets("MNIST_data/", reshape=False) @@ -126,10 +128,12 @@ def train_classifier(x_train, y_train, x_val, y_val, batch_size=128, epochs=100,  def test_classifier(model, x_test, y_true):    x_test = np.pad(x_test, ((0,0),(2,2),(2,2),(0,0)), 'constant')    y_pred = model.predict(x_test) +  logits = tf.convert_to_tensor(y_pred, dtype=tf.float32) +  inception_score = tf.keras.backend.eval(classifier_score_from_logits(logits))    y_pred = np.argmax(y_pred, axis=1)    y_true = np.argmax(y_true, axis=1) 
   plot_example_errors(y_pred, y_true, x_test)
-  return accuracy_score(y_true, y_pred)
+  return accuracy_score(y_true, y_pred), inception_score


 def mix_data(X_train, y_train, X_validation, y_validation, train_gen, tr_labels_gen, val_gen, val_labels_gen, split=0):
@@ -162,4 +166,4 @@ if __name__ == '__main__':
   x_train, y_train, x_val, y_val, x_t, y_t = import_mnist()
   print(y_t.shape)
   model = train_classifier(x_train[:100], y_train[:100], x_val, y_val, epochs=3)
-  test_classifier(model, x_t, y_t)
+  print(test_classifier(model, x_t, y_t))
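
The score wired into test_classifier can be restated in plain numpy as a sanity check. One caveat with the change above: model.predict on a softmax-terminated Keras model returns probabilities rather than logits, in which case the value fed to classifier_score_from_logits only approximates the intended score. The sketch below is illustrative and not part of the commit:

    # Plain-numpy restatement of classifier_score_from_logits:
    # exp( E[ KL(p(y|x) || p(y)) ] ).
    import numpy as np

    def classifier_score_np(logits):
      logits = np.asarray(logits, dtype=np.float64)
      # Numerically stable log_softmax.
      shifted = logits - logits.max(axis=1, keepdims=True)
      log_p = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
      p = np.exp(log_p)
      q = p.mean(axis=0)                           # marginal class distribution p(y)
      kl = (p * (log_p - np.log(q))).sum(axis=1)   # KL(p(y|x) || p(y)) per example
      return np.exp(kl.mean())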
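For the distance functions the new file exposes, a minimal usage sketch (untested; assumes TensorFlow 1.x graph mode, matching the tf.contrib-era imports in classifier_metrics_impl.py, with random arrays standing in for real activations):

    import numpy as np
    import tensorflow as tf
    from classifier_metrics_impl import (
        frechet_classifier_distance_from_activations,
        kernel_classifier_distance_and_std_from_activations)

    # Stand-ins for real/generated activations, e.g. Inception pool_3 features.
    real_a = tf.constant(np.random.randn(256, 2048), dtype=tf.float32)
    gen_a = tf.constant(np.random.randn(256, 2048), dtype=tf.float32)

    fid = frechet_classifier_distance_from_activations(real_a, gen_a)
    kid, kid_std = kernel_classifier_distance_and_std_from_activations(
        real_a, gen_a, max_block_size=128)

    with tf.Session() as sess:
      print(sess.run([fid, kid, kid_std]))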
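Finally, the eigenvalue identity that trace_sqrt_product relies on, trace(sqrt(sigma sigma_v)) = trace(sqrt(A sigma_v A)) with A = sqrt(sigma), can be checked numerically. A small sketch, assuming scipy is available (again not part of the commit):

    import numpy as np
    from scipy.linalg import sqrtm

    rng = np.random.RandomState(0)
    sigma = np.cov(rng.randn(50, 8), rowvar=False)    # symmetric PSD
    sigma_v = np.cov(rng.randn(60, 8), rowvar=False)  # symmetric PSD

    a = sqrtm(sigma)  # the "A" in the docstring proof
    lhs = np.trace(sqrtm(sigma.dot(sigma_v))).real  # sqrtm may carry tiny imaginary noise
    rhs = np.trace(sqrtm(a.dot(sigma_v).dot(a))).real
    assert np.allclose(lhs, rhs)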
