Variational Autoencoder

Don't start by trying to understand the VAE from the network structure. Start from the computation:

$$p(z \vert X) = \frac{p(X \vert z)p(z)}{p(X)} = \frac{p(X \vert z)p(z)}{\int P_{xz}(X \vert z;\theta)\, P_z(z)\, dz}$$

Use the KL divergence to measure the information loss between the two distributions:

$$KL(q_\lambda(z \vert x) \,\vert\vert\, p(z \vert x)) = \mathbf{E}_q[\log q_\lambda(z \vert x)] - \mathbf{E}_q[\log p(x, z)] + \log p(x)$$

Rearranging slightly:

$$\log p(x) - KL(q_\lambda(z \vert x) \,\vert\vert\, p(z \vert x)) = \mathbf{E}_q[\log p(x, z)] - \mathbf{E}_q[\log q_\lambda(z \vert x)] = \mathbf{E}_q[\log p(x \vert z)] - KL(q_\lambda(z\vert x) \,\vert\vert\, p(z)) = ELBO(\lambda)$$

The right-hand side is what we optimize:

  • maximize the expected log-likelihood in the first term on the right -- this is the decoder

  • minimize the KL divergence in the second term on the right -- this is the encoder

    This is how it connects to the AutoEncoder.

Note:

  • The decoder $p(X \vert z)$ does not measure the error between $x$ and $\hat x$; it maximizes the likelihood $\log p(x, z) = \log p(x \vert z) p(z)$.

  • The encoder outputs the mean and variance of a normal distribution, but applies the reparameterization trick; see below.

Variational Auto-Encoder (VAE)

Optimization objective

From the neural-network perspective, the loss for each sample $x_i$ is:

$$l_i(\theta, \phi) = - E_{z\sim q_\theta(z\vert x_i)}[\log p_\phi(x_i\vert z)] + KL(q_\theta(z\vert x_i) \,\vert\vert\, p(z))$$

The first term on the right is the reconstruction loss for $x$, and the second term is a regularizer.

From the probabilistic perspective: $p(x, z) = p(x \vert z) p(z)$ factors the joint probability into the likelihood and the prior. Inference requires $p(z \vert x) = \frac{p(x \vert z)p(z)}{p(x)}$, whose denominator $p(x)$ (the evidence) is an intractable integral, so we turn to variational inference and approximate $p(z \vert x)$ with $q_\lambda(z \vert x)$. Use the KL divergence to measure the information loss between the two distributions: $KL(q_\lambda(z \vert x) \,\vert\vert\, p(z \vert x)) = \mathbf{E}_q[\log q_\lambda(z \vert x)] - \mathbf{E}_q[\log p(x, z)] + \log p(x)$. Setting aside the $\log p(x)$ term, define $ELBO(\lambda) = \mathbf{E}_q[\log p(x, z)] - \mathbf{E}_q[\log q_\lambda(z \vert x)]$, which gives $\log p(x) = ELBO(\lambda) + KL(q_\lambda(z \vert x) \,\vert\vert\, p(z \vert x))$. By Jensen's inequality the KL divergence is always greater than or equal to zero, so minimizing the KL divergence is equivalent to maximizing the ELBO.

For a single sample, $ELBO_i(\lambda) = E_{q_\lambda(z\vert x_i)}[\log p(x_i\vert z)] - KL(q_\lambda(z\vert x_i) \,\vert\vert\, p(z))$, which is then optimized with stochastic gradient descent. Rewriting it slightly to expose the inference and generative network parameters, $ELBO_i(\theta, \phi) = E_{q_\theta(z\vert x_i)}[\log p_\phi(x_i\vert z)] - KL(q_\theta(z\vert x_i) \,\vert\vert\, p(z))$, and at this point $ELBO_i(\theta, \phi) = -l_i(\theta, \phi)$.
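
To make the pieces concrete, here is a minimal NumPy sketch of the per-sample loss $l_i(\theta, \phi)$, assuming a diagonal-Gaussian encoder, a standard-normal prior, and a Bernoulli decoder; `decode` is a hypothetical function standing in for the generative network, and the expectation is estimated with a single Monte Carlo sample:

import numpy as np

def sample_loss(x_i, mu, log_var, decode):
    rng = np.random.default_rng()

    # Reparameterized sample z ~ q(z|x_i): z = mu + sigma * eps, eps ~ N(0, I)
    eps = rng.standard_normal(mu.shape)
    z = mu + np.exp(0.5 * log_var) * eps

    # Reconstruction term: -E_q[log p(x_i|z)], one-sample Monte Carlo estimate,
    # with the (hypothetical) decoder returning Bernoulli means in (0, 1)
    p = np.clip(decode(z), 1e-7, 1 - 1e-7)
    recon = -np.sum(x_i * np.log(p) + (1 - x_i) * np.log(1 - p))

    # KL(q(z|x_i) || p(z)) in closed form for a diagonal Gaussian and N(0, I) prior
    kl = -0.5 * np.sum(1 + log_var - mu**2 - np.exp(log_var))

    return recon + kl  # = l_i(theta, phi) = -ELBO_i(theta, phi)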

Appendix: the KL divergence between two multivariate normal distributions:

$$KL(P_1 \,\vert\vert\, P_2) = \frac {1}{2} \left[\log \frac {\det(\Sigma_2)}{\det(\Sigma_1)} - d + tr(\Sigma_2^{-1} \Sigma_1) + (\mu_2-\mu_1)^T \Sigma_2^{-1}(\mu_2-\mu_1)\right] \\ KL(P_1 \,\vert\vert\, N(0, I)) = \frac {1}{2} \left[-\log \det(\Sigma_1) - d + tr(\Sigma_1) + \mu_1^T \mu_1\right]$$
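
As a quick sanity check (not part of the original derivation), the diagonal-Gaussian special case can be verified numerically by comparing the closed form with a Monte Carlo estimate; the mean and log variance below are arbitrary illustrative values:

import numpy as np

rng = np.random.default_rng(0)
mu = np.array([0.5, -1.0])            # illustrative mean of q
log_var = np.array([0.2, -0.3])       # illustrative log variance of q
sigma2 = np.exp(log_var)

# Closed form: 0.5 * [tr(Sigma_1) + mu_1^T mu_1 - d - log det(Sigma_1)]
kl_closed = 0.5 * np.sum(sigma2 + mu**2 - 1.0 - log_var)

# Monte Carlo estimate of E_q[log q(z) - log p(z)] with z ~ N(mu, diag(sigma2))
z = mu + np.sqrt(sigma2) * rng.standard_normal((100000, 2))
log_q = -0.5 * np.sum((z - mu)**2 / sigma2 + np.log(2 * np.pi * sigma2), axis=1)
log_p = -0.5 * np.sum(z**2 + np.log(2 * np.pi), axis=1)
kl_mc = np.mean(log_q - log_p)

print(kl_closed, kl_mc)               # the two values should agree to ~2 decimals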

The reparameterization trick

The encoder outputs the parameters of the normal distribution $P(Z \vert X)$: its mean and standard deviation (the main diagonal of the covariance matrix, so a single vector suffices). A latent sample is then obtained as $z = \mu + \sigma \odot \epsilon$, where $\epsilon \sim N(0, I)$.
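
A minimal NumPy sketch of this sampling step, with illustrative values standing in for the encoder's outputs (the same quantities the encoder network produces in the code below):

import numpy as np

rng = np.random.default_rng()
mu = np.array([0.3, -0.7])           # encoder output: mean of q(z|x)
log_var = np.array([0.1, -0.2])      # encoder output: log variance of q(z|x)
sigma = np.exp(0.5 * log_var)        # standard deviation

eps = rng.standard_normal(mu.shape)  # noise from N(0, I), independent of the encoder
z = mu + sigma * eps                 # z ~ N(mu, diag(sigma^2))

# Because z is a deterministic function of (mu, sigma) once eps is fixed,
# dz/dmu = 1 and dz/dsigma = eps, so gradients can flow back into the encoder.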

The figure is from VAE(4)——实现; for more detailed diagrams see Caffe code to accompany my Tutorial on Variational Autoencoders.

TFlearn autoencoder

vae.py

from __future__ import (
    division,
    print_function,
    absolute_import
)
from six.moves import range

import tensorflow as tf
import tflearn

def encode(incoming, intermediate_dim=None, latent_dim=None):
    with tf.variable_op_scope([incoming], 'Encoder') as scope:
        name = scope.name

        net = tflearn.fully_connected(incoming, intermediate_dim)
        net = tflearn.batch_normalization(net)
        net = tflearn.activation(net, activation='relu')
        net = tflearn.fully_connected(net, intermediate_dim)
        net = tflearn.batch_normalization(net)
        net = tflearn.activation(net, activation='relu')
        net = tflearn.fully_connected(net, intermediate_dim)
        net = tflearn.batch_normalization(net)
        h = tflearn.activation(net, activation='relu', name='H')

        mean = tflearn.fully_connected(h, latent_dim, name='Mean')
        log_var = tflearn.fully_connected(h, latent_dim, name='LogVariance')
        std = tf.exp(0.5 * log_var, name='StandardDeviation')
        epsilon = tf.random_normal(tf.shape(log_var), name='Epsilon')
        z = tf.add(mean, tf.mul(std, epsilon), name='SampleLatentVariable')
        # the reparameterization trick: scale standard-normal noise by the
        # standard deviation and add it to the mean

    tf.add_to_collection(tf.GraphKeys.LAYER_TENSOR + '/' + name, z)

    return z, mean, log_var

def decode(incoming, intermediate_dim=None, original_shape=None):
    with tf.variable_op_scope([incoming], 'Decoder') as scope:
        name = scope.name

        net = tflearn.fully_connected(incoming, intermediate_dim)
        net = tflearn.batch_normalization(net)
        net = tflearn.activation(net, activation='relu')
        net = tflearn.fully_connected(net, intermediate_dim)
        net = tflearn.batch_normalization(net)
        net = tflearn.activation(net, activation='relu')
        net = tflearn.fully_connected(net, intermediate_dim)
        net = tflearn.batch_normalization(net)
        h = tflearn.activation(net, activation='relu', name='H')

        mean = tflearn.fully_connected(h, original_shape[0], activation='sigmoid',
                                       name='Mean')

    tf.add_to_collection(tf.GraphKeys.LAYER_TENSOR + '/' + name, mean)

    return mean

encoder.py

from __future__ import (
    division,
    print_function,
    absolute_import
)
from six.moves import range

import tensorflow as tf
import tflearn
import vae
from tflearn.datasets import mnist

import numpy as np
from skimage import io

batch_size = 128
latent_dim = 2
intermediate_dim = 512

X, Y, testX, testY = mnist.load_data()

original_shape = X.shape[1:]
original_shape = [original_shape[i] for i in range(len(original_shape))]
input_shape = [None] + original_shape

x = tflearn.input_data(shape=input_shape)
z, z_mean, z_log_var = vae.encode(x, intermediate_dim=intermediate_dim,
                                  latent_dim=latent_dim)
x_decoded_mean = vae.decode(z, intermediate_dim=intermediate_dim,
                            original_shape=original_shape)

def vae_loss(y_pred, y_true):
    with tf.variable_op_scope([y_pred, y_true], 'Loss') as scope:
        name = scope.name

        # Reconstruction term: cross-entropy between the input and the decoder output
        binary_cross_entropy_loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(y_pred, y_true),
            reduction_indices=1)
        # Closed-form KL(q(z|x) || N(0, I)) for a diagonal Gaussian (see the formula above)
        kullback_leibler_divergence_loss = -0.5 * tf.reduce_sum(
            1 + z_log_var - tf.pow(z_mean, 2) - tf.exp(z_log_var),
            reduction_indices=1)
        loss = tf.reduce_mean(binary_cross_entropy_loss
                              + kullback_leibler_divergence_loss)

    tf.add_to_collection(tf.GraphKeys.LAYER_TENSOR + '/' + name, loss)

    return loss

# keep the imported `vae` module name free; build the trainer under a different name
net = tflearn.regression(x_decoded_mean, optimizer='adam', loss=vae_loss,
                         metric=None)
model = tflearn.DNN(net, tensorboard_verbose=0,
                    checkpoint_path='model_variational_autoencoder',
                    max_checkpoints=10)
model.fit(X, X, n_epoch=100, batch_size=batch_size,
          run_id='variational_auto_encoder')

generator.py

from __future__ import (
    division,
    print_function,
    absolute_import
)
from six.moves import range

import tensorflow as tf
import tflearn
from tflearn.datasets import mnist
import vae

import numpy as np
from skimage import io

original_dim = 784
latent_dim = 2
intermediate_dim = 512
model_file = 'model_variational_autoencoder-43000'

X, Y, testX, testY = mnist.load_data()

original_shape = X.shape[1:]
original_shape = [original_shape[i] for i in range(len(original_shape))]

with tf.Graph().as_default():
    input_shape = [None] + original_shape
    x = tflearn.input_data(shape=input_shape)
    z, mean, logvar = vae.encode(x, intermediate_dim=intermediate_dim,
                                 latent_dim=latent_dim)
    encoder = tflearn.DNN(z)
    optargs = {'scope_for_restore': 'Encoder'}
    encoder.load(model_file, **optargs)  # restore only variables under the Encoder scope
    mean_encoder = tflearn.DNN(mean)
    mean_encoder.load(model_file, **optargs)
    logvar_encoder = tflearn.DNN(logvar)
    logvar_encoder.load(model_file, **optargs)

with tf.Graph().as_default():
    # build a digit generator that can sample from the learned distribution
    decoder_input = tflearn.input_data(shape=[None, latent_dim])
    gen_decoded_mean = vae.decode(decoder_input, intermediate_dim=intermediate_dim,
                                  original_shape=original_shape)
    generator = tflearn.DNN(gen_decoded_mean)
    generator.load(model_file, scope_for_restore='Decoder')

digit_size = 28
n = 15
linspace = 1000
figure = np.zeros((digit_size * n, digit_size * n))
grid_x = np.linspace(-linspace, linspace, n)
grid_y = np.linspace(-linspace, linspace, n)

for i, yi in enumerate(grid_x):
    for j, xi in enumerate(grid_y):
        z_sample = np.array([[xi, yi] + [0 for k in range(2, latent_dim)]])
        x_decoded = generator.predict(z_sample)
        digit = np.reshape(x_decoded[0], [digit_size, digit_size])
        figure[i * digit_size : (i + 1) * digit_size,
               j * digit_size : (j + 1) * digit_size] = digit
figure *= 255
figure = figure.astype(np.uint8)

io.imsave('vae_z.png', figure)

figure = np.ndarray(shape=(digit_size * (n), digit_size * (n)),
                    dtype=np.float16)
testX = tflearn.data_utils.shuffle(X)[0][0:1]
testMean = mean_encoder.predict(testX)[0]
testLogVar = logvar_encoder.predict(testX)[0]
std = [np.exp(0.5 * testLogVar[i]) * 4 for i in range(2)]
grid_x = np.linspace(-std[0], std[0], n) + testMean[0]
grid_y = np.linspace(-std[1], std[1], n) + testMean[1]
for i, yi in enumerate(grid_x):
    for j, xi in enumerate(grid_y):
        z_sample = np.array([[xi, yi] + [testMean[k] for k in range(2, latent_dim)]])
        x_decoded = generator.predict(z_sample)
        digit = np.reshape(x_decoded[0], [digit_size, digit_size])
        figure[i * digit_size : (i + 1) * digit_size,
               j * digit_size : (j + 1) * digit_size] = digit
figure *= 255
figure = figure.astype(np.uint8)

io.imsave('vae_std.png', figure)

Further reading

从变分编码、信息瓶颈到正态分布:论遗忘的重要性

变分推断 variational inference

The Unreasonable Confusion of Variational Autoencoders

Tutorial on Variational Autoencoders

VAE(3)——公式与实现

变分自编码器(Variational Autoencoder, VAE)通俗教程

https://yq.aliyun.com/articles/68410 没有任何公式——直观的理解变分自动编码器VAE

VAE和Adam发明人博士论文:变分推理和深度学习

再谈变分自编码器VAE:从贝叶斯观点出发
