1. Introduction

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

1.

$$ \begin{align*} y(x_n, \pmb{w}) &= \sum_{j=0}^M w_j x_n^j \\ E(\pmb{w}) &= \frac{1}{2}\sum_{n=1}^N(y(x_n, \pmb{w})-t_n)^2 \\ &= \frac{1}{2}\sum_{n=1}^N\left(\sum_{j=0}^M w_j x_n^j-t_n\right)^2 \\ \end{align*} $$

Differentiating with respect to $w_i$, $$ \begin{align*} \frac{\mathrm{d}E(\pmb{w})}{\mathrm{d}w_i} &= \sum_{n=1}^N\left(\sum_{j=0}^M w_j x_n^j-t_n\right)x_n^i \\ &= \sum_{n=1}^N\sum_{j=0}^M w_j x_n^{i+j}-\sum_{n=1}^N t_n x_n^i \\ &= \sum_{j=0}^M\left(\sum_{n=1}^N x_n^{i+j}\right) w_j-\sum_{n=1}^N t_n x_n^i \\ \end{align*} $$ Setting this derivative to 0 we get, $$ \begin{align*} \sum_{j=0}^M\left(\sum_{n=1}^N x_n^{i+j}\right) w_j-\sum_{n=1}^N t_n x_n^i &= 0 \\ \implies \sum_{j=0}^M\left(\sum_{n=1}^N x_n^{i+j}\right) w_j &= \sum_{n=1}^N t_n x_n^i \\ \implies \sum_{j=0}^M A_{ij} w_j &= T_i\\ \end{align*} $$ where $$ \begin{align*} A_{ij} = \sum_{n=1}^N (x_n)^{i+j} & & T_i = \sum_{n=1}^N (x_n)^i t_n \end{align*} $$ This is a linear equation for each $i \in \{0, 1, \dots, M\}$; together they form a system of $M+1$ linear equations in the $M+1$ unknowns $w_0, \dots, w_M$, which can be written in matrix form as $$ \begin{align*} A\pmb{w} = T \end{align*} $$
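
As a quick numerical sketch of this result, the cell below builds $A$ and $T$ for a small synthetic data set and solves $A\pmb{w} = T$ directly. The data, the order $M = 3$, and the helper name `fit_polynomial` are illustrative choices, not part of the exercise.
In [ ]:
def fit_polynomial(x, t, M):
    # A_ij = sum_n x_n^(i+j),  T_i = sum_n t_n x_n^i
    idx = np.arange(M + 1)
    A = np.array([[np.sum(x ** (i + j)) for j in idx] for i in idx])
    T = np.array([np.sum(t * x ** i) for i in idx])
    return np.linalg.solve(A, T)

# illustrative data: noisy samples of sin(2*pi*x)
rng = np.random.default_rng(0)
x = np.linspace(0, 1, 10)
t = np.sin(2 * np.pi * x) + 0.1 * rng.standard_normal(x.size)
w = fit_polynomial(x, t, M=3)
print(w)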

2.

$$ \begin{align*} y(x_n, \pmb{w}) &= \sum_{j=0}^M w_j x_n^j \\ E(\pmb{w}) &= \frac{1}{2}\sum_{n=1}^N(y(x_n, \pmb{w})-t_n)^2 + \frac{\lambda}{2}\|\pmb{w}\|^2\\ &= \frac{1}{2}\sum_{n=1}^N\left(\sum_{j=0}^M w_j x_n^j-t_n\right)^2 + \frac{\lambda}{2}\sum_{j=0}^M w_j^2\\ \end{align*} $$

Differentiating with respect to $w_i$, $$ \begin{align*} \frac{\mathrm{d}E(\pmb{w})}{\mathrm{d}w_i} &= \sum_{n=1}^N\left(\sum_{j=0}^M w_j x_n^j-t_n\right)x_n^i + \lambda w_i\\ &= \sum_{n=1}^N\sum_{j=0}^M w_j x_n^{i+j}-\sum_{n=1}^N t_n x_n^i + \lambda w_i \\ &= \sum_{j=0}^M\left(\lambda\delta_{ij} + \sum_{n=1}^N x_n^{i+j}\right) w_j-\sum_{n=1}^N t_n x_n^i \\ \end{align*} $$ Setting this derivative to 0 we get, $$ \begin{align*} \sum_{j=0}^M\left(\lambda\delta_{ij} + \sum_{n=1}^N x_n^{i+j}\right) w_j-\sum_{n=1}^N t_n x_n^i &= 0 \\ \implies \sum_{j=0}^M\left(\lambda\delta_{ij} + \sum_{n=1}^N x_n^{i+j}\right) w_j &= \sum_{n=1}^N t_n x_n^i \\ \implies \sum_{j=0}^M (\lambda\delta_{ij} + A_{ij}) w_j &= T_i\\ \end{align*} $$ where $$ \begin{align*} A_{ij} = \sum_{n=1}^N (x_n)^{i+j} & & T_i = \sum_{n=1}^N (x_n)^i t_n \end{align*} $$ This is a linear equation for each $i \in \{0, 1, \dots, M\}$; together they form a system of $M+1$ linear equations, which can be written in matrix form as $$ \begin{align*} (A + \lambda I)\pmb{w} = T \end{align*} $$ where $I$ is the $(M+1)\times(M+1)$ identity matrix.
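
The same sketch carries over to the regularized case by adding $\lambda$ to the diagonal of $A$; the value of $\lambda$ below is an arbitrary illustrative choice, and the cell reuses `x` and `t` from the previous cell.
In [ ]:
def fit_polynomial_regularized(x, t, M, lam):
    idx = np.arange(M + 1)
    A = np.array([[np.sum(x ** (i + j)) for j in idx] for i in idx])
    T = np.array([np.sum(t * x ** i) for i in idx])
    # regularization simply adds lambda to the diagonal of A
    return np.linalg.solve(A + lam * np.eye(M + 1), T)

w_reg = fit_polynomial_regularized(x, t, M=3, lam=1e-3)
print(w_reg)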

3.

$$ \begin{align*} p(a) &= p(a, r) + p(a, b) + p(a, g) \\ &= p(r)p(a | r) + p(b)p(a | b) + p(g)p(a | g) \\ &= 0.2 \times 3/10 + 0.2 \times 1/2 + 0.6 \times 3/10 \\ &= 0.34 \\ p(g|o) &= \frac{p(g)p(o|g)}{p(o)} \\ &= \frac{p(g)p(o|g)}{p(r)p(o|r) + p(b)p(o|b) + p(g)p(o|g)} \\ &= \frac{0.6 \times 3/10}{0.2 \times 4/10 + 0.2 \times 1/2 + 0.6 \times 3/10} \\ &= 0.5 \end{align*} $$
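
A one-cell arithmetic check of the two values above, using the same box probabilities and fruit fractions assumed in the working.
In [ ]:
p_box = {"r": 0.2, "b": 0.2, "g": 0.6}
p_apple = {"r": 3 / 10, "b": 1 / 2, "g": 3 / 10}    # p(a | box)
p_orange = {"r": 4 / 10, "b": 1 / 2, "g": 3 / 10}   # p(o | box)

p_a = sum(p_box[k] * p_apple[k] for k in p_box)
p_o = sum(p_box[k] * p_orange[k] for k in p_box)
print(p_a, p_box["g"] * p_orange["g"] / p_o)   # 0.34, 0.5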

4.

$$ \begin{align*} x &= g(y) \\ \implies p_y(y) &= p_x(g(y))|g'(y)| \\ \end{align*} $$

Differentiating with respect to $y$, $$ \begin{align*} p_y'(y) = p_x'(g(y))g'(y)|g'(y)| + p_x(g(y))g''(y)\frac{|g'(y)|}{g'(y)} \end{align*} $$ At the location $\hat{y}$ of the maximum of the density in $y$, the above derivative is 0: $$ \begin{align*} p_y'(\hat{y}) = p_x'(g(\hat{y}))g'(\hat{y})|g'(\hat{y})| + p_x(g(\hat{y}))g''(\hat{y})\frac{|g'(\hat{y})|}{g'(\hat{y})} &= 0 \\ \implies p_x'(g(\hat{y}))g'(\hat{y}) + p_x(g(\hat{y}))\frac{g''(\hat{y})}{g'(\hat{y})} &= 0 & (\text{dividing by } |g'(\hat{y})| \neq 0) \\ \end{align*} $$ If the location $\hat{x}$ of the maximum of the density in $x$ were such that $\hat{x} = g(\hat{y})$, then the above equation would reduce to $$ \begin{align*} p_x'(\hat{x})g'(\hat{y}) + p_x(\hat{x})\frac{g''(\hat{y})}{g'(\hat{y})} &= 0 \\ \implies p_x(\hat{x})\frac{g''(\hat{y})}{g'(\hat{y})} &= 0 & (\because p_x'(\hat{x}) = 0) \end{align*} $$ But the last equation does not hold in general (for example, take $y \sim \mathcal{N}(0, 1)$ and $x = g(y) = e^y$: the resulting log-normal density in $x$ has its mode at $e^{-1}$, whereas $g(\hat{y}) = e^0 = 1$).

If the transformation is linear, i.e., $x = ay, a \neq 0$, we have, $$ \begin{align*} p_y(y) &= p_x(ay)|a| \\ \end{align*} $$ Differentiating with respect to $y$, $$ \begin{align*} p_y'(y) &= p_x'(ay)a|a| \\ \end{align*} $$ At $\hat{y}$, $$ \begin{align*} 0 = p_y'(\hat{y}) = p_x'(a\hat{y})a|a| \\ \implies p_x'(a\hat{y}) = 0 \implies \hat{x} = a\hat{y} = g(\hat{y}) \end{align*} $$ Note that this is trivially true even for the case $a=0$.

5.

$$ \begin{align*} \mathrm{var}[f(x)] &= \mathbb{E}[(f(x) - \mathbb{E}[f(x)])^2] \\ &= \mathbb{E}[f(x)^2 - 2f(x)\mathbb{E}[f(x)] + \mathbb{E}[f(x)]^2] \\ &= \mathbb{E}[f(x)^2] - 2\mathbb{E}[f(x)\mathbb{E}[f(x)]] + \mathbb{E}[\mathbb{E}[f(x)]^2] \\ &= \mathbb{E}[f(x)^2] - 2\mathbb{E}[f(x)]^2 + \mathbb{E}[f(x)]^2 \\ &= \mathbb{E}[f(x)^2] - \mathbb{E}[f(x)]^2 \\ \end{align*} $$
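
A tiny numerical sanity check of the identity; the choice of $f(x) = x^2$ with standard normal samples is arbitrary.
In [ ]:
rng = np.random.default_rng(0)
x = rng.standard_normal(200_000)
f = x ** 2
# var[f] agrees with E[f^2] - E[f]^2
print(np.var(f), np.mean(f ** 2) - np.mean(f) ** 2)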

6.

$$ \begin{align*} \mathbb{E}_{x,y}[xy] = \int xy p(x, y) \mathrm{d}x \mathrm{d}y \end{align*} $$

If $x, y$ are independent, $p(x, y) = p(x)p(y)$. So, the above expectation becomes $$ \begin{align*} \mathbb{E}_{x,y}[xy] &= \int xy p(x)p(y) \mathrm{d}x \mathrm{d}y \\ &= \int x p(x) \mathrm{d}x \int yp(y) \mathrm{d}y \\ &= \mathbb{E}[x]\mathbb{E}[y] \end{align*} $$
So, when $x, y$ are independent, $\mathrm{cov}[x,y] = \mathbb{E}_{x,y}[xy] - \mathbb{E}[x]\mathbb{E}[y] = \mathbb{E}[x]\mathbb{E}[y] - \mathbb{E}[x]\mathbb{E}[y] = 0$.

The same argument shows that $\mathbb{E}_{x,y}[xy] = \mathbb{E}[x]\mathbb{E}[y]$ when $x, y$ are independent discrete random variables, with the integrals replaced by sums over the joint probability mass function.
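
A quick Monte Carlo check: for independently generated samples the sample covariance should be close to (though not exactly) zero. The distributions and sample size are arbitrary.
In [ ]:
rng = np.random.default_rng(0)
x = rng.standard_normal(200_000)
y = rng.uniform(size=200_000)   # generated independently of x
cov_xy = np.mean(x * y) - np.mean(x) * np.mean(y)
print(cov_xy)   # close to 0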

7.

$$ \begin{align*} I &= \int_{-\infty}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}x^2\right)}\mathrm{d}x \\ I^2 &= \int_{-\infty}^{\infty} \int_{-\infty}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}x^2 - \frac{1}{2\sigma^2}y^2\right)} \, \mathrm{d}x \, \mathrm{d}y \\ \end{align*} $$

Transforming to polar coordinates, $$ \begin{align*} I^2 &= \int_{0}^{2\pi} \int_{0}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}r^2\right)} \, r\mathrm{d}r \, \mathrm{d}\theta \\ &= \int_{0}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}r^2\right)} \, r\mathrm{d}r \int_{0}^{2\pi} \, \mathrm{d}\theta \\ &= 2\pi\int_{0}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}r^2\right)} \, r\mathrm{d}r \\ \end{align*} $$ Substituting $u = r^2$ we get, $$ \begin{align*} I^2 &= 2\pi\int_{0}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}u\right)} \, \frac{1}{2} \mathrm{d}u \\ &= \pi\int_{0}^{\infty} \exp{\left(-\frac{1}{2\sigma^2}u\right)} \, \mathrm{d}u \\ &= 2\pi\sigma^2 \\ \implies I &= (2\pi\sigma^2)^{1/2} \end{align*} $$ Let's now verify the normalization condition for the Gaussian distribution, $$ \begin{align*} \int_{-\infty}^{\infty}\mathcal{N}(x|\mu,\sigma^2) \, \mathrm{d}x &= \int_{-\infty}^{\infty}\frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)}\, \mathrm{d}x \end{align*} $$ Substituting $t = x - \mu$, $$ \begin{align*} \int_{-\infty}^{\infty}\mathcal{N}(x|\mu,\sigma^2) \, \mathrm{d}x &= \int_{-\infty}^{\infty}\frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)}\, \mathrm{d}t \\ &= \frac{1}{(2\pi\sigma^2)^{1/2}} \int_{-\infty}^{\infty}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)}\, \mathrm{d}t \\ &= \frac{1}{(2\pi\sigma^2)^{1/2}} I = 1 \end{align*} $$
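
A numerical check of $I = (2\pi\sigma^2)^{1/2}$ by a simple Riemann sum; the value of $\sigma$ and the integration grid below are arbitrary.
In [ ]:
sigma = 1.5
x = np.linspace(-20, 20, 400_001)
dx = x[1] - x[0]
# Riemann-sum approximation of the Gaussian integral I
I_numeric = np.sum(np.exp(-x ** 2 / (2 * sigma ** 2))) * dx
print(I_numeric, np.sqrt(2 * np.pi * sigma ** 2))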

8.

$x \sim \mathcal{N}(\mu, \sigma^2)$ $$ \begin{align*} \mathbb{E}[x] &= \int_{-\infty}^{\infty} \mathcal{N}(x|\mu, \sigma^2) x \, \mathrm{d}x \\ &= \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)} x \, \mathrm{d}x \\ \end{align*} $$ Substituting $t = x-\mu$, $$ \begin{align*} \mathbb{E}[x] &= \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)} (t+\mu) \, \mathrm{d}t \\ &= \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)}t \, \mathrm{d}t + \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)} \mu \, \mathrm{d}t \\ &= \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)}t \, \mathrm{d}t + \mu \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}t^2\right)} \, \mathrm{d}t \end{align*} $$ The first integral is 0 because the integrand is an odd function which is integrated over the entire real line. The second integral is 1 due to the normalization condition of the Gaussian distribution. So $\mathbb{E}[x] = \mu$.

To calculate the second moment we differentiate the normalization condition with respect to $\sigma^2$, $$ \begin{align*} \frac{\mathrm{d}}{\mathrm{d}\sigma^2}\int_{-\infty}^{\infty} \mathcal{N}(x|\mu, \sigma^2) \, \mathrm{d}x &= \frac{\mathrm{d}}{\mathrm{d}\sigma^2}1 = 0 \\ \implies 0 &= \frac{\mathrm{d}}{\mathrm{d}\sigma^2}\int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)} \, \mathrm{d}x \\ &= \int_{-\infty}^{\infty} \frac{\mathrm{d}}{\mathrm{d}\sigma^2} \left(\frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)}\right) \, \mathrm{d}x \\ &= \int_{-\infty}^{\infty} \frac{-1}{2(2\pi)^{1/2}(\sigma^2)^{3/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)} \, \mathrm{d}x \\& \ \ \ + \int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)}\left(\frac{1}{2(\sigma^2)^2}(x-\mu)^2\right) \, \mathrm{d}x\\ &= \frac{-1}{2\sigma^2}\int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)} \, \mathrm{d}x \\& \ \ \ + \frac{1}{2(\sigma^2)^2}\int_{-\infty}^{\infty} \frac{1}{(2\pi\sigma^2)^{1/2}}\exp{\left(\frac{-1}{2\sigma^2}(x-\mu)^2\right)}(x-\mu)^2 \, \mathrm{d}x\\ &= \frac{-1}{2\sigma^2} + \frac{1}{2(\sigma^2)^2}\mathbb{E}[(x-\mu)^2] \\ \implies \frac{1}{2(\sigma^2)^2}\mathbb{E}[(x-\mu)^2] &= \frac{1}{2\sigma^2} \\ \end{align*} $$ Rearranging the above expression we get, $$ \begin{align*} \mathbb{E}[(x-\mu)^2] &= \sigma^2 \\ \implies \mathbb{E}[x^2-\mu^2] &= \sigma^2 \\ \implies \mathbb{E}[x^2] &= \mu^2 + \sigma^2 \end{align*} $$ So, $\mathrm{var}[x] = \mathbb{E}[x^2] - \mathbb{E}[x]^2 = \mu^2 + \sigma^2 - \mu^2 = \sigma^2$
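
A Monte Carlo check of both moments, $\mathbb{E}[x] = \mu$ and $\mathbb{E}[x^2] = \mu^2 + \sigma^2$; the parameter values and sample size are arbitrary.
In [ ]:
mu, sigma = 2.0, 0.5
rng = np.random.default_rng(0)
x = rng.normal(mu, sigma, size=1_000_000)
print(np.mean(x), mu)                          # E[x] vs mu
print(np.mean(x ** 2), mu ** 2 + sigma ** 2)   # E[x^2] vs mu^2 + sigma^2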

11.

$$ \begin{align*} \ln p(\pmb{x}|\mu, \sigma^2) = -\frac{1}{2\sigma^2}\sum_{n=1}^{N}(x_n-\mu)^2-\frac{N}{2}\ln \sigma^2 - \frac{N}{2}\ln (2\pi) \end{align*} $$

Differentiating the log-likelihood with respect to $\mu$, $$ \begin{align*} \frac{\mathrm{d}\ln p(\pmb{x}|\mu, \sigma^2)}{\mathrm{d}\mu} = \frac{1}{2\sigma^2}\sum_{n=1}^{N}2(x_n-\mu) \end{align*} $$ Setting it to 0, we get: $$ \begin{align*} \sum_{n=1}^{N}(x_n-\mu_{ML}) &= 0 \\ \implies \mu_{ML} &= \frac{1}{N}\sum_{n=1}^{N}x_n \end{align*} $$ Differentiating the log-likelihood with respect to $\sigma^2$ (with $\mu$ set to its optimal value $\mu_{ML}$), $$ \begin{align*} \frac{\mathrm{d}\ln p(\pmb{x}|\mu, \sigma^2)}{\mathrm{d}\sigma^2} = \frac{1}{2\sigma^4}\sum_{n=1}^{N}(x_n-\mu_{ML})^2-\frac{N}{2\sigma^2} \end{align*} $$ Setting it to 0, we get: $$ \begin{align*} \sigma_{ML}^2 = \frac{1}{N}\sum_{n=1}^N(x_n-\mu_{ML})^2 \end{align*} $$
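
So the maximum likelihood estimates are just the sample mean and the biased (1/N) sample variance. A minimal sketch on simulated data, with $\mu$, $\sigma$ and $N$ chosen arbitrarily.
In [ ]:
mu, sigma, N = 1.0, 2.0, 1000
rng = np.random.default_rng(0)
x = rng.normal(mu, sigma, size=N)
mu_ml = np.mean(x)
sigma2_ml = np.mean((x - mu_ml) ** 2)   # note the 1/N (not 1/(N-1)) normalization
print(mu_ml, sigma2_ml)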

12.

If $n = m, \, \mathbb{E}[x_nx_m] = \mathbb{E}[x_n^2] = \mu^2 + \sigma^2$ (see problem 8).

If $n \neq m, \, \mathbb{E}[x_nx_m] = \mathbb{E}[x_n]\mathbb{E}[x_m] = \mu^2$ ($\because x_n, x_m$ are independent when $n\neq m$).

Hence, $\mathbb{E}[x_nx_m] = \mu^2 + I_{nm}\sigma^2$, where $I_{nm} = 1$ if $n = m$ and $I_{nm} = 0$ otherwise.

Now, $$ \begin{align*} \mathbb{E}[\mu_{ML}] &= \mathbb{E}\left[\frac{1}{N}\sum_{n=1}^Nx_n\right] \\ &= \frac{1}{N}\sum_{n=1}^N\mathbb{E}[x_n] \\ &= \frac{1}{N} \times N\mu \\ &= \mu \end{align*} $$ $$ \begin{align*} \mathbb{E}[\sigma_{ML}^2] &= \mathbb{E}\left[\frac{1}{N}\sum_{n=1}^N(x_n - \mu_{ML})^2\right] \\ &= \frac{1}{N}\sum_{n=1}^N\mathbb{E}[(x_n - \mu_{ML})^2] \\ &= \frac{1}{N}\sum_{n=1}^N\mathbb{E}\left[\left(x_n - \frac{1}{N}\sum_{i=1}^Nx_i\right)^2\right] \\ &= \frac{1}{N}\sum_{n=1}^N\mathbb{E}\left[x_n^2-\frac{2}{N}\sum_{i=1}^Nx_ix_n + \frac{1}{N^2}\sum_{j=1}^N\sum_{i=1}^Nx_ix_j\right] \\ &= \frac{1}{N}\sum_{n=1}^N\left(\mathbb{E}[x_n^2]-\frac{2}{N}\sum_{i=1}^N\mathbb{E}[x_ix_n] + \frac{1}{N^2}\sum_{j=1}^N\sum_{i=1}^N\mathbb{E}[x_ix_j]\right) \\ &= \frac{1}{N}\sum_{n=1}^N\left(\mu^2 + \sigma^2-\frac{2}{N}(N\mu^2 + \sigma^2) + \frac{1}{N^2}(N(\mu^2 + \sigma^2) + N(N-1)\mu^2)\right) \\ &= \frac{1}{N}\sum_{n=1}^N\left(\mu^2 + \sigma^2-\frac{2}{N}(N\mu^2 + \sigma^2) + \frac{1}{N}(N\mu^2 + \sigma^2)\right) \\ &= \frac{1}{N}\sum_{n=1}^N\left(\mu^2 + \sigma^2-\frac{1}{N}(N\mu^2 + \sigma^2)\right) \\ &= \frac{1}{N}\sum_{n=1}^N\left(\frac{N-1}{N}\right)\sigma^2 \\ &= \left(\frac{N-1}{N}\right)\sigma^2 \end{align*} $$

13.

The modified variance estimator is given by $$ \begin{align*} \sigma^2_{ML} &= \frac{1}{N}\sum_{n=1}^N(x_n-\mu)^2 \\ \implies \mathbb{E}[\sigma^2_{ML}] &= \mathbb{E}\left[\frac{1}{N}\sum_{n=1}^N(x_n-\mu)^2\right] \\ &= \frac{1}{N}\sum_{n=1}^N\mathbb{E}[(x_n-\mu)^2] \\ &= \frac{1}{N}\sum_{n=1}^N \sigma^2 \\ &= \frac{1}{N} \times N\sigma^2 \\ &= \sigma^2 \end{align*} $$
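
A Monte Carlo check of both results (problems 12 and 13): averaged over many simulated data sets, $\sigma^2_{ML}$ should come out near $\frac{N-1}{N}\sigma^2$, while the estimator that uses the true $\mu$ should come out near $\sigma^2$. The sample size and number of repetitions below are arbitrary.
In [ ]:
mu, sigma, N, reps = 0.0, 1.0, 5, 200_000
rng = np.random.default_rng(0)
x = rng.normal(mu, sigma, size=(reps, N))

# biased ML estimator (uses the sample mean) vs the estimator using the true mean
sigma2_ml = np.mean((x - x.mean(axis=1, keepdims=True)) ** 2, axis=1)
sigma2_true_mu = np.mean((x - mu) ** 2, axis=1)

print(sigma2_ml.mean(), (N - 1) / N * sigma ** 2)   # ~ 0.8
print(sigma2_true_mu.mean(), sigma ** 2)            # ~ 1.0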

30.

$$ \begin{align*} \mathrm{KL}(p\|q) &= -\int p(x) \ln \left(\frac{q(x)}{p(x)}\right) \mathrm{d}x \\ &= -\int p(x) \ln \left(\frac{\frac{1}{\sqrt{2\pi s^2}}e^{\frac{-(x-m)^2}{2s^2}}}{\frac{1}{\sqrt{2\pi \sigma^2}}e^{\frac{-(x-\mu)^2}{2\sigma^2}}}\right) \mathrm{d}x \\ &= -\int p(x) \left( \ln \left(\frac{\sigma}{s}\right) + \frac{(x-\mu)^2}{2\sigma^2} - \frac{(x-m)^2}{2s^2}\right) \mathrm{d}x \\ &= \ln \frac{s}{\sigma} + \frac{1}{2s^2}\mathrm{E}_p[(x-m)^2]-\frac{1}{2\sigma^2}\mathrm{E}_p[(x-\mu)^2] \\ &= \ln \frac{s}{\sigma} + \frac{1}{2s^2}(\mu^2 + \sigma^2 - 2m\mu + m^2) - \frac{1}{2} \\ &= \ln \frac{s}{\sigma} + \frac{1}{2s^2}(\sigma^2 + (\mu - m)^2) - \frac{1}{2} \\ \end{align*} $$
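
A numerical check of the closed form against direct integration of $-\int p(x)\ln\left(\frac{q(x)}{p(x)}\right)\mathrm{d}x$; the parameter values and grid are arbitrary.
In [ ]:
mu, sigma = 0.0, 1.0    # parameters of p
m, s = 1.0, 2.0         # parameters of q

x = np.linspace(-15, 15, 300_001)
dx = x[1] - x[0]

def gauss(x, mean, std):
    return np.exp(-(x - mean) ** 2 / (2 * std ** 2)) / np.sqrt(2 * np.pi * std ** 2)

p, q = gauss(x, mu, sigma), gauss(x, m, s)
kl_numeric = -np.sum(p * np.log(q / p)) * dx
kl_closed = np.log(s / sigma) + (sigma ** 2 + (mu - m) ** 2) / (2 * s ** 2) - 0.5
print(kl_numeric, kl_closed)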

35.

$$ \begin{align*} p(x) &= \frac{1}{(2\pi \sigma^2)^{1/2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \\ \mathrm{H}[x] &= -\int p(x) \ln p(x) \mathrm{d}x \\ &= -\int p(x) \ln \left(\frac{1}{(2\pi \sigma^2)^{1/2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\right) \mathrm{d}x\\ &= -\int p(x) \left(-\frac{1}{2}\ln (2\pi \sigma^2) - \frac{(x-\mu)^2}{2\sigma^2}\right) \mathrm{d}x \\ &= \frac{1}{2} \ln (2\pi \sigma^2) \int p(x) \mathrm{d}x + \int p(x) \frac{(x-\mu)^2}{2\sigma^2} \mathrm{d}x \\ &= \frac{1}{2} \ln (2\pi \sigma^2) \times 1 + \frac{1}{2\sigma^2} \times \sigma^2 \\ &= \frac{1}{2}\left(1 + \ln(2\pi \sigma^2)\right) \end{align*} $$
In [ ]:
x = np.linspace(1e-6, 1, num=50)
y = 0.5 * (1 + np.log(2 * np.pi * x))

plt.plot(x, y)
plt.title("Entropy of Gaussian distribution vs Variance")
plt.xlabel(r"$\sigma^2$")
plt.ylabel(r"$\mathrm{H}[X]$")
plt.show()

Note that, unlike in the discrete case, the differential entropy can be negative for continuous random variables! Here $\mathrm{H}[x] < 0$ whenever $\sigma^2 < \frac{1}{2\pi e}$.

40.

Let $a_1, a_2, \dots, a_n$ be $n$ positive real numbers. Applying Jensen's inequality (1.115) to the concave function $f(x) = \ln x$ on $a_1, a_2, \dots, a_n$ with $\lambda_i = \frac{1}{n} \, \forall i$ gives: $$ \begin{align*} \ln \left({\frac{1}{n}\sum_{i=1}^n a_i}\right) &\geq \frac{1}{n}\sum_{i=1}^n\ln a_i \\ &= \frac{1}{n}\ln \prod_{i=1}^n a_i \\ &= \ln \left(\prod_{i=1}^n a_i\right)^{\frac{1}{n}} \\ \text{Exponentiating both sides (the exponential is increasing),} \\ \frac{1}{n}\sum_{i=1}^n a_i &\geq \left(\prod_{i=1}^n a_i\right)^{\frac{1}{n}} \end{align*} $$
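
A quick numerical check of this inequality on arbitrary positive numbers.
In [ ]:
rng = np.random.default_rng(0)
a = rng.uniform(0.1, 10.0, size=20)   # arbitrary positive numbers
am = np.mean(a)
gm = np.exp(np.mean(np.log(a)))       # geometric mean computed via logs
print(am, gm, am >= gm)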

41.

$$ \begin{align*} \mathrm{I}[\pmb{x}, \pmb{y}] &\equiv \mathrm{KL}(p(\pmb{x}, \pmb{y}) \| p(\pmb{x})p(\pmb{y})) \\ &= -\int \int p(\pmb{x}, \pmb{y}) \ln {\frac{p(\pmb{x})p(\pmb{y})}{p(\pmb{x}, \pmb{y})}} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= -\int \int p(\pmb{x}, \pmb{y}) \ln {\frac{p(\pmb{x})p(\pmb{y})}{p(\pmb{x}) p(\pmb{y}|\pmb{x})}} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= -\int \int p(\pmb{x}, \pmb{y}) \ln {\frac{p(\pmb{y})}{p(\pmb{y}|\pmb{x})}} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= -\int \int p(\pmb{x}, \pmb{y}) (\ln {p(\pmb{y})} - \ln {p(\pmb{y}|\pmb{x})}) \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= -\int \int p(\pmb{x}, \pmb{y}) \ln {p(\pmb{y})} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} + \int \int p(\pmb{x}, \pmb{y}) \ln {p(\pmb{y}|\pmb{x})} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= -\int p(\pmb{y}) \ln {p(\pmb{y})} \mathrm{d}\pmb{y} + \int \int p(\pmb{x}, \pmb{y}) \ln {p(\pmb{y}|\pmb{x})} \mathrm{d}\pmb{x}\mathrm{d}\pmb{y} \\ &= \mathrm{H}[\pmb{y}] - \mathrm{H}[\pmb{y}|\pmb{x}] \\ \text{Similarly, } &= \mathrm{H}[\pmb{x}] - \mathrm{H}[\pmb{x}|\pmb{y}] \end{align*} $$
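
A small discrete check of the identity $\mathrm{I}[\pmb{x}, \pmb{y}] = \mathrm{H}[\pmb{y}] - \mathrm{H}[\pmb{y}|\pmb{x}]$, with sums in place of the integrals; the $2\times 2$ joint distribution below is arbitrary.
In [ ]:
# arbitrary 2x2 joint distribution p(x, y)
p_xy = np.array([[0.3, 0.1],
                 [0.2, 0.4]])
p_x = p_xy.sum(axis=1, keepdims=True)   # marginal p(x), shape (2, 1)
p_y = p_xy.sum(axis=0, keepdims=True)   # marginal p(y), shape (1, 2)

mutual_info = np.sum(p_xy * np.log(p_xy / (p_x * p_y)))   # KL(p(x,y) || p(x)p(y))
H_y = -np.sum(p_y * np.log(p_y))
H_y_given_x = -np.sum(p_xy * np.log(p_xy / p_x))          # -sum p(x,y) ln p(y|x)
print(mutual_info, H_y - H_y_given_x)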