\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Feb. 5, 2019}{Instructor:
Madhu Sudan}{Scribe: Prayaag Venkat}{Lecture 3}
\section{Administrative notes}
\begin{enumerate}
\item \textbf{Scribing:} Due to the large class size, students may double or triple up on scribing for lectures. Madhu will post further instructions.
\item \textbf{Problem Set 1:} Due Friday, February 8.
\item \textbf{Office Hours:} Madhu will hold office hours after lectures, in MD 339. See Piazza for Mitali's office hours.
\end{enumerate}
\section{Plan and Review}
In this lecture, we covered the following topics that will give us more background on information theory:
\begin{enumerate}
\item Conditional entropy, Divergence, Mutual Information
\item Divergence Theorem and applications
\end{enumerate}
Before proceeding, we review some concepts from the previous lecture. For a random variable $X$, its \emph{entropy} $H(X)$ is the average number of bits needed to convey $n$ i.i.d. copies $X_1,\ldots,X_n$ of $X$ in expectation. Here, we are averaging over the $n$ copies (dividing by $n$) and computing the expectation over the random variables $X_1, \ldots,X_n$. We saw that if $X$ is supported on a finite set $\Omega = [m]$ and its distribution $P_X$ is written as $P_X = (p_1, \ldots, p_m)$ (where $p_i \geq 0$ and $\sum_{i=1}^m p_i = 1$), then we can write:
\[
H(X) = \sum_{i=1}^m p_i \log \frac{1}{p_i} = \E_{i \sim P_X}[\log \frac{1}{p_i}].
\]
We can interpret this second expression as telling us that to encode element $i$, we are ``budgeting'' $l^*_i = \log \frac{1}{p_i}$ bits. We can then ask if this choice of $\{l_i^*\}_{i=1}^m$ is the best set of encoding lengths. Is it possible that some other $\{l_i\}_{i=1}^m$, where we encode $i$ using $l_i$ bits, achieves a smaller expected encoding length? Problem 4 on Problem Set 1 (Kraft's Inequality) asks you to investigate what constraints one must have on $\{l_i\}_{i=1}^m$ in order to have a valid encoding. Any prefix-free encoding must satisfy $\sum_i 2^{-l_i} \leq 1$.
Given $\{l_i\}_{i=1}^m$, we can define $q_i = 2^{- l_i}$. It is easy to see that $q_i \geq 0$ and $\sum_i q_i \leq 1$ (if a corresponding prefix-free encoding exists, by Kraft's inequality). Then the expected number of bits we need to send is $\sum_i p_i l_i = \sum_i p_i \log (\frac{1}{q_i})$. By the end of this lecture, we hope to show that:
\[
\sum_i p_i \log (\frac{1}{q_i}) \geq \sum_i p_i \log (\frac{1}{p_i}).
\]
This tells us that the optimal way to compress $P_X$ is by using $\{l_i^*\}_{i=1}^m$, rather than any other $\{l_i\}_{i=1}^m$.
\section{Axioms of Entropy}
First, we set up some notation. $X$ and $Y$ are random variables supported on $\Omega$. Their joint distribution is $P_{XY}$, written $(X,Y) \sim P_{XY}$, which simply means $\Pr[X=\alpha,Y=\beta] = P_{XY}(\alpha,\beta)$. The marginal distribution of $X$ is $P_X$, where $P_X(\alpha) = \sum_{\beta \in \Omega} P_{XY}(\alpha,\beta)$, and similarly for $Y$. The conditional distribution of $Y$ given that $X = \alpha$ is $P_{Y | X= \alpha}$, where $P_{Y | X= \alpha}(\beta) = \frac{P_{XY}(\alpha,\beta)}{P_X(\alpha)}$. Finally, we write $X \bot Y$ to denote that $X,Y$ are independent.
Now, recall the following axioms. By the end of the lecture, we will formally prove all of them.
\begin{enumerate}
\item $H(X) \leq \log |\Omega|$, with equality iff $P_X = Unif(\Omega)$.
\item $H(X,Y) = H(X) + H(Y|X)$. This is the chain rule for entropy.
\item $H(Y|X) \leq H(Y)$. This captures the intuitive fact that conditioning can only reduce entropy.
\end{enumerate}
\section{Conditional Entropy}
\begin{defn}[Conditional entropy]
The \emph{conditional entropy} of $Y$ given $X$ is the expected entropy of the conditional random variable $Y|X$. Formally, it is defined as:
\[
H(Y|X) = \E_{\alpha \sim P_X}[H(Y|X=\alpha)] = \sum_{\alpha \in \Omega} P_X(\alpha)H(Y|X= \alpha) = \sum_{\alpha,\beta\in \Omega} P_{XY}(\alpha,\beta) \log \frac{P_X(\alpha)}{P_{XY}(\alpha,\beta)}.
\]
\end{defn}
\begin{exercise}
Given this definition of conditional entropy, prove Axiom 2. To do this, express each of $H(X), H(Y), H(Y|X)$ as a sum over $\alpha,\beta \in \Omega$ and compare them term-by-term.
\end{exercise}
\begin{exercise}
Recall that $X \bot Y$ means $P_{XY}(\alpha,\beta) = P_X(\alpha) P_Y(\beta)$ for all $\alpha, \beta \in \Omega$. Prove that if $X \bot Y$, then $H(Y|X) = H(Y)$ (this is one part of Axiom 3).
\end{exercise}
Combining these two exercises, we easily obtain the following intuitive result that entropy is additive for independent random variables.
\begin{corollary}
If $X_1,\ldots,X_n$ are i.i.d. copies of $X$ then $H(X_1,\ldots,X_n) = nH(X)$.
\end{corollary}
\section{Divergence}
We now return to the following central inequality:
\[
\sum_i p_i \log (\frac{1}{q_i}) \geq \sum_i p_i \log (\frac{1}{p_i}).
\]
From this, we can prove all the inequality parts of the axioms. The main technical tool is the following.
\begin{theorem}[Divergence Theorem]
Let $P,Q$ be distributions on $\Omega$. Then:
\[
\E_{x \sim P}[\log \frac{1}{P(x)}] \leq \E_{x \sim P}[\log \frac{1}{Q(x)}].
\]
Moreover, equality is attained iff $P=Q$.
\end{theorem}
Note that in the inequality, both expectations are taken over $P$. First, if $P(x) = 0$, then we can just take $P(x) \log \frac{1}{P(x)}$ to be 0. Second, if $P(x) > 0$, but $Q(x) = 0$, then the right hand side of the inequality is $\infty$, meaning that $Q$ was not expecting $x$ to appear and hence could have encoded this string using a very long sequence.
To prove this Divergence Theorem, we will make use of Jensen's Inequality.
\begin{theorem}[Jensen's Inequality]
Let $f: \R \rightarrow \R$ be a concave function and $Z$ a real-valued random variable. Then:
\[
\E_Z [f(Z)] \leq f(\E_Z[Z]).
\]
Moreover, if $f$ is strictly concave, then equality holds iff $Z$ is deterministic (a constant).
\end{theorem}
We omit the proof; see the Wikipedia page for an explanation.
\begin{proof}[Proof of Divergence Theorem]
Apply Jensen's Inequality on the function $f(x) = \log x$ (which is strictly concave) and the random variable $Z = \frac{Q(X)}{P(X)}$ where $X \sim P$. Then it follows that:
\[
\E_{X\sim P}[\log \frac{Q(X)}{P(X)}] \leq \log \E_{X\sim P}[\frac{Q(X)}{P(X)}] = 0.
\]
Rearranging, we get that:
\[
\E_{x \sim P}[\log \frac{1}{P(x)}] \leq \E_{x \sim P}[\log \frac{1}{Q(x)}].
\]
Next, we leave it as an exercise to check that the equality part of the theorem follows from the equality part of Jensen's Inequality.
\end{proof}
Revisiting the proof, we can extract the following useful definition.
\begin{defn}(Kullback-Leibler Divergence)
The \emph{KL divergence} between two distributions $P,Q$ is:
\[
D(P||Q) = \E_{X \sim P}[\log \frac{P(X)}{Q(X)}].
\]
\end{defn}
Roughly, $D(P||Q)$ represents the dissimilarity of the two distributions. It describes the average increase in bits one would need to encode $X \sim P$ under the mistaken belief that $X \sim Q$. More explicitly, the KL divergence satisfies the following nice properties:
\begin{enumerate}
\item $D(P||Q) \geq 0$, with equality iff $P=Q$.
\item $D(P^n||Q^n) = n D(P||Q)$, where $P^n$ denotes the $n$-fold product distribution of $P$.
\end{enumerate}
On the other hand, the KL divergence is not so well-behaved in the following ways:
\begin{enumerate}
\item It is not symmetric. That is, $D(P||Q) \neq D(Q||P)$ in general.
\item It does not satisfy the triangle inequality. That is, $D(P||Q) \nleq D(P||R) + D(R||Q)$ in general.
\item $D(P||Q)$ is not bounded. This occurs, for example, when $Q(x) = 0 < P(x)$ for some element $x \in \Omega$.
\end{enumerate}
\subsection{Applications}
We will now use the Divergence Theorem to prove the remaining parts of the axioms.
\begin{exercise}
Prove Axiom 1. To do this, instantiate the Divergence Theorem with $P = P_X$ and $Q = Unif(\Omega)$.
\end{exercise}
To prove Axiom 3, note that we will look at the divergence between $P_{XY}$ (the joint distribution) and $P_{X} \times P_Y$ (the product distribution of the marginals). Note that if $X \bot Y$, then $P_{XY} = P_{X} \times P_Y$. From the chain rule, we know that $H(X,Y) = H(X) + H(Y|X)$. Because $P_{X} \times P_Y$ is a product distribution, the entropy of a random variable from this distribution is $H(X) + H(Y)$. If we show that $H(X,Y) \leq H(X) + H(Y)$, then we may conclude that $H(Y|X) \leq H(Y)$ (which is precisely Axiom 3).
Proceeding in this way, we know $0 \leq D(P_{XY} ||P_{X} \times P_Y)$. Rearranging as in the proof of the Divergence Theorem, we have:
\[
H(X,Y) = \E_{(x,y) \sim P_{XY}}[\log \frac{1}{P_{XY}(x,y)}] \leq \E_{(x,y) \sim P_{XY}}[\log \frac{1}{P_X(x)P_Y(y)}] = H(X) + H(Y),
\]
where the last step follows by expanding the logarithm of the product and collecting terms appropriately.
\section{Mutual Information}
\begin{defn}
The \emph{mutual information} $I(Y;X)$ of two random variables $X,Y$ represents the amount of information that $X$ contains about $Y$. Formally, we define it to be $I(Y;X) = H(Y) - H(Y|X)$.
\end{defn}
The following corollary is implied by the third axiom.
\begin{corollary}
$I(Y;X) \geq 0$, with equality iff $X \bot Y$.
\end{corollary}
\begin{exercise}
Verify that $I(Y;X) = I(X;Y)$.
\end{exercise}
\subsection{Conditional Mutual Information}
\begin{defn}
The \emph{mutual information} $I(Y;X|Z)$ of two random variables $X,Y$ conditioned on a third random variable $Z$ represents the amount of information that $X|Z$ contains about $Y|Z$. Formally, we define it to be $I(Y;X | Z) = \E_{z \sim P_Z}[I(Y|Z=z;X|Z=z)] = H(Y|Z) - H(Y|X,Z)$.
\end{defn}
Similar to entropy, we have a chain rule for mutual information. For any random variables $X_1, \ldots, X_n$ and $Y$, we have
\[
I(Y;X_1,\ldots,X_n) = I(Y;X_1) + I(Y;X_2|X_1) + \ldots + I(Y;X_n|X_1,\ldots,X_{n-1}).
\]
\section{More Inequalities}
We now state two more inequalities. We did not have time to cover the proofs in lecture, but they follow from the machinery we have developed so far.
\begin{theorem}[Data Processing Inequality]
Let $X \rightarrow Y \rightarrow \hat{X}$ be a Markov chain (meaning $X, \hat{X}$ are independent, conditioned on $Y$). Then:
\[
I(X;\hat{X}) \leq I(X;Y).
\]
\end{theorem}
This inequality models the following scenario. $X$ is a random variable we want to predict, based on observing only the random variable $Y$. $\hat{X}$ represents an estimate of $X$, based on $Y$. The inequality says that our estimator cannot contain more information about $X$ than does $Y$.
As a special case, one can take $\hat{X} = g(Y)$, where $g$ is some (deterministic) function. Then $I(X;g(Y)) \leq I(X;Y)$ describes a limitation on our predictor $g$.
As a side note, if $H(X)$ is small, then this tells us that $X$ should be ``predictable''. Similarly, if $H(X|Y)$ is small, then $X$ should be ``predictable'' from $Y$. Problem 5 of Problem Set 1 asks you to investigate this intuition and prove Fano's Inequality.
\end{document}