\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Jan 29, 2019}{Instructor:
Madhu Sudan}{Scribe: Kenz Kallal}{Lecture 1}
\section{Welcome to CS 229r}
\subsection{Course Information}
Contact information and office hours:
\begin{itemize}
\item \textbf{Lecturer:} Madhu Sudan (\href{mailto:madhu@cs.harvard.edu}{madhu@cs.harvard.edu}).
\item \textbf{Prof. Sudan's Office Hours:} Tuesday, Thursday 1:15--2:15.
\item \textbf{TF:} Mitali Bafna (\href{mailto:mitali.bafna@gmail.com}{mitali.bafna@gmail.com})
\item \textbf{TF's Office Hours:} This week Wednesday and Friday 4:30--5:30 at LISE 319.
\end{itemize}
\subsection{Course Expectations}
Grades will be based on the following:
\begin{itemize}
\item 3--5 problem sets
\item Scribing $\geq 1$ lecture
\item Final project
\item Participation (in class and on Piazza)
\end{itemize}
\subsection{Course Topics}
The first few lectures will be about the basics of information theory. The rest of the course will then cover applications of information theory to computer science:
\begin{itemize}
\item Limits on the performance of data structures
\item How well can information be compressed?
\item Error-correcting codes
\item Communication complexity
\item Streaming
\item Differential privacy
\item Optimization
\end{itemize}
\newpage
\section{Basics of Information Theory}
Today we will not be rigorous about the definitions or manipulations of notions from information theory. Instead, we will give a sense of how the tools of information theory might be applied to solve interesting problems.
\subsection{Random Variables}
Let $X$ be a random variable with probability distribution $P_X$. In this context it is convenient to restrict $X$ to a compact set $\Omega$. Recall that random variables $X, Y$ can be jointly distributed with probability distribution $P_{XY}$. This carries the data $\{P_{Y | X = \alpha}\}_{\alpha \in \Omega}$ of probability distributions for $Y$ given any possible fixed value of $X$.
\subsection{Entropy}
Today we will not give a fully rigorous definition of entropy, but the following ``definition'' will suffice to motivate our use of it in the next section.
\begin{defn}[Entropy]
Let $X$ be a random variable. The \emph{entropy} of $X$, denoted $H(X)$, is ``the number of bits needed, in expectation, to convey $X$.''
\end{defn}
For example, Alice and Bob might both know $P_X$, and they need to come up with a protocol to compress $X$ and send it over the line to each other.
So far we have no rigorous way to calculate the entropy of a random variable, but intuition tells us what the answers are in some easy examples:
\begin{example}
Suppose $P_X$ is the uniform distribution over $\{0, 1\}^n$. Then intuitively we must use $n$ bits to convey $X$, and we can write ``$H(X) = n$.''
\end{example}
\begin{example}
Suppose $X$ is $0^n$ with probability $1/2$ and is uniformly distributed over $\{0, 1\}^n$ with probability $1/2$. Then we can use a single bit to indicate which case occurs, and an additional $n$ bits in case the second case occurs. The expected value of the number of bits used is
\[\frac{1 + (n+1)}{2} = \frac{n}{2} + 1\]
so we can write ``$H(X) \approx n/2$.''
\end{example}
\begin{defn}[Conditional Entropy]
The entropy of $Y$ conditioned on $X$, denoted $H(Y | X)$, is ``the number of bits needed, in expectation, to convey $Y$ given that $X$ is known.'' More precisely,
\[H(Y | X) = \E_{\alpha \sim P_X}[H(Y | X = \alpha)],\]
where $Y | X = \alpha$ is distributed according to the conditional distribution $P_{Y | X = \alpha}$ determined by the joint distribution $P_{XY}$.
\end{defn}
\begin{example}
Suppose $X$ and $Y$ are independent and uniformly distributed over $\{0, 1\}^n$. Intuitively, knowing $X$ does not give any additional information about $Y$, so we can write ``$H(Y | X) = H(Y) = n$.''
\end{example}
\begin{example}
Suppose $X$ is uniformly distributed over $\{0, 1\}^n$, and $Y$ is uniformly distributed over $\{0, 1\}^{2n}$ such that $X$ consists of the first $n$ bits of $Y$. Then, given $X$, one knows the first $n$ bits of $Y$ (and no other information is conveyed by $X$) so we can write ``$H(Y|X) = n$.''
\end{example}
We now state some intuitive axioms for entropy.
\begin{enumerate}
\item[(1)]\label{axiom1} If $|\Omega| < \infty$, then $H(X) \leq \log |\Omega|$ with equality if and only if $P_X$ is uniform on $\Omega$.
\item[(2)]\label{axiom2} $H(X, Y) = H(X) + H(Y | X)$. Intuitively, ``to specify $X$ and $Y$ it suffices to specify $X$ and then $Y$ given that $X$ has already been transmitted.'' One can show that this method of transmitting $X, Y$ is optimal. [NB: this axiom is frequently called the \emph{chain rule} for entropy.]
\item[(3)]\label{axiom3} $H(Y | X) \leq H(Y)$.
\end{enumerate}
Warning: Axiom (3) does not necessarily work when specialized to an arbitrary value of $X$; it is only true in expectation over all possible values of $X$.
\begin{exercise}
Construct a counterexample to the assertion that $H(Y | X = \alpha) \leq H(Y)$.
\end{exercise}
\section{Shearer's Lemma}
Let $F \subseteq [N]^d$ represent some object in $d$-dimensional space. For any set $S \subseteq [d]$ with $|S| = k \leq d$, we can project $F$ to a $k$-dimensional object on the coordinates described by $S$. In particular, if $S = \{i_1, \ldots, i_k\}$ with WLOG $i_1 < \cdots < i_k$, we can define
\[F_S := \{(x_{i_1}, \ldots, x_{i_k}) : (x_1, \ldots, x_d) \in F\}.\]
Intuitively, knowing that the projections of $F$ are small should tell us that $F$ cannot be too big. This is the content of Shearer's Lemma.
\begin{lemma}[Shearer's Lemma]
Let $F \subseteq [N]^d$ and $k \leq d$. Then
\[|F|^{\binom{d-1}{k-1}} \leq \prod_{\substack{S \subseteq [d] \\ |S| = k}}|F_S|.\]
\end{lemma}
In the case $d = 3, k = 1$, this specializes to the following:
\begin{lemma}[Shearer's Lemma, ``infant version'']
Let $F \subseteq [N]^3$. Then
\[|F| \leq |F_{\{1\}}||F_{\{2\}}||F_{\{3\}}|.\]
\end{lemma}
\begin{proof}
Each element of $F$ is of the form $(x_1, x_2, x_3)$, where by definition $x_i \in F_{\{i\}}$ for $i = 1, 2, 3$. So, we have an inclusion of sets
\[F \subseteq F_{\{1\}} \times F_{\{2\}} \times F_{\{3\}}.\]
Taking the cardinalities of both sides, the result is immediate.
\end{proof}
We use entropy to prove a harder case, namely $d = 3, k = 2$.
\begin{lemma}[Shearer's Lemma, ``baby version'']
Let $F \subseteq [N]^3$. Then
\[|F|^2 \leq |F_{\{1, 2\}}||F_{\{2, 3\}}||F_{\{1, 3\}}|.\]
\end{lemma}
\begin{proof}
Take the random variable $(X, Y, Z)$ to be uniformly distributed on $F$. By Axiom (1), we know
\[H(X, Y, Z) = \log |F|.\]
By definition of the projections,
\begin{itemize}
\item $(X, Y)$ is restricted to $F_{\{1, 2\}}$
\item $(Y, Z)$ is restricted to $F_{\{2, 3\}}$
\item $(X, Z)$ is restricted to $F_{\{1, 3\}}$
\end{itemize}
So Axiom (1) yields
\begin{align*}
H(X, Y) &\leq \log|F_{\{1, 2\}}|\\
H(Y, Z) &\leq \log|F_{\{2, 3\}}|\\
H(X, Z) &\leq \log|F_{\{1, 3\}}|.
\end{align*}
To show the desired result $|F|^2 \leq |F_{\{1, 2\}}||F_{\{2, 3\}}||F_{\{1, 3\}}|$, by taking logs it therefore suffices to show
\[2H(X, Y, Z) \leq H(X, Y) + H(Y, Z) + H(X, Z).\]
Using Axiom (2), we have
\begin{align*}
H(X, Y) &= H(X) + H(Y | X)\\
H(Y, Z) &= H(Y) + H(Z | Y)\\
H(X, Z) &= H(X) + H(Z | X).
\end{align*}
Axiom (3) tells us that $H(Y) \geq H(Y | X)$ and $H(Z | X), H(Z | Y)\geq H(Z | X, Y)$. Adding up the three equations above and applying these inequalities,
\[H(X, Y) + H(Y, Z) + H(X, Z) \geq 2H(X) + 2H(Y | X) + 2H(Z | X, Y).\]
The right hand side is equal to $2H(X, Y) + 2H(Z | X, Y) = 2H(X, Y, Z)$ by Axiom (2), which yields the desired result.
\end{proof}
\begin{exercise}
Can the proof of the baby version of Shearer's Lemma be extended to the general case?
\end{exercise}
\end{document}