\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\usepackage{tikz}
\usepackage[europeanresistors,americaninductors]{circuitikz}
\usetikzlibrary{chains}
\usetikzlibrary{decorations.pathreplacing,decorations.pathmorphing}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Feb 19, 2019}{Instructor:
Madhu Sudan}{Scribe: Jane Ahn}{Lecture 7}
\section{Overview}
\subsection{Outline}
\begin{enumerate}
\item Converse Coding Theorems
\item Efficiency in Coding
\item Linear Coding \& Linear Compression
\end{enumerate}
\subsection{Administrative Things}
\begin{enumerate}
\item We have a new TF! His name is Noah Golowich
\item Problem set 1 solutions are out
\item Problem set 2 is out
\item No Office Hours for Madhu this Thursday
\item Mitali has Office Hours Tuesday (2/19) at 5pm and Thursday (2/21) at 5pm
\end{enumerate}
\section{Channel Coding}
Our goal is to recover the original message that has passed through a noisy channel, where we also have an encoding function $E_n: \{0, 1\}^{Rn} \to \Omega_X^n$ and a decoding function $D_n: \Omega_Y^n \to \{0, 1\}^{Rn}$. A channel looks like this:
\[\begin{tikzpicture}[
start chain=going right,
box/.style={
on chain,join,draw,
minimum height=3cm,
text centered,
minimum width=2cm,
},
every join/.style={ultra thick},
node distance=5mm
]
\node [on chain] {$x \in \Omega_X$};
\node [on chain,join,draw,
text width=1cm,
minimum width=4cm,
minimum height=1.6cm,
label=above: Channel,
]{$P_{Y|X}$} ;
\node [on chain,join,xshift=5mm]{$y \in \Omega_Y$};
\end{tikzpicture}\]
If you feed in $x \in \Omega_X$, you get $y \in \Omega_Y$. Every channel is specified by the conditional probability distribution $P_{Y | X}$. Define a channel's capacity to be the following:
\[Cap = \sup_R \{\lim_{\epsilon \to 0} \lim_{n \to \infty} \{\text{communication of $Rn$ bits is possible with $\epsilon$-error during $n$ uses of channel}\}\}\]
The capacity is the supremum of all rates $R$ as error goes to zero and the string has sufficient bits. Capacity tells you how well/efficiently you can communicate. Now if $E_n(m)_i \sim P_X$ i.i.d. over $m, i$, (we proved last time that) these channels have bounded capacity. What we proved last time was for every $\epsilon > 0$, we can achieve $R \geq \sup_{P_X} \{I(X; Y)\} - \epsilon$. This means the capacity can get arbitrarily close to the information $Y$ gives about $X$. This implies that $\sup_{P_X}\{I(X; Y)\}$ is a lower bound for the capacity. We now define
\[C_0 \triangleq \sup_{P_X}\{I(X; Y)\}\]
for notational convenience, and prove the following claim:
\begin{claim}
\[Cap = \sup_{P_X} \{I(X; Y)\}\]
\end{claim}
\subsection{Upper Bounding Capacity}
We won't prove the following theorem in class, but it gives a really strong bound:
\begin{theorem}
For a BSC$(p)$, for all $\epsilon > 0$ there exists $\delta > 0$ such that if rate $R > C_0 + \epsilon$, then $\Pr[\text{decoding error}] \geq 1 - \exp(-\delta n)$.
\end{theorem}
This implies that if the rate is too high, most of the time you get the wrong answer. We will prove the following theorem instead:
\begin{theorem}
For all $P_{Y | X}$ (channels) and for all $\epsilon > 0$, there exists $\delta > 0$ such that if rate $R > C_0 + \epsilon$, then $\Pr[\text{decoding failure}] \geq \delta$.
\end{theorem}
\begin{proof}
We want to use the following understanding of the process.
\begin{center}
\begin{tikzpicture}
%{$\displaystyle p$};
\draw (2,-1) -- (2,1);
\draw (2,-1) -- (5,-1);
\draw (5,-1) -- (5,1);
\draw (2,1)--(5,1);
\draw (6,-1) -- (6,1);
\draw (6,-1) -- (9,-1);
\draw (9,-1) -- (9,1);
\draw (6,1)--(9,1);
\draw (10,-1) -- (10,1);
\draw (10,-1) -- (13,-1);
\draw (13,-1) -- (13,1);
\draw (10,1)--(13,1);
\draw [->] (1,0) -- (2,0);
\draw [->] (5,0) --(6,0);
\draw [->] (9,0) --(10,0);
\draw [->] (13,0) --(14,0);
\node at (1.5,0.2) {$\displaystyle m$};
\node at (5.5,0.2) {$\displaystyle X^n$};
\node at (9.5,0.2) {$\displaystyle Y^n$};
\node at (13.5,0.2) {$\displaystyle \hat{m}$};
\node at (3.5,0) {$\displaystyle Encode$};
\node at (7.5,0) {$\displaystyle Channel$};
\node at (11.5,0) {$\displaystyle Decode$};
\end{tikzpicture}
\end{center}
Note that the above process is a Markov chain, which means that if we fix any state, the future is independent of the past. For example, if we fix $Y^n$, $\hat{m}$ is independent of $m, X^n$.
Define $\delta = Pr[m \neq \hat{m}]$. We wish to show that $\delta > 0$. In order to understand the probability that $m \neq \hat{m}$, we look at the entropy
\[H(m) = nR\]
since $m \in \{0, 1\}^{nR}$. Now by definition of mutual information, we have
\[H(m) = H(m | \hat{m}) + I(\hat{m}; m)\]
By the data processing inequality, we see that the information $\hat{m}$ gives about $m$ cannot be larger than the information $Y^n$ gives about $m$. Similarly, this amount is at most the amount of information $X^n$ gives about $Y^n$. Hence, expanding by the chain rule,
\[I(\hat{m}; m) \leq I(Y^n; X^n) \leq \sum_{i = 1}^n I(Y_i; X^n, Y_1, \cdots, Y_{i-1})\]
Now consider the Markov chain
\begin{center}
\begin{tikzpicture}
\draw [->] (5.5,0) --(6.5,0);
\draw [->] (8.5,0) --(9.5,0);
\node at (1.5,0.2) {};
\node at (5.5,0.2) {};
\node at (9.5,0.2) {};
\node at (13.5,0.2) {};
\node at (3.5,0) {$(X^n; Y_1, \cdots, Y_{n-1})$};
\node at (7.5,0) {$X_i$};
\node at (10,0) {$Y_i$};
\end{tikzpicture}
\end{center}
Conditioning on $X_i$, we know that the left and right ends are independent: the channel is memoryless, so $Y_i$ depends on $(X^n, Y_1, \cdots, Y_{i-1})$ only through $X_i$. Hence $I(Y_i; X^n, Y_1, \cdots, Y_{i-1}) = I(Y_i; X_i)$, and we upper bound $I(\hat{m}; m)$ by
\[I(\hat{m}; m) \leq I(Y^n; X^n) \leq \sum_{i = 1}^n I(Y_i; X^n, Y_1, \cdots, Y_{i-1}) \leq \sum_{i = 1}^n I(Y_i ; X_i)\]
and further, since $C_0$ is the supremum over input distributions, $I(Y_i; X_i) \leq C_0$ for every $i$, so
\[I(\hat{m}; m) \leq I(Y^n; X^n) \leq \sum_{i = 1}^n I(Y_i; X^n, Y_1, \cdots, Y_{i-1}) \leq \sum_{i = 1}^n I(Y_i ; X_i) \leq nC_0\]
Now we consider $H(m | \hat{m})$ using Fano's inequality. We get
\[H(m | \hat{m}) \leq H(1_{m \neq \hat{m}}) + \Pr[m \neq \hat{m}]\log |\{0, 1\}^{nR}| \leq 1 + \delta nR\]
where $1_{m \neq \hat{m}}$ is the indicator variable for the event $m \neq \hat{m}$. Hence, we get
\[nR \leq 1 + \delta nR + nC_0 \implies \delta nR \geq n(R - C_0) - 1 \geq \epsilon n - 1 \implies \delta \geq \frac{\epsilon}{R} - \frac{1}{nR}\]
\end{proof}
\section{Efficiency in Coding}
Once we start looking at algorithm efficiency, what we have is pretty bad. There are three stages of efficiency:
\begin{enumerate}
\item Preprocessing: Designing $E_n$ and $D_n$
\item Encoding Complexity: How much work does it take to compute the encoding given a message
\item Decoding Complexity: How much work does it take to compute the decoding given an encoding
\end{enumerate}
How long does it take to compute $E_n$? We look at each of the stages:
\begin{enumerate}
\item There are $2^{Rn}$ possible messages; this is the size of the encoding (since if it's completely random we can't compress it anymore): this is the space complexity and the randomized time complexity
\item Encoding still has $2^{Rn}$ space since we have to look up which encoding to use; time is poly$(n)$
\item Decoding is still $2^{Rn}$ space, $2^{Rn}$ deterministic time complexity (once we have the decoding everything is deterministic), since decoding will go through all possible encodings
\end{enumerate}
Obviously, this isn't that great. But in the next couple lectures we'll develop something that will give us poly time. Note we've assumed so far that probability of error is exponentially low, but this is something we can leverage (we just need probability of error going to 0, not necessarily decreasing exponentially). We can do this by dividing the long block into small chunks and encoding each chunk separately. Moreover, from now on, we are solely going to focus on binary symmetric channels.
Given $k$ bits, we chop them up into blocks of length $10 \log k$. We apply Shannon's methodology independently to each block (the new ingredient being the breaking of the large block into smaller chunks). Let $\ell = 10 \log k$ and $L = \frac{10 \log k}{C_0 - \epsilon}$.
\begin{center}
\begin{tikzpicture}
\draw (0,0) rectangle (5,-0.3);
\draw[decorate,decoration={brace,amplitude=5pt},xshift=0pt,yshift=3pt] (0,0) -- (5,0);
\node at (2.5,0.5) {$k$};
\draw (2.8,-0.9) -- (0,-0.9) -- (0,-0.6) -- (2.8,-0.6);
\draw (3.8,-0.9) -- (5,-0.9) -- (5,-0.6) -- (3.8,-0.6);
\foreach \x in {0.8,1.6,2.4,4.2} {
\draw (\x,-0.6) -- (\x,-0.9);
}
\node at (3.3,-0.75) {$\cdots$};
\draw[decorate,decoration={brace,mirror,amplitude=5pt},xshift=0pt,yshift=-3pt] (0,-0.9) -- (0.8,-0.9);
\node at (0.4,-1.4) {\scalebox{0.8}{$\ell = 10 \log k$}};
\draw (3.92,-1.9) -- (0,-1.9) -- (0,-2.2) -- (3.92,-2.2);
\draw (5.32,-1.9) -- (7,-1.9) -- (7,-2.2) -- (5.32,-2.2);
\foreach \x in {1.12,2.24,3.36,5.88} {
\draw (\x,-1.9) -- (\x,-2.2);
}
\node at (4.62,-2.05) {$\cdots$};
\draw[decorate,decoration={brace,mirror,amplitude=5pt},xshift=0pt,yshift=-3pt] (0,-2.2) -- (1.12,-2.2);
\node at (0.56,-2.75) {\scalebox{0.8}{$L = \frac{10 \log k}{C_0 - \epsilon}$}};
\draw[->,decorate,decoration={snake,amplitude=.2mm,segment length=2mm}] (2.2,-1.15) -- (2.5,-1.65);
\node[right] at (2.4,-1.4) {\scalebox{0.8}{$E_L$}};
\draw[->,decorate,decoration={snake,amplitude=.2mm,segment length=2mm}] (5,-1.15) -- (6,-1.65);
\node[above right] at (5.5,-1.4) {\scalebox{0.8}{$E_L$}};
\end{tikzpicture}
\end{center}
\begin{enumerate}
\item The preprocessing cost and space is now $\exp(L) = \mathrm{poly}(k)$. There is still randomness here.
\item Encoding needs a table where you have to look things up, and this takes now poly time in $k$ (since it is exp in $10 \log k$).
\item Decoding is similar; do brute force on each block - which string is most likely to have produced encoding? Decoding also takes $poly(k)$ now (since it is exp in $10 \log k$).
\end{enumerate}
The error probability is the probability that there exists a block that was decoded incorrectly. By the union bound,
\[\Pr[\text{there exists a block decoded incorrectly}] \leq k \cdot \Pr[\text{block is decoded incorrectly}] \leq k \cdot \frac{1}{k^2} = \frac{1}{k}\]
The middle inequality holds assuming the constant 10 in the block length was chosen large enough. The upper bound $\frac{1}{k}$ goes to 0 as $k \to \infty$. People probably knew this idea a long time ago, and we implicitly use it all the time, but its formalization took longer!
However, there are problems with the current solution.
\begin{enumerate}
\item The running time of the decoding function is at least $\frac{1}{\text{error probability}}$.
\item There is also a lower bound on how small each of these small blocks can be. Each block should be big enough so that the probability of a bit flip is ``detectable'' and so you can check for errors. This is related to the divergence, and you get that the length of each block must be $\geq \frac{1}{\epsilon^2}$ for $\epsilon = C_0 - R$. Any smaller and you can't distinguish between different error rates. So the running time is also at least $2^{\frac{1}{\epsilon^2}}$.
\end{enumerate}
Hence, running time of decoder is at least
\[\max\left\{\frac{1}{\text{error probability}}, 2^{\frac{1}{\epsilon^2}}\right\}\]
The first problem was resolved by a technique called ``concatenated codes'' by Forney in 1966. The rough idea is that you don't want to take the union bound (effectively worrying about a single corrupted block); instead, you put in an extra outer encoding before you encode each block separately, and worry only about a $\delta$ fraction of corrupted blocks (using Chernoff bounds). This failure probability is exponentially small in $k$, so the error probability is exponentially small while the runtime remains polynomial. For the second problem, we had no idea how to get around the $2^{\frac{1}{\epsilon^2}}$ barrier until 2008 (formally proved in 2013), when it was resolved using Polar Codes.
\section{Linear Coding $\Leftarrow$ Linear Compression}
Suppose the encoding map is linear, i.e. $E_n: \mathbb{F}_2^k \to \mathbb{F}_2^n$. So far, all encoding functions were random, mapping strings of length $k$ to a random string of length $n$. We get a random linear encoding by picking a random matrix.
\begin{claim}
Random Linear Encoding achieves capacity.
\end{claim}
\begin{exercise}
Prove the above claim. Could possibly be a problem on the next problem set.
\end{exercise}
The nice thing about linear encoding maps is that we only need polynomial space. We only require a $k \times n$ matrix rather than a table of $2^k$ entries. So it's much more space-efficient. Note that we still have $\E_m[\Pr[\text{decoding incorrectly}]]$ small, so for all messages, the probability of decoding incorrectly is small. Further, error detection is easy: given a received string we want to decode, first check whether it could be the encoding of some message. This can be done with just standard linear algebra. Finally, a random linear map is much more likely to be injective.
\begin{proposition}
For every full rank matrix $G \in \mathbb{F}_2^{n \times k}$, there exists a full rank $H \in \mathbb{F}_2^{m \times n}$ where $m = n - k$ such that $HG = 0$, i.e. every row of $H$ is orthogonal to every column of $G$.
\end{proposition}
\begin{exercise}
Prove the above proposition.
\end{exercise}
\begin{exercise}
For $x \in \mathbb{F}_2^n$, show that $Hx = 0$ iff there exists $m$ such that $x = Gm$.
\end{exercise}
What does $H$ do? $H$ is actually compressing the error...why?
\subsection{Efficient Linear Compression for Bern$(p)^n$}
\begin{defn}
A pair $(H, D)$ is an \textbf{efficient linear compression} for Bern$(p)^n$ if
\begin{enumerate}
\item $H \in \mathbb{F}_2^{m \times n}$, $D \in \mathbb{F}_2^{n \times m}$, and $m \leq (H(p) + \epsilon)n$ (here $H(p)$ denotes the binary entropy function, not the matrix $H$)
\item We have Comp: $\mathbb{F}_2^n \to \mathbb{F}_2^m$ is a linear map given by $H$ and $D: \mathbb{F}_2^m \to \mathbb{F}_2^n$ is efficient; for $Z \in \mathbb{F}_2^n,$ Comp$(Z) = HZ$.
\item $\Pr_{Z \sim Bern(p)^n}[D(HZ) \neq Z] \leq \delta$ for some constant $\delta$
\end{enumerate}
\end{defn}
Given $(H, D)$ we will now construct a good code:
\begin{enumerate}
\item Let $G \in \mathbb{F}_2^{n \times (n-m)}$ be a full rank matrix orthogonal to $H$, so $HG = 0$.
\item Define encoding as Encoding$(m) = Gm$.
\end{enumerate}
\begin{center}
\begin{tikzpicture}[scale=1.5]
\draw (0,0) rectangle (1.5,2);
\node at (0.75,1) {$G$};
\draw (1.7,0.5) rectangle (2,2);
\node at (1.85,1.25) {$m$};
\draw[->] (2.4,1.2) -- (2.9,1.2);
\node[above] at (2.65,1.2) {Encoding};
\draw (3.3,0) rectangle (3.6,2);
\node at (3.45,1) {\scalebox{0.6}{$Gm$}};
\draw[decorate,decoration={brace,amplitude=5pt},xshift=-3pt,yshift=0pt] (0,0) -- (0,2);
\node at (-0.4,1) {$n$};
\draw[decorate,decoration={brace,amplitude=5pt},xshift=0pt,yshift=3pt] (0,2) -- (1.5,2);
\node at (0.75,2.4) {$k$};
\draw[decorate,decoration={brace,mirror,amplitude=5pt},xshift=3pt,yshift=0pt] (3.6,0) -- (3.6,2);
\node at (4,1) {$n$};
\end{tikzpicture}
\end{center}
Now how do we decode this? Suppose we have $Gm + Z$ (your encoded message plus Bernoulli independent noise, denoted by $Z$). Note that recovering $m$ is the same as recovering $Z$. To get $Z$, we multiply by $H$ to get
\[H(Gm + Z) = HGm + HZ = HZ\]
and apply the decompressor on $HZ$. You will usually get $Z$ (this happens with probability $1 - \delta$).
Everything in error correction boils down to compressing the error. The challenge from now on is to compress $n$ Bern$(p)$ bits to $(H(p) + \epsilon)n$ bits. The decoding time is $poly\left(\frac{n}{\epsilon}\right)$.
\end{document}