\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbf{1}} % \mathbbm requires the bbm package (not loaded); \mathbf{1} matches the indicator notation used in the proofs below
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Jan 29, 2019}{Instructor:
Madhu Sudan}{Scribe: Tristan Yang}{Lecture 7}
\section*{Administrative things}
\begin{itemize}
\item Noah Golowich is the new TF
\item Pset 2 and solutions to Pset 1 are out
\item NO office hours for Sudan this Thursday
\item Mitali's office hours are at 5pm on Tuesday and Thursday
\end{itemize}
\section*{Outline}
\begin{enumerate}
\item Converse Coding Theorems
\item Efficiency in Coding
\item Linear Coding and Linear Compression
\end{enumerate}
\section{Converse Coding Theorems}
\subsection{Review of Channel Coding}
Recall from last week: a general channel takes as input some \(X \in \Omega_X\) and outputs some \(Y \in \Omega_Y\). Its behavior is specified by \(P_{Y|X}\). We encode a message \(m \in \{0, 1\}^k\) with an encoding function \(E_n : \{0, 1\}^k \to \Omega_X^n\) and recover the decoded message \(\hat m\) with a decoding function \(D_n : \Omega_Y^n \to \{0, 1\}^k\). We define the rate \(R = k/n\).
We define the \textit{Capacity} of such a channel as the supremum of achievable rates: \[
\sup \{R : \text{for every } \epsilon > 0, \text{ communication of } Rn \text{ bits with } \epsilon\text{-error is possible during } n \text{ uses of the channel, for all sufficiently large } n\}
\]
We proved that by simply picking i.i.d.\ random codeword symbols \(E_n(m)_i \sim P_X\) over \((m, i)\) we can achieve: \[
R \geq \sup_{P_X} \{I(X; Y)\}
\]
This means that \(\sup_{P_X} \{I(X; Y)\}\) is a lower bound for the capacity. We now aim to prove that it is also an upper bound, and thus equal to the capacity.
\subsection{Capacity Upper Bound}
Let \(C_0 = \sup_{P_X}\{I(X; Y)\}\). We have the following theorem:
\begin{theorem}
For the Binary Symmetric Channel BSC\((p)\), for all \(\epsilon > 0\) there exists \(\delta > 0\) such that if rate \(R > C_0 + \epsilon\), \[
\Pr[\text{decoding error}] \geq 1 - \exp(-\delta n)
\]
\end{theorem}
\noindent However, instead of proving this we will focus on general channels and prove that the decoding error cannot be \(o(1)\):
\begin{theorem}
For all \(\epsilon > 0\) there exists \(\delta > 0\) such that if rate \(R > C_0 + \epsilon\) then \[
\Pr[\text{decoding error}] \geq \delta
\]
\end{theorem}
\begin{proof}
Consider the complete encoding/decoding process for message \(m\): \[
m \to X^n \to Y^n \to \hat m.
\]
Note that this is a Markov chain (e.g.\ \(\hat m \mid Y^n \perp m, X^n\): given \(Y^n\), the decoded message \(\hat m\) is independent of \(m\) and \(X^n\)). \\
Let \(\delta = \Pr[m \neq \hat m]\). We want to show that \(\delta > 0\).
Consider \(H(m|\hat m)\). We have that \[
nR = H(m) = H(m|\hat m) + I(\hat m ; m)
\]
The first equality \(nR = H(m)\) comes from the fact that we're considering a uniformly random message from \(\{0, 1\}^{nR}\). By the data processing inequality: \[
I(\hat m; m) \leq I(Y^n; m) \leq I(Y^n; X^n) \leq \sum_{i = 1}^n I(Y_i ; X^n, Y_1, \dots, Y_{i - 1})
\]
Now note that \[
(X^n, Y_1 \dots Y_{i - 1}) \to X_i \to Y_i
\]
is again a Markov chain, so \[
\sum_{i = 1}^n I(Y_i ; X^n, Y_1 \dots Y_{i - 1}) \leq \sum_{i = 1}^n I(Y_i ; X_i) \leq nC_0.
\]
(The last inequality comes from the fact that \(I(Y_i ; X_i) \leq C_0\) no matter the distribution of \(X_i\).) \\
To deal with the other term, we note that Fano's inequality implies that if \(\Pr[m \neq \hat m]\) is small, then \(H(m|\hat m)\) is small: \[
H(m|\hat m) \leq H(\mathbf{1}_{m \neq \hat m}) + \Pr[m \neq \hat m]\log(|\{0, 1\}^{nR}|) \leq 1 + \delta nR
\]
Thus we conclude \begin{align*}
(1 - \delta)nR \leq 1 + nC_0
&\implies \delta n R \geq n(R - C_0) - 1 \geq \epsilon n - 1 \\
&\implies \delta \geq \epsilon/R - 1/(nR). \qedhere
\end{align*}
\end{proof}
\section{Efficiency in Coding}
We've shown that a random encoding reaches the optimal bound, but from an algorithmic efficiency standpoint this is pretty bad. In practice, we want to consider the following:
\begin{enumerate}
\item Complexity of designing \(E_n\) and decoder (preprocessing)
\item Encoding time/space complexity
\item Decoding time/space complexity
\end{enumerate}
Analysis for random encoder:
\begin{enumerate}
\item Space complexity and randomized time complexity to construct \(E_n\) is of order \(2^{Rn}\) since there are \(2^{Rn}\) possible messages.
\item Encoding process has \(2^{Rn}\) space complexity to store the lookups. The time complexity is polynomial in \(n\).
\item Decoding process also has \(2^{Rn}\) space complexity. The time complexity is also exponential, since (maximum-likelihood) decoding compares the received word against all \(2^{Rn}\) codewords.
\end{enumerate}
To create a better algorithm, we can leverage the fact that the probability of decoding error in the above case was exponentially low, since we only require that the error approaches \(0\). We focus only on the case of the binary symmetric channel from now on. \\
The idea is to divide the \(k\)-bit sequence into chunks of length e.g.\ \(l = 10 \log k\) and then to apply Shannon's methodology independently to each chunk (encoding each block to length \(L = l/(C_0 - \epsilon)\)). Now the preprocessing cost and space (including randomness), as well as the encoding and decoding time/space complexities are of order \(\exp(L) = \operatorname{poly}(k)\). \\
We can use the union bound on the error probability \[
\Pr[\exists \text{ block which was decoded incorrectly}] \leq k\Pr[\text{fixed block is decoded incorrectly}].
\]
Since the latter probability is exponentially small in \(k\), this will go to \(0\). In practice, breaking up messages into chunks is used all the time e.g. in CDs.
\vspace{4mm}\\
There are still some issues with the above solution:
\begin{itemize}
\item The running time of the decoder is at least \(1/(\text{error probability})\).
\item Each block has to be big enough so that a bit flip is ``detectable'' to check for errors. Let \(\epsilon = C_0 - R\). We get that the length of each block must be at least \(1/\epsilon^2\). So even to get within 10\% of capacity, we would need blocks of length \(100\), which gives running time on the order of \(2^{100}\).
\end{itemize}
The first issue was resolved by ``Concatenated codes'' by Forney '66. The rough idea is that instead of taking the union bound over separate blocks, we use extra redundant encodings (``outer codes'') of a \(2^\delta\) fraction of the blocks to help correct errors. Thus instead of worrying about a single corruption, we worry about corruption of a \(\delta\) fraction and can use Chernoff bounds.\\
The second problem persisted until 2008, when a candidate solution appeared; that it achieves capacity with polynomial dependence on \(1/\epsilon\) was only proved in 2013. The solution uses \textit{Polar Codes}, which will be the focus of the next few lectures.
\section{Linear Coding}
In \textit{Linear coding}, the encoding map is linear over \(\F_2\): \[
E_n(m) = Gm
\]
where \(G\) is an \(n \times k\) matrix over \(\F_2\) (\(m\) has length \(k\)). To get a random linear encoding function we can simply pick \(G\) uniformly at random.
\begin{claim}
A random \textit{linear} encoding achieves capacity. \emph{In this case, two different messages still have independent encodings, which it turns out is sufficient.}
\end{claim}
\begin{exercise}
Prove the above claim.
\end{exercise}
\noindent Linear encoding has several benefits:
\begin{enumerate}
\item It only requires polynomial space.
\item It is likely to be injective: \textit{For all} \(m\), \(\Pr[\text{incorrect decoding}]\) is small.
\item Error detection is easy. Given \(x \in \F_2^n\), we can easily find out if there exists \(m\) such that \(x = Gm\).
\end{enumerate}
\begin{proposition}
For all full rank \(G \in \F_2^{n \times k}\), there exists full rank \(H \in \F_2^{m \times n}\) such that \(HG = 0\) where \(m = n - k\).
\end{proposition}
\begin{proof}
Straightforward linear algebra.
\end{proof}
Thus the point 3 above is equivalent to finding out if \(Hx = 0\). It turns out a good way of constructing a good \(G\) is to construct a good \(H\). This means that \textit{linear compression \(\implies\) linear coding.}
\subsection{Efficient Linear Compression for \texorpdfstring{$\operatorname{Bern}(p)^n$}{Bern(p)n}}
\begin{definition}
An \textbf{efficient linear compression} for \(\operatorname{Bern}(p)^n\) consists of a pair of linear maps \(H \in \F_2^{m \times n}\) and \(D \in \F_2^{n \times m}\) with \(m \leq (H(p) + \epsilon)n\) (here \(H(p)\) denotes the binary entropy function, not the matrix \(H\)). The efficient compression process maps \(Z\) to \(HZ\) and decompression maps \(HZ\) to \(D(HZ)\). In addition, we want \[
\Pr_{Z \sim \operatorname{Bern}(p)^n}[D(HZ) \neq Z] \leq \delta
\]
for some \(\delta\).
\end{definition}
\noindent This is sufficient for linear coding. We let \(G\) be the orthogonal complement of \(H\) such that \(HG = 0\), and encode \(m\) as \(Gm\). We receive \(Gm + Z\) where \(Z \sim \operatorname{Bern}(p)^n\). Recovering \(m\) is the same as recovering \(Z\), which we can do by multiplying by \(H\): \[
D(H(Gm + Z)) = D(HZ)
\]
This is equal to \(Z\) with probability \(1 - \delta\).
\vspace{2mm} \\
The challenge now is to compress \(n\) \(\operatorname{Bern}(p)\) bits to \((H(p) + \epsilon)n\) bits with decoding time polynomial in \(n/\epsilon\). This is equivalent to encoding to \(n_0 = \operatorname{poly}(1/\epsilon)\) bits with decoding time \(\operatorname{poly}(1/\epsilon)\).
\end{document}