\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
%\usepackage{ecrc}
\usepackage{amsmath}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
%\usepackage[noend]{algpseudocode}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{tikz}
\usepackage{pgfplots}
%\usepackage{authblk}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{epstopdf}
\usepackage{mdwlist}
\usepackage{lipsum}% just to generate text for the example
\usepackage[]{algorithm2e}
\begin{document}
\input{preamble.tex}
\renewcommand{\binset}{\bbF_2}
\handout{CS 229r Essential Coding Theory, Lecture 16}{Mar 24, 2017}{Instructor: Madhu Sudan}{Scribes: Sai Qian Zhang}{Lecture 16 Polar Codes II}
%Hamming Codes, Distance, Examples, Limits, and Algorithms}
\section{Administrivia}
No Office hour today
\section{Agenda}
Today: Polar codes II
\begin{itemize}
\item Erratum
\item Review
\item Decoding
\item Polarization Speed
\end{itemize}
\section{Recap}
In the last lecture, we talked about polar codes, which give a linear compression scheme. We also proved the following claim:
\begin{claim}
Suppose $f_{H}:\{0,1\}^{n}\longrightarrow\{0,1\}^{m}$ and $f_{H}^{-1}:\{0,1\}^{m}\longrightarrow\{0,1\}^{n}$ are such that
\begin{itemize}
\item $f_{H}$ is linear, i.e. $f_{H}(x)=xH$
\item For $x\sim Bern(p)^{n}, f^{-1}_{H}(f_{H}(x))=x$
\end{itemize}
then $H$ is the parity check matrix of a code correcting a $p$-fraction of random errors.
\end{claim}
Recall that, given $x\sim Bern(p)^{n}$ and $Y=f(x)=xP$, where $P$ is invertible, we want to split $Y=\{Y_{A},Y_{B}\}$ such that
$Y_{A}$ contains the bits which have high entropy, and $Y_{B}$ contains the bits which have low entropy, which means that we can take a subset of the output, $Y_{A}$, and infer the rest ($Y_{B}$) from $Y_{A}$. If this is true, we can build the matrix $H$ by permuting the columns of $P$ such that $xH$ corresponds to the bits in $Y_{A}$. We also discussed the procedure for polarization.
This lecture, we finish the proof left from the last lecture and analyze the speed of polarization.
\section{Erratum from last lecture}
In the last lecture, we talked about the procedure of polarization, which is shown in Figure 1. However, the correct procedure of polarization is shown in Figure 2.
\begin{figure}[tp!]
\includegraphics[width=7cm,height=3cm]{1.png}
\centering
\caption[U1]{Wrong Polarization procedure}
\end{figure}
\begin{figure}[tp!]
\includegraphics[width=7cm,height=3cm]{2.png}
\centering
\caption[U1]{Correct Polarization procedure}
\end{figure}
\begin{figure}[tp!]
\includegraphics[width=7cm,height=3cm]{3.png}
\centering
\caption[U1]{Example}
\end{figure}
Consider one step of polarization, shown in Figure 3. In the intermediate stage of the calculation, we have a variable $A$, conditioned on some $C$, where $C$ is a linear combination of the input. Similarly, we have a variable $B$, conditioned on some $D$, which is another linear combination of the input. Since entropy is preserved, we have the following equality:
\begin{equation}
H(A\oplus B|C,D)+H(B|A\oplus B,C,D)=H(A|C)+H(B|D)
\end{equation}
If we use the polarization procedure shown in Figure 2, we have $D\perp(A,C)$ and $C\perp(B,D)$. Hence $H(A|C) = H(A|C,D)$ and $H(B|D) = H(B|C,D)$, so $H(A\oplus B|C,D)+H(B|A\oplus B,C,D)=H(A|C,D)+H(B|C,D)$, which is exactly what the chain rule requires.
In general, we have the polarization procedure shown in Figure 4.
\begin{figure}[tp!]
\includegraphics[width=7cm,height=5cm]{4.png}
\centering
\caption[U1]{General Polarization procedure}
\end{figure}
\section{Decoding Algorithm}
The decoding algorithm is described below:
\begin{algorithm}[H]
Input: $y_{i\in A}$, where $A$ is the set such that $\forall i\notin A, H(Y_{i}|Y_{1},...,Y_{i-1})\leq \sigma$ ($\sigma$ is small), matrix P, $x_{1},...,x_{n}\sim Bern(p)^{n}$ \\
Output: $\hat{y}_{i\notin A}$, which is the most likely $y_{i}$ for $i\notin A$\\
\For{i=1,...,n}{
\eIf{i $\in$ A}{
$\hat{y_{i}} = y_{i}$
}{
Compute $\alpha_{i}=P(Y_{i}=1|y_{1},...,y_{i-1})$\\
\eIf{$\alpha_{i}>0.5$}{$\hat{y_{i}}=1$}{$\hat{y_{i}}=0$}
}
}
\caption{Decompression Algorithm}
\end{algorithm}
\subsection{Accuracy of decoding}
Given that $H(Y_{i}|Y_{1},...,Y_{i-1})\leq \sigma$, we have $E_{Y_{1},...,Y_{i-1}}(H(Y_{i}|Y_{1},...,Y_{i-1}))\leq \sigma$. By Markov's inequality, we have $P(H(Y_{i}|y_{1},...,y_{i-1})>\sqrt{\sigma})<\sqrt{\sigma}\Longrightarrow P(Y_{i}=mode(Y_{i})|y_{1},...,y_{i-1})>1-\sqrt{\sigma} \Longrightarrow P(Y_{i}\neq \hat{y_{i}}|y_{1},...,y_{i-1})<\sqrt{\sigma}$. Therefore, by the union bound, the total error probability is at most $2N\sqrt{\sigma}$, where $N$ is the total number of inputs.
\subsection{How to efficiently compute $P(Y_{i}=1|y_{1},...,y_{i-1})$ }
We need to find an efficient way to calculate $P(Y_{i}=1|y_{1},...,y_{i-1})$, which is used in the decoding algorithm above. Next we propose a recursive algorithm to calculate this probability.
Consider the block shown in Figure 5. Let $x_{1},...,x_{n}$ with $x_{i}\sim Bern(p_{i})$ be the input of this block; because this is an intermediate stage of the polarization procedure, $P(x_{i}=1)$ may differ for each $i$. The recursion is described in Algorithm 2:
\begin{figure}[tp!]
\includegraphics[width=7cm,height=4.5cm]{5.png}
\centering
\caption[U1]{Polarization}
\end{figure}
\begin{algorithm}[H]
Input: $y_{A},p_{1},...,p_{n}$ \\
Output: $P(Y_{i}=1|y_{1},...,y_{i-1})$, for $1\leq i\leq A $\\
Let $A+ = A\cap [\frac{n}{2}], A- = A-A+$\\
\For{$i=1,...,\frac{n}{2}$}{ $q_{i}=p_{i}(1-p_{\frac{n}{2}+i})+p_{\frac{n}{2}+i}(1-p_{i})$}
$(z_{1},...,z_{\frac{n}{2}})$=Decode($y_{A+},q_{1},...,q_{\frac{n}{2}}$)\\
\For{$i=1,...,\frac{n}{2}$}{ $r_{i}=P(b=1|a \sim Bern(p_{i}),b \sim Bern(p_{i+\frac{n}{2}}), a\oplus b = z_{i})$}
$(y_{1},...,y_{\frac{n}{2}})$=Decode($y_{A-},r_{1},...,r_{\frac{n}{2}}$)\\
Output $(z\oplus y,y)$
\caption{Decode($y_{A},p_{1},...,p_{n}$)}
\end{algorithm}
The complexity of this recursion is $\mathcal{O}(N\log{}N)$.
\section{Speed of Polarization}
Theorem 2 provides a description on the degree of polarization and the number of polarization steps.
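\begin{theorem}
% NOTE(review): the end of this statement (everything after "0 < p") was lost in the source.
% The range of p and the conclusion below are reconstructed to match the strong one-sided
% polarization lemma later in this section -- confirm against the lecture.
$\exists$ polynomial $K$, s.t. $\forall \epsilon>0$, $0<p<\frac{1}{2}$, if $N\geq K(\frac{1}{\epsilon})$ and $[y_{1},...,y_{n}]^{T}=P_{N}[x_{1},...,x_{n}]^{T}$, $x_{1},...,x_{n}\sim Bern(p)$, we have $|\{i|H(Y_{i}|Y_{1},...,Y_{i-1})\leq \frac{1}{N^{3}} \}|\geq (1-H(p)-\epsilon) N$
\end{theorem}
\begin{lemma}
$\exists$ polynomial $K$, s.t. $\forall \epsilon>0$, if $N=K(\frac{1}{\epsilon}) $ and $[y_{1},...,y_{n}]^{T}=P_{N}[x_{1},...,x_{n}]^{T}$, $x_{1},...,x_{n}\sim Bern(p)$, we have $|\{i|H(Y_{i}|Y_{1},...,Y_{i-1})\in (\epsilon,1-\epsilon) \}|\leq \epsilon N$
\end{lemma}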
Lemma 3 claims that after $l$ steps of polarization, the number of conditional entropies that are still unpolarized (i.e., lie within the interval $(\epsilon,1-\epsilon)$) is small ($\leq \epsilon N$). However, this bound is weak because $K$ can be any polynomial, which we cannot control. For instance, when $\epsilon = \frac{1}{N^{0.01}}$, the interval becomes $(\frac{1}{N^{0.01}}, 1-\frac{1}{N^{0.01}})$, so the entropies outside it are only guaranteed to be within $\frac{1}{N^{0.01}}$ of $0$ or $1$, which is far too weak for the union bound in the decoding analysis.
\begin{lemma}
(strong, one-sided polarization) $\exists$ polynomials $K_{1},K_{2}$ s.t. $\forall \epsilon>0$, if $p\leq K_{1}(\epsilon)$ and $N\geq K_{2}(\frac{1}{\epsilon})$, then
$|\{i|H(Y_{i}|Y_{1},...,Y_{i-1})\leq \frac{1}{N^{3}} \}|\geq (1-H(p)-\epsilon) N$
\end{lemma}
Lemma 4 claims that the number of outputs with small entropy is very large.
Lemma 4 is proven using the following idea. Consider one step of polarization and assume $x_{0},x_{1}\sim Bern(p)$. After polarization, we have $x_{0}\oplus x_{1}\sim Bern(p^{+})$ and $x_{0}|x_{0}\oplus x_{1} \sim Bern(p^{-})$, where $p^{+} = 2p(1-p)$ and $p^{-} = h^{-1}(2h(p)-h(2p(1-p)))$, with $h(p)=p\log(\frac{1}{p})+(1-p)\log(\frac{1}{1-p})$. If $p$ is very small, we have $h(p)\approx p\log(\frac{1}{p})\Longrightarrow h(p^{-}) = 2h(p)-h(p^{+}) \approx 2p \Longrightarrow p^{-}\approx \frac{p}{\log\frac{1}{p}}$ (up to a constant factor). Now assume $p< 2^{-10}$. After one step of polarization, $p^{+}$ is roughly double $p$ and $p^{-}$ is roughly one tenth of $p$ or less, so there is a drift towards the negative direction.
This is illustrated in Figure 6. As long as $\log(p) < \log({p_0})$, each time $\log(p)$ will either increase by 2 or decrease by 10. If $p>p_{0}$, $p$ will escape from this process and become large. However, the probability that $p$ hits $p_{0}$ is approximately equal to $poly(p)$.
\begin{figure}[tp!]
\includegraphics[width=7cm,height=4.5cm]{6.png}
\centering
\caption[U1]{Random Walk}
\end{figure}
Similarly, Lemma 3 can be proven using Claim 5. Denote $p^{+} = 2p(1-p)$ and $p^{-}=h^{-1}(2h(p)-h(p^{+}))$, and define the potential function to be $\phi(p)=\sqrt{h(p)(1-h(p))}$.
\begin{claim}
$\exists \Lambda<1$ s.t. $\forall 0