\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{graphicx}
\usepackage{fullpage}
\usepackage{bbm}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\usepackage[shortlabels]{enumitem}
\usepackage{verbatim}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Feb 26, 2019}{Instructor:
Madhu Sudan}{Scribe: Vinh-Kha Le}{Lecture 9}
This lecture wraps up our discussion of polar codes. Recall from the previous lecture the polar encoding function $E_n$ as defined on strings of length $n = 2^t$. We first recursively define a pre-encoding function $\tilde{E}_n$. Given strings $U$ and $V$ each of length $n/2$, we take
\[
\tilde{E}_n(UV) = \tilde{E}_{n/2}(U \oplus V)\tilde{E}_{n/2}(V)
\]
and $\tilde{E}_1 = \id_{[2]}$. We visualize the pre-encoding step as a recursion tree of depth $t$, where at each step, we replace $U$ with $U \oplus V$ and $V$ with $V|U \oplus V$. Note that
\[
H(U) + H(V) = H(U, V) = H(U \oplus V, V) = H(U \oplus V) + H(V|U \oplus V).
\]
Given a string $Z \sim \mathrm{Bernoulli}(p)^n$ of i.i.d.\ bits, let $W = \tilde{E}_n(Z)$. Because at each step of the recursion $U \oplus V$ has a lower index than $V|U \oplus V$, it is consistent to consider each bit $W_j$ as conditioned upon all previous bits $W_{<j}$. We claim that the conditional entropies $H(W_j|W_{<j})$ polarize strongly: for all $c > 0$, there exists a $\beta < 1$ such that for all $t \in \mathbb{N}$,
\[
\mathbb{P}_{j \in [2^t]}[H(W_j|W_{<j}) \in (c^{-t}, 1 - c^{-t})] \leq O(\beta^t).
\]
We recast this statement in terms of a martingale. Choosing $j \in [2^t]$ uniformly at random corresponds to a uniformly random walk $(b_i)_{i = 1}^{t}$ down the recursion tree; let $X_i$ denote the conditional entropy of the tracked bit after $i$ steps of the walk, so that $X_t$ is distributed as $H(W_j|W_{<j})$ for uniformly random $j$. Strong polarization then says that for all $c > 0$, there exists a $\beta < 1$ such that for all $t \in \mathbb{N}$,
\[
\mathbb{P}_{(X_0, \ldots, X_t)}[X_t \in (c^{-t}, 1 - c^{-t})] \leq O(\beta^t).
\]
We first establish some important properties of $(X_i)_{i = 0}^{t}$.
\begin{exercise}
Show that $0 \leq X_i \leq 1$ for $0 \leq i \leq t$. A martingale is a sequence of random variables $X_i \in L^1(\Omega, \mathbb{P})$ such that
\[
\mathbb{E}[X_i|X_0, X_1, \ldots, X_{i - 1}] = X_{i - 1}.
\]
Use the invertibility of $(U, V) \mapsto (U \oplus V, V)$ and the chain rule for entropy to show that $(X_i)_{i = 0}^{t}$ is a martingale.
\end{exercise}
Surprisingly, these properties are not sufficient to show convergence.
\begin{exercise}
Find a bounded martingale that fails to converge.
\end{exercise}
To show that $X_i$ converges, we want to show that it has variance in the middle and suction at the ends. Variance in the middle, i.e., away from $0$ and $1$, means that for all $\tau > 0$, there exists a $\sigma > 0$ such that for all $i \in [t]$,
\[
X_{i - 1} \in (\tau, 1 - \tau) \text{ implies that } \operatorname{Var}[X_i|X_{i - 1}] \geq \sigma^2.
\]
Suction at the ends means that there exists a $\Theta > 0$ such that for all $c > 0$, there exists a $\tau > 0$ such that
\begin{align*}
X_{i - 1} \leq \tau &\text{ implies that } \mathbb{P}\left[X_i < \frac{X_{i - 1}}{c}\middle|X_{i - 1}\right] \geq \Theta \text{ and} \\
X_{i - 1} \geq 1 - \tau &\text{ implies that } \mathbb{P}\left[1 - X_i < \frac{1 - X_{i - 1}}{c}\middle|X_{i - 1}\right] \geq \Theta.
\end{align*}
Variance in the middle and suction at the ends are together called local polarization.
\begin{exercise}
Write down the distribution for $X_i|X_{i - 1}$.
\end{exercise}
We claim that $X_i$ polarizes locally. We will prove this in the next lecture. It suffices to show that local polarization implies strong polarization.
\begin{exercise}
Here we sketch a proof that local polarization implies strong polarization.
\begin{comment}
Extend the walks $(b_i)_{i = 1}^{t}$ to infinite binary sequences in $[2]^{\omega}$. Endow this space with the canonical probability measure generated by cylinder sets. Give a definition of $(X_i)_{i = 0}^{\infty}$ that agrees in distribution with our $(X_i)_{i = 0}^{t}$ under the truncation map.
\end{comment}
Suppose that $X_t$ is locally polarized. Let $\phi_t = \min\{\sqrt{X_t}, \sqrt{1 - X_t}\}$. Show that $\mathbb{E}[\phi_{t + 1}|\phi_t] \leq \alpha\phi_t$ for some $\alpha < 1$. Deduce via induction and Markov's inequality that $\mathbb{P}[\phi_t > \alpha^{t/2}] \leq \alpha^{t/2}$. Conclude that $X_t$ is strongly polarized by applying Doob's martingale inequality to $X_t$ for $t \in [t_0, 2t_0]$.
\end{exercise}
\begin{comment}
\begin{exercise}
Here we sketch an alternative proof of polarization. Extend the walks $(b_i)_{i = 1}^{t}$ to infinite binary sequences as before. Show that $(X_i)_{i = 0}^{\infty}$ is uniformly integrable. Deduce that it converges a.s. to a random variable $X_\infty$ for which $\mathbb{E}[X_\infty] = X_0$. Show that $X_{\infty}$ takes values in $\{0, 1\}$ as the set of fixed points of some transformation. Conclude that our truncated $X_i$ polarizes in some sense.
\end{exercise}
\end{comment}
\end{document}