\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb,tikz,float}
\usepackage{array}
\usepackage{epsfig}
\usepackage{fullpage}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{April 2, 2019}{Instructor:
Madhu Sudan}{Scribe: Patrick Guo}{Lecture 17}
\section{Overview}
Today we will conclude that Information = Amortized Complexity. The main theorem we will show (from \cite{BR}) is that
\begin{theorem}[Information = Amortized Complexity]\label{thm1}
$$\frac{1}{n}CC^n_{\epsilon,\mu} (f)= IC_{\epsilon,\mu}(f)(1+o_n(1))$$
\end{theorem}
\noindent Along the way we will also introduce Interactive Correlated Sampling.
\vskip.1in
\noindent Administrative:
\begin{itemize}
\item Project presentations May 1, 9am -- 4pm, LISE 303
\item Project writeups due May 7
\end{itemize}
\section{Preliminaries}
Recall the following definition from last lecture:
\begin{definition}[Direct product of a function]
Given a function $f: X \times Y \to R$, its $n$-fold product is denoted by $f^{\otimes n}: X^n \times Y^n \to R^n$
$$
f^{\otimes n}(x_1,\dots,x_n,y_1,\dots,y_n) = (f(x_1,y_1), \dots, f(x_n,y_n))
$$
\end{definition}
We are interested in the communication complexity of $f^{\otimes n}$. Trivially we have $CC(f^{\otimes n}) \le n\cdot CC(f)$ since we can solve $f^{\otimes n}$ by running the communication for $f$ $n$ times in parallel. $CC(f^{\otimes n})$ is interesting because perhaps the problem is easier when we are asked about $n$ independent copies of the problem -- How can solving $f(x_1,y_1)$ help solve $f(x_2,y_2)$, etc.? We will see today that there is reason to believe the iterated version of $f$ is easier.
As a note, when we go from protocols on $f$ on input distribution $\mu$ to protocols on $f^{\otimes n}$, we implicitly go from working with $\mu$ supported on $X \times Y$ to working with a distribution $\mu^n$ on $X^n \times Y^n$, where all $n$ instances of the problem are generated independently of one another. We will just write $\mu$ to specify input distribution from now on, and it will be clear when we mean $\mu^n$.
Recall we also redefined error:
\begin{definition}
$f^{\otimes n}$ is solved by $\bar \Pi$ with error $\epsilon$ if for all $i$,
$$\Pr[f^{\otimes n}(\bar x, \bar y)_i = \bar \Pi(\bar x, \bar y)_i] \ge 1-\varepsilon$$
\end{definition}
\noindent This gives us the following definitions for communication and information complexity on the $n$-fold product:
\begin{definition}
$$CC^n_{\epsilon,\mu}(f) = \min_{\bar\Pi}\{CC(\bar\Pi)\}$$
$$IC^n_{\epsilon,\mu}(f) = \min_{\bar\Pi}\{IC(\bar\Pi)\}$$
where the minimums are taken over $\bar\Pi$ solving $f^{\otimes n}$ on input distribution $\mu^n$ with error $\epsilon$
\end{definition}
The motivation behind relaxing our definition of error is that otherwise a protocol erring with probability $\epsilon$ for $f$ when iterated on $n$ independent instances of the problem gives a protocol for $f^{\otimes n}$ erring with probability $1-(1-\epsilon)^n$, which is unideal. The relaxed definition is nice since it means an $\epsilon$-error protocol on $f$ leads to an $\epsilon$-error protocol on $f^{\otimes n}$, though it also means an $\epsilon$-error protocol on $f^{\otimes n}$ in actuality errs more often than $\epsilon$, but this is fine since we are mainly proving lower bounds.
\section{Information = Amortized Complexity}
We will make use of the following lemma:
\begin{lemma}
$$IC^n_{\epsilon,\mu}(f) = n\cdot IC^1_{\epsilon,\mu}(f) $$
\end{lemma}
\noindent Informally, this means that the information leaked by the best protocol for $f^{\otimes n}$ grows linearly in $n$, and will be used in the proof of Theorem \ref{thm1} to allow us to compare $CC^n_{\epsilon,\mu}(f)$ to $IC^n_{\epsilon,\mu}(f)$, as well as to show that some terms are lower order.
\begin{proof}[Proof of Lemma]
Note that $IC^n (f) \le n \cdot IC^1(f)$ is intuitively obvious, since we can solve $n$ copies of the problem by running the solution for 1 copy $n$ times in parallel, i.e. by using $\Pi^{\otimes n}$.
\begin{exercise}
Rigorously show
$$IC_\mu(\Pi^{\otimes n}) \le n\cdot IC_\mu(\Pi)$$
by expanding the definition of information complexity and applying the chain rule
\end{exercise}
\noindent Hence we are left with showing the other side of the inequality, $IC^n_{\epsilon,\mu} (f) \ge n \cdot IC^1_{\epsilon,\mu}(f)$, or equivalently,
$$
IC^1_{\epsilon,\mu}(f) \le \frac{1}{n} \cdot IC^n_{\epsilon,\mu}(f)
$$
Intuitively, we are trying to extract from a solution for $n$ copies a solution for $1$ copy which somehow compresses its information cost. To this end, we do simulation with the same embedding trick we've seen previously in this class for proving the communication complexity of \textsc{Disjointness} (embed the task of solving $1$ instance of the problem into a protocol that solves $n$).
\begin{figure}[H]
\begin{center}
\begin{tikzpicture}[scale=0.15]
\tikzstyle{every node}+=[inner sep=0pt]
\draw [black] (28.6,-17.5) circle (3);
\draw (28.6,-17.5) node {$u$};
\draw [black] (61.9,-17.5) circle (3);
\draw (61.9,-17.5) node {$v$};
\draw [white] (44.8,-17.5) circle (3);
\draw (44.8,-17.5) node {$(u,v)\sim\mu$};
\draw [black] (44.8,-8) circle (3);
\draw (44.8,-8) node {$R?$};
\draw [white] (28.5,-29.3) circle (5);
\draw (28.5,-29.3) node {$(x_1,\dots,x_n)$};
\draw [white] (61.9,-29.3) circle (5);
\draw (61.9,-29.3) node {$(y_1,\dots,y_n)$};
\draw [black] (45.1,-35.1) circle (3);
\draw (45.1,-35.1) node {$\bar \Pi$};
\draw [white] (44.9,-45.4) circle (3);
\draw (44.9,-45.4) node {$f^{\otimes n}(x_1,\dots,x_n,y_1,\dots,y_n)$};
\draw [white] (44.9,-55.2) circle (3);
\draw (44.9,-55.2) node {$f(u,v)$};
\draw [black] (49.8,-17.5) -- (58.9,-17.5);
\fill [black] (58.9,-17.5) -- (58.1,-17) -- (58.1,-18);
\draw [black] (39.3,-17.5) -- (31.6,-17.5);
\fill [black] (31.6,-17.5) -- (32.4,-18) -- (32.4,-17);
\draw [black] (47.42,-9.46) -- (59.28,-16.04);
\fill [black] (59.28,-16.04) -- (58.82,-15.22) -- (58.34,-16.09);
\draw [black] (42.21,-9.52) -- (31.19,-15.98);
\fill [black] (31.19,-15.98) -- (32.13,-16.01) -- (31.63,-15.15);
\draw [black] (31.33,-30.29) -- (42.27,-34.11);
\fill [black] (42.27,-34.11) -- (41.68,-33.37) -- (41.35,-34.32);
\draw [black] (59.06,-30.28) -- (47.94,-34.12);
\fill [black] (47.94,-34.12) -- (48.86,-34.33) -- (48.53,-33.39);
\draw [black] (45.04,-38.1) -- (44.96,-42.4);
\fill [black] (44.96,-42.4) -- (45.47,-41.61) -- (44.47,-41.59);
\draw [black] (44.9,-48.4) -- (44.9,-52.2);
\fill [black] (44.9,-52.2) -- (45.4,-51.4) -- (44.4,-51.4);
\draw (44.4,-50.3) node [left] {$?$};
\draw [black] (61.9,-20.5) -- (61.9,-26.3);
\fill [black] (61.9,-26.3) -- (62.4,-25.5) -- (61.4,-25.5);
\draw (61.4,-23.4) node [left] {$?$};
\draw [black] (28.57,-20.5) -- (28.53,-26.3);
\fill [black] (28.53,-26.3) -- (29.03,-25.5) -- (28.03,-25.5);
\draw (28.04,-23.4) node [left] {$?$};
\end{tikzpicture}
\caption{Extracting solution for $1$ instance of problem from a solution for $n$ instances through embedding}
\end{center}
\end{figure}
\noindent Like before, we need to determine the following:
\begin{itemize}
\item How to embed $u,v$ into $(x_1,\dots, x_n)$ and $(y_1,\dots, y_n)$, respectively
\item How to generate the rest of the inputs' coordinates
\item How to extract from $f^{\otimes n}(x_1,\dots,x_n,y_1,\dots,y_n)$ the value $f(u,v)$
\end{itemize}
\noindent Toward the first two points, we use shared randomness $R$. We simply sample $i \in [n]$ uniformly at random and set $x_i = u, y_i = v$. This is important since we don't know exactly at which coordinate in $\bar \Pi$ information is leaked, so a uniform $i$ ensures that we sum up equally over all possibilities to capture the information. Then the extraction is simply projection to the $i$th coordinate, or $f(u,v) = \left(f^{\otimes n}(x_1,\dots,x_n,y_1,\dots,y_n)\right)_i$.
Now the question is, how do we generate the remaining coordinates of $x,y$? We \textit{could} just sample them iid from their marginal distributions according to $\mu$, but the point is that we want to generate them from some joint, correlated distribution so that we can leverage information being leaked in the protocol for $f^{\otimes n}$, $\bar \Pi$. Thus, we sample with shared randomness $x_1,\dots, x_{i-1}, y_{i+1},\dots, y_{n}$ from their marginal distributions according to $\mu$. Then, to fill out the rest of the coordinates $x_{i+1},\dots, x_n, y_1,\dots, y_{i-1}$, the players use private randomness to sample $x_{i+1},\dots,x_n$ according to their conditional distribution conditioned on the public $y_{i+1},\dots, y_n$, and similarly for the other player to sample $y_1,\dots, y_{i-1}$.
\vskip.1in
\noindent As recap, our protocol $\Pi$ to solve $f(u,v)$ from a protocol $\bar \Pi$ for $f^{\otimes n}$ is
\begin{itemize}
\item Using shared randomness, sample $i$ uniformly at random from $[n]$, then sample $x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n$ from their marginal distributions according to $\mu$
\item Using private randomness, sample $x_{i+1},\dots, x_n, y_1,\dots, y_{i-1}$ according to their conditional distributions conditioned on $x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n$
\item Communicate according to $\bar \Pi$ to solve $f^{\otimes n}(x_1,\dots, x_n, y_1,\dots, y_n)$
\item Output $f(u,v) = \left(\bar \Pi(x_1,\dots,x_n,y_1,\dots,y_n)\right)_i$
\end{itemize}
Note that $\Pi$ and $\bar \Pi$ have the same communication.
Now we compute the information complexity of this protocol. We have
$$IC(\Pi) = I(u; \Pi | v, i, x_1, \dots, x_{i-1}, y_{i+1},\dots, y_n) + I(v; \Pi| u, i, x_1,\dots,x_{i-1},y_{i+1},\dots,y_n)$$
and we want to show $IC(\Pi) \le \frac{1}{n} IC(\bar\Pi)$, so we want to go from $I(u; \Pi | v, i, x_1, \dots, x_{i-1}, y_{i+1},\dots, y_n)$ to $\frac{1}{n} I(x_1,\dots,x_n;\Pi|y_1,\dots,y_n)$. Since we are given $i$, we can rewrite the former term as $I(x_i; \Pi | y_i, i, x_1, \dots, x_{i-1}, y_{i+1},\dots, y_n)$. A step we will need is
\begin{exercise}
Create a Markov chain to argue through conditional independence that
$$I(x_i; \Pi | y_i, i, x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n) =
I(x_i; \Pi | y_i, i, x_1,\dots, x_{i-1}, y_1,\dots,y_i,y_{i+1},\dots, y_n)$$
\end{exercise}
\noindent This is intuitively true since Bob generates $y_1, \dots, y_{i-1}$ given $x_1,\dots, x_{i-1}$, which are used only to generate the communications of $\bar \Pi$, so given $\Pi$, $y_1, \dots, y_{i-1}$ give no further information about $x_i$.
Hence, we can compute that
\begin{align*}
I(u; \Pi | v, i, x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n) &=
I(x_i; \Pi | y_i, i, x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n) \\
&=
I(x_i; \Pi | y_i, i, x_1,\dots, x_{i-1}, y_1,\dots,y_i,y_{i+1},\dots, y_n) \\
&= \frac{1}{n} \sum_{j=1}^n I(x_j; \Pi| x_1,\dots,x_{j-1}, y_1,\dots,y_n) \\
&= \frac{1}{n} I(x_1,\dots,x_n;\Pi|y_1,\dots,y_n)
\end{align*}
where the second to last equality comes from the fact that $i$ is uniform over $[n]$, and the last equality from chain rule, i.e. $I(A_1,\dots,A_n;B|C) = \sum_{i=1}^n I(A_i; B| C,A_1,\dots,A_{i-1})$
By a completely symmetrical argument we have
$$
I(v; \Pi | u, i, x_1,\dots, x_{i-1}, y_{i+1},\dots, y_n) = \frac{1}{n} I(y_1,\dots,y_n;\Pi|x_1,\dots,x_n)
$$
and putting them together gives $IC(\Pi) = \frac{1}{n}\, IC(\bar \Pi)$. Taking $\bar\Pi$ to be the protocol achieving $IC^n_{\epsilon,\mu}(f)$ (and noting that $\Pi$ solves $f$ on $\mu$ with error $\epsilon$), this shows $IC^1_{\epsilon,\mu}(f) \le \frac{1}{n} IC^n_{\epsilon,\mu}(f)$, which together with the exercise above yields
$$IC^n_{\epsilon,\mu}(f) = n\cdot IC^1_{\epsilon,\mu}(f) $$
as desired.
\end{proof}
Now back to the main theorem: recall that we are heading for Information = Amortized Complexity, or
$$\frac{1}{n}CC^n_{\epsilon,\mu} (f)= IC_{\epsilon,\mu}(f)(1+o_n(1))$$
What this means is that we are trying to compress a communication that has little information but lots of communication by using the fact that we have multiple independent instances of the problem. As an equivalent reformulation using our lemma, we want to show
\begin{align*}
CC^n_{\epsilon,\mu}(f) &\ge IC^n_{\epsilon,\mu}(f)(1\pm o(1)) \;\;\;\text{(obvious)} \\
CC^n_{\epsilon,\mu}(f) &\le IC^n_{\epsilon,\mu}(f)(1\pm o(1))
\end{align*}
The top inequality is straightforward, since for any working protocol, the amount of information conveyed cannot be more than the amount of bits sent in total.
Thus, we want to prove
$CC^n_{\epsilon,\mu}(f) \le IC^n_{\epsilon,\mu}(f)(1\pm o(1))$.
Specifically we will show
$$CC^n_{\epsilon,\mu}(f) \le IC^n_{\epsilon,\mu}(f) + O\left(\sqrt{IC^n_{\epsilon,\mu}(f)}\right) + o(1)$$
Again by our lemma, since the $n$-fold information cost is linear in $n$, we know that $\sqrt{IC^n_{\epsilon,\mu}(f)}$ is truly a lower order term.
Now, let $\Pi$ be a $k$-round protocol for $f(x,y)$ ($x$ being Alice's input, $y$ being Bob's) with communication $C$ and information $I$. We wish to compress this; specifically, we want to show existence of $\Pi'$ simulating $\Pi$ with communication $I+O(k\sqrt{I} + k\log\frac{k}{\epsilon})$. The idea is to compress each step $\Pi_i$ of the communication $\Pi = (\Pi_1,\dots, \Pi_k)$. Consider the first communication $\Pi_1$ from Alice to Bob. This is entirely a function of $x$, Alice's input, so the communication is exactly sampled from the distribution $\Pi_1 | x$; call this distribution $P$. Now, to compress this, we want to send just enough information for Bob to reconstruct $\Pi_1$ with small probability of error, and for this we need to know how much a priori knowledge Bob had about the distribution of $\Pi_1$. He only knows his input $y$, but if $y$ is correlated with $x$, then Bob can have some informed a priori estimate of $\Pi_1$ from his distribution $\Pi_1 | y$; call this distribution $Q$. As a toy example, suppose $P = Q$. Then with shared randomness Bob can simulate the entire communication on his own without any communication from Alice, so the communication needed (and information of the protocol) is $0$. In general, the closer $Q$ is to $P$, the less information is revealed by Alice's communication to Bob, and the less communication Bob needs to simulate the communication. This suggests that the amount of communication needed is related to the divergence between what Alice and Bob think $\Pi_1$ should be, and this is related to the information conveyed from Alice communicating her actual $\Pi_1$.
Thus, this brings us to the problem of ``Interactive Correlated Sampling'': Alice holds a distribution $P$ and Bob holds a distribution $Q$. We want just enough communication between Alice and Bob such that, if $P$ is supported on $\Omega$, then for all $a \in \Omega$, $$\Pr[Y=a | X=a] \ge 1-\epsilon$$
\begin{figure}[H]
\begin{center}
\begin{tikzpicture}[scale=0.2]
\tikzstyle{every node}+=[inner sep=0pt]
\draw [black] (28.6,-17.5) circle (3);
\draw (28.6,-17.5) node {$Alice$};
\draw [black] (61.9,-17.6) circle (3);
\draw (61.9,-17.6) node {$Bob$};
\draw [black] (44.8,-8) circle (3);
\draw (44.8,-8) node {$\epsilon$};
\draw [black] (44.8,-29.5) circle (3);
\draw (44.8,-29.5) node {$\Pi'$};
\draw [black] (38.8,-17.6) circle (3);
\draw (38.8,-17.6) node {$P$};
\draw [black] (50.7,-17.6) circle (3);
\draw (50.7,-17.6) node {$Q$};
\draw [black] (28.4,-38) circle (3);
\draw (28.4,-38) node {$X\sim P$};
\draw [black] (61.9,-38) circle (3);
\draw (61.9,-38) node {$Y$};
\draw [black] (47.42,-9.47) -- (59.28,-16.13);
\fill [black] (59.28,-16.13) -- (58.83,-15.3) -- (58.34,-16.18);
\draw [black] (42.21,-9.52) -- (31.19,-15.98);
\fill [black] (31.19,-15.98) -- (32.13,-16.01) -- (31.63,-15.15);
\draw [black] (35.8,-17.5) -- (31.6,-17.5);
\fill [black] (31.6,-17.5) -- (32.4,-18) -- (32.4,-17);
\draw [black] (53.7,-17.6) -- (58.9,-17.6);
\fill [black] (58.9,-17.6) -- (58.1,-17.1) -- (58.1,-18.1);
\draw [black] (41.877,-28.834) arc (-106.79943:-146.25828:21.728);
\fill [black] (41.88,-28.83) -- (41.26,-28.12) -- (40.97,-29.08);
\draw [black] (46.756,-27.227) arc (136.89865:112.76996:35.943);
\fill [black] (46.76,-27.23) -- (47.67,-26.98) -- (46.94,-26.3);
\draw [black] (31.352,-18.692) arc (64.32061:42.62168:38.034);
\fill [black] (31.35,-18.69) -- (31.86,-19.49) -- (32.29,-18.59);
\draw [black] (60.58,-20.291) arc (-30.69734:-79.63405:18.823);
\fill [black] (60.58,-20.29) -- (59.74,-20.72) -- (60.6,-21.23);
\draw [black] (28.57,-20.5) -- (28.43,-35);
\fill [black] (28.43,-35) -- (28.94,-34.21) -- (27.94,-34.2);
\draw [black] (61.9,-20.6) -- (61.9,-35);
\fill [black] (61.9,-35) -- (62.4,-34.2) -- (61.4,-34.2);
\end{tikzpicture}
\end{center}
\caption{Interactive Correlated Sampling where $\Pr[Y=a|X=a] \ge 1-\epsilon$}
\end{figure}
We will show that there exists protocol $\Pi'$ that achieves $\Pr[Y=a|X=a] \ge 1-\epsilon$ with $CC(\Pi') \le D(P||Q) + O(\sqrt{D(P||Q)}) + \log \frac{1}{\epsilon}$. The protocol $\Pi'$ uses a similar dartboard sampling method as we saw earlier in the class for compressed interactive sampling.
\begin{figure}[H]
\begin{center}
\includegraphics[scale=.7]{dartboard.png}
\caption{Dartboard sampling method for interactive correlated sampling. The horizontal axis is the discrete axis representing values in the support of $P$. The vertical axis ranges from $0$ to $1$, representing the probability of each value. Points $(x_i,a_i)$ are uniformly and independently sampled on the dartboard. $(x_3,a_3)$ is the first point under $P(x)$.}
\end{center}
\end{figure}
Alice's strategy is simple: she outputs $X=x_i$ where $a_i < P(x_i)$ and $i$ is as small as possible. Note that, since points $(x_i,a_i)$ are uniformly distributed on the board, they are uniformly distributed under the line $P(x)$ which is the PMF of $P$, so the first such point under the line (and in fact, any point under the line) has $x$ coordinate distributed according to $P$, so $x_i \sim P$ as desired. Note that with error probability exponentially small in $1/\epsilon$ we have $i < \frac{|\Omega|}{\epsilon}$, since the probability of any particular point lying under $P$ is $\frac{1}{|\Omega|}$. Hence, it suffices to sample $\frac{|\Omega|}{\epsilon}$ points $(x,a)$.
Now back to our toy example of $P=Q$ for some intuition. In this case, since the points $(x_i,a_i)$ are generated with shared randomness, Bob can simply do the same strategy, since he knows $Q$ which is equal to $P$, so he takes $Y$ to also be the $x$ coordinate of the first point under $Q$ and no communication is required.
Now suppose we know a constant $c$ such that $P \le cQ$ (if no such constant exists, divergence is infinity and the result holds trivially. We also remark that the argument works for the weaker condition that $P(x_i) \le c Q(x_i)$, since our argument only uses the fact that $x_i$ is under the curve $cQ$). Bob's candidate points for $X$ are then the $x$-coordinates of all points lying below his line $cQ$ (and we can consider just the points with index $j < \frac{|\Omega|}{\epsilon}$ since $i < \frac{|\Omega|}{\epsilon}$ with exponentially small error), and we need enough communication between Alice and Bob to determine exactly which candidate point is correct. To do this, we use shared hash functions $h_j: \Omega \to \{0,1\}$. In particular, Alice sends $O(\log (c/\epsilon))$ hash values of $x_i$, namely $(h_1(x_i),\dots,h_{O(\log (c/\epsilon))}(x_i))$; Bob computes the same hashes for all his candidate points, and then with error probability at most $\epsilon$ Bob identifies the correct point $x_i$.
So how do we determine $c$? We do a search, trying $C_t = 2^{t^2}$ for $t = 0, 1, 2, \dots$ (i.e. $1, 2, 16, 512, \dots$); each attempt gives us some candidate points, and as long as we don't get fooled by a wrong point (which hashing takes care of with the desired probability), once we try a large enough $C_t$, we are done.
\begin{algorithm}
\caption{Protocol for Interactive Correlated Sampling}\label{alg:pseudo}
\begin{algorithmic}[1]
\State Assume that $i < |\Omega|/\epsilon$
\For {$t = 0,1,2,3,\dots$}
\State Let $C_t = 2^{t^2}$, ``Hope that $P(x_i) < C_tQ(x_i)$''
\State Alice sends $\log C_t/\epsilon$ bits of hash of $x_i$ to Bob
\For {$j = 1,2,3,\dots, \frac{|\Omega|}{\epsilon}$}
\If {$a_j < C_tQ(x_j)$}
\If {Hashes of $x_j$ agree with Alice's message}
\State Bob sends message saying done
\State Break
\EndIf
\Else
\State Continue
\EndIf
\EndFor
\EndFor
\end{algorithmic}
\end{algorithm}
Note that the dominant term in the number of bits sent is exactly $E_{x_i \sim P}\left[\log \frac{P(x_i)}{Q(x_i)}\right]$, which is just the divergence between $P$ and $Q$.
\begin{exercise}
Formalize the argument's computations, i.e. show that the probability Bob identifies the wrong point is indeed bounded by $\epsilon$, and compute that the amount of communication done is $D(P||Q) + O(\sqrt{D(P||Q)}) + \log \frac{1}{\epsilon}$
\end{exercise}
This concludes compression based on divergence, and hence we have Information = Amortized Complexity. This result is particularly nice because it illustrates an operational view of divergence -- if we each hold our own distribution, and I want you to produce a sample from mine, the amount of communication required (up to some lower order terms) is equal to the divergence between our distributions.
\begin{thebibliography}{}
\bibitem{BR}
Braverman, Mark, and Anup Rao. "Information equals amortized communication." IEEE Transactions on Information Theory 60.10 (2014): 6058-6069.
\end{thebibliography}
\end{document}