\documentclass[10pt]{article}
\usepackage{amsfonts}
\usepackage{amsmath}
%\usepackage{mathtools}
%\usepackage{epsfig}
%\newcommand\myeq{\stackrel{\mathclap{\normalfont\mbox{\varepsilon}}}{\approx}}
\begin{document}
\input{preamble.tex}
\lecture{19}{Apr 5, 2016}{Madhu Sudan}{Themis Gouleakis}
%%%% body goes in here %%%%
\section{Preliminaries}
In order to continue the proof, we will need to use the following claims:
\begin{claim}\label{claim 1}
Let $E$ be an event in a probability space. Then,
\[ D(p(x\vert E)\Vert p(x))\leq \log (1/p(E)) \]
where $D(p\Vert q)$ denotes the Kullback-Leibler divergence.
\end{claim}
\begin{proof}
For the Kullback-Leibler divergence, we have that
\[ D(p(x\vert E)\Vert p(x))= \mathbb{E}_{x\sim p(x\vert E)} \log\frac{p(x\vert E)}{p(x)} \]
Conditioning on the event $E$, makes the probability mass outside of $E$ equal to $0$ and blows up the probability of each element in $E$ by $\frac{1}{p(E)}$. Thus, we get the result.
\end{proof}
By using convexity arguments, we get the following generalization of Claim~\ref{claim 1}:
\begin{claim}\label{claim 2}
Let $E$ be an event and $A,X$ be random variables with support of size $k$. Then,
\[\mathbb{E}_{A\vert E}[D(p(x\vert A,E)\Vert p(x))]\leq \log (k/p(E)) \]
\end{claim}
\begin{claim}\label{claim 3}
Let $E$ be an event and $U,A,X$ be random variables with support of size $k$. Then,
\[\mathbb{E}_{A,U\vert E}[D(p(x\vert A,U,E)\Vert p(x\vert U))]\leq \log (k/p(E)) \]
\end{claim}
\begin{claim}\label{claim 4}
Let $p(x,y)$ and $q(x,y)=q(x)\cdot q(y)$ (i.e.\ $q(x,y)$ defines a product measure) be two probability distributions. Then
\[
D(p(x,y)\Vert q(x,y))\geq D(p(x)\Vert q(x)) + D(p(y)\Vert q(y))
\]
\end{claim}
\section{Raz Lemma}
We will now see the proof of the parallel repetition theorem by Raz \cite{Raz98} presenting a simplified proof due to Holenstein \cite{Holenstein07}.
The theorem is stated as follows:
\begin{theorem}\label{thm}
Let $G$ be a $2$-prover-$1$-round game. If $val(G)<1-\alpha$, then \[val(G^t)<2^{-\Omega_{\alpha,k}(t)}\]
where $k$ is the number of possible responses for each player.
\end{theorem}
Recall here from the previous lecture that we use the notation $w_i$ for the event where $V(x_i, y_i, a_i, b_i) = 1$ and $w_S$ for the event where $V(x_i, y_i, a_i, b_i) = 1$ for all $i \in S \subset \{1, \ldots, t\}$.
The main ingredient for the proof of the above theorem is Raz's lemma \cite{Raz98}, which is stated below:
\begin{lemma}[Raz's Lemma]\label{Raz}
If ${\rm val}(G) < 1 - \alpha$ then there exists a constant $\gamma(k, \alpha)$ such that for all $S \subseteq \{1, \ldots, t\}$ with $|S| \le \gamma t$, either
\begin{enumerate}
\item ${\rm Pr}[w_S] \le 2^{-\gamma t}$
\item $\exists i \notin S$ such that ${\rm Pr}[w_i | w_S] \le 1 - {\alpha}/{2}$
\end{enumerate}
\end{lemma}
Note that the above lemma implies Theorem~\ref{thm} and the upper bound is $val(G^t)<\max\{2^{-\gamma t},(1-\frac{\alpha}{2})^{\gamma t}\}$. Roughly, $\gamma \sim \frac{\alpha^2}{\log k}$.
\begin{proof}
For the sake of contradiction we assume that the conclusion of the above lemma does not hold.
That is,
\[\exists S\subseteq \{1, \ldots, t\}: \vert S\vert < \gamma t\]
such that both:
\begin{enumerate}
\item $ \Pr[w_S]\geq 2^{-\gamma t} $
\item $ \forall i\not\in S: \Pr[w_i\vert w_S]>1-\frac{\alpha}{2}$
\end{enumerate}
Now, let $S$ be the last $t-r$ coordinates.
We would like to use the two above conditions in order to contradict the fact that $val(G)<1-\alpha$. However, the second condition above is about a distribution conditional on the event $W_S$, while $val(G)$ is the probability of winning a one-shot game, which is unconditional. So, our strategy would be to prove the existence of an index $i$ such that $\vert \Pr[w_i \vert w_S] - \Pr[w_i]\vert$ is sufficiently small.
Indeed, using Claim~\ref{claim 1} and the fact that $p(w_S)\geq 2^{-\gamma t}$, we get:
\[
D\left( p\left(
\begin{array}{l}
x_1 \dots x_r \\
y_1 \dots y_r
\end{array}
\,\middle|\, w_S \right) \,\middle\Vert\, p\left( \begin{array}{l}
x_1 \dots x_r \\
y_1 \dots y_r
\end{array} \right) \right) \leq \gamma t
\]
By Claim~\ref{claim 4}:
\[
\frac{1}{r}\sum_{i=1}^r D\left( p\left(
\begin{array}{l}
x_i \\
y_i
\end{array}
\,\middle|\, w_S \right) \,\middle\Vert\, p\left( \begin{array}{l}
x_i\\
y_i
\end{array} \right) \right) \leq \frac{\gamma t}{r}
\]
Since, by Pinsker's inequality, $D(p\Vert q)\geq |p-q|^2$ (up to a constant factor):
\[
\mathbb{E}_r \left\vert p\left(
\begin{array}{l}
x_i \\
y_i
\end{array}
\,\middle|\, w_S \right) - p\left( \begin{array}{l}
x_i\\
y_i
\end{array} \right) \right\vert^2 \leq \frac{\gamma t}{r}
\]
\[
\mathbb{E}_r \left\vert p\left(
\begin{array}{l}
x_i \\
y_i
\end{array}
\,\middle|\, w_S \right) - p\left( \begin{array}{l}
x_i\\
y_i
\end{array} \right) \right\vert \leq \sqrt{2 \gamma}
\]
So, there exists an index $i$ such that
\begin{equation}\label{cond}
\left\vert p\left(
\begin{array}{l}
x_i \\
y_i
\end{array}
\,\middle|\, w_S \right) - p\left( \begin{array}{l}
x_i\\
y_i
\end{array} \right) \right\vert \leq \sqrt{2 \gamma}
\end{equation}
We now want to argue that $val(G)>1-\alpha$ if the two conditions above hold. For that it would be helpful if $X^t,Y^t$ were independent. However, we have to condition on $w_S$ to prove this lemma, and unfortunately, $X^t \vert w_S, Y^t \vert w_S$ are not independent.
Our goal is to find an auxiliary random variable $U$ such that $X^t,Y^t$ become conditionally independent with respect to $w_S,U$.
The auxiliary random variable $U$ is defined as follows:
\[
U= \left(
\begin{array}{ll}
V_1 \dots V_r \; X_S\\
T_1 \dots T_r \; Y_S
\end{array}
\right)
\]
where
\[
V_i = \begin{cases}
0 & \text{w.p. } 1/2 \\
1 & \text{w.p. } 1/2
\end{cases}
\]
\[
T_j = \begin{cases}
X_j & \text{if } V_j=0 \\
Y_j & \text{if } V_j=1
\end{cases}
\]
We define $U_{-i}$ as follows:
\[
U_{-i} = \left(
\begin{array}{ll}
V_1 \dots V_{i-1} \; V_{i+1} \dots V_r \; X_S\\
T_1 \dots T_{i-1} \; T_{i+1} \dots T_r \; Y_S
\end{array}
\right)
\]
As we said, a desirable property we would like the random variable $U$ to have is that $X^t\perp Y^t \vert W_S,U,A_S$, where $A_S$ denotes the set of answers from Alice. %That means that ...
Indeed this is true for the above choice of $U$.
We will now show that Alice and Bob can use shared randomness in order to sample from the distribution $p(u,i,A_S\vert X_i=x,Y_i=y,W_S)$ without communicating. After doing that, they can sample privately the variables $X^t,Y^t$ conditioned on those variables ($U,W_S,A_S$) and return their answers $A_i,B_i$ for the $i$-th game. %Since the $X_t,Y_t$ are independent, their probability of success can be at most the value of an one-shot game: $\Pr[success]\leq val(G)<1-\alpha$.
We also have that:
\[
\Pr[success]=\Pr[W_i]= \mathbb{E}_i[\Pr[W_i]] \geq \mathbb{E}_i[\Pr[W_i\vert W_S]]-\sqrt{2 \gamma}\geq 1-\frac{\alpha}{2}-\sqrt{2 \gamma}>1-\alpha
\]
This is a contradiction: since $X^t,Y^t$ are independent, the probability of success can be at most the value of a one-shot game (for which $val(G)<1-\alpha$). That finishes the proof of Lemma~\ref{Raz}.
\end{proof}
It now remains to show that Alice and Bob can indeed sample from the distribution $p(u,i,A_S\vert X_i=x,Y_i=y,W_S)$ without communicating. So, we assume that they use their shared randomness so that Alice samples from $p(u,i,A_S\vert X_i=x,W_S)$ and Bob samples from $p(u,i,A_S\vert Y_i=y,W_S)$. Even though they sample from different distributions, we can show that the distributions are close enough so that Alice and Bob can use correlated sampling and get the same sample most of the time. More specifically, we will use the following two lemmas:
\begin{lemma}
There exists some $\gamma(\alpha,k )$ such that
\[
p(i,x_i,y_i)\cdot p(U_{-i},A_S\vert W_S,i,x_i)\stackrel{\varepsilon}{\approx} p(i,x_i,y_i,A_S,U_{-i}\vert W_S) \stackrel{\varepsilon}{\approx} p(i,x_i,y_i)\cdot p(U_{-i},A_S\vert W_S,i,y_i)
\]
where $p(x)\stackrel{\varepsilon}{\approx}q(x) \Leftrightarrow |p(x)-q(x)|\leq \varepsilon$ and $\varepsilon=(\alpha-\gamma)/10$ in our case.
\begin{proof}[sketch]
Using Claim~\ref{claim 3}, we can show that
\begin{align}
p(i,x_i,y_i,A_S,U_{-i}\vert W_S)&=p(A_S,U_{-i}\vert W_S)\cdot p(i,x_i,y_i\vert A_S,U_{-i},w_S) \\
&\stackrel{2\varepsilon}{\approx} p(i,x_i,A_S,U_{-i}\vert W_S)\cdot p(y_i \vert i, x_i)\\
&=p(i,x_i\vert w_S)\cdot p(U_{-i},A_S\vert w_S,i,x_i)\cdot p(y_i\vert i,x_i)\\
&\stackrel{\varepsilon}{\approx} p(i,x_i,y_i)\cdot p(U_{-i},A_S\vert w_S,i,x_i)
\end{align}
For the last step, we also used the fact that conditioning on $w_S$ does not change much, as Equation~\eqref{cond} suggests.
The second approximation:
\[
p(i,x_i,y_i,A_S,U_{-i}\vert W_S) \approx p(i,x_i,y_i)\cdot p(U_{-i},A_S\vert W_S,i,y_i)
\]
follows by symmetry.
\end{proof}
\end{lemma}
\begin{lemma}[correlated sampling]
There is a protocol for Alice and Bob to use shared randomness to sample a random variable such that Alice gets value $x \sim p$ % is distributed according to a distribution $p$,
and Bob value $y\sim q$ % is distributed according to a distribution $q$
and the probability that their values differ is: $\Pr[x\neq y]\leq 2\vert p-q\vert$.
\end{lemma}
\begin{proof}
Alice and Bob can use their shared randomness to sample an infinite sequence of tuples: $\{(x_i,\rho_i)\}$, where each $x_i$ is distributed uniformly on the sample space and each $\rho_i$ is a uniformly distributed real number in $[0,1]$. Alice will pick the $x_i$ with the smallest index $i$ such that $p(x_i)\geq \rho_i$, while Bob will pick the $x_i$ with the smallest index $i$ such that $q(x_i)\geq \rho_i$. It is easy to see that $\forall i,j: \frac{\Pr[\text{Alice picks } x_i]}{\Pr[\text{Alice picks } x_j]}=\frac{p(x_i)}{p(x_j)}$ and similarly for Bob. So, they sample exactly from the distributions $p,q$ respectively, and also the only way they get a different sample is if for some $\rho_i$ it holds that: $p(x_i)<\rho_i