\documentclass[10pt]{article}
\usepackage{amsfonts,amsthm,amsmath,amssymb}
\usepackage{array}
\usepackage{graphicx}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage[colorlinks = false]{hyperref}
\newcommand{\1}{\mathbbm{1}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\E}{\mathop{\mathbb{E}}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\eps}{\varepsilon}
\newcommand{\DTIME}{\textbf{DTIME}}
\renewcommand{\P}{\textbf{P}}
\newcommand{\SPACE}{\textbf{SPACE}}
\DeclareMathOperator{\CC}{CC}
\DeclareMathOperator{\CCpriv}{CC^{Priv}}
\DeclareMathOperator{\CCpub}{CC^{Pub}}
\DeclareMathOperator{\IP}{IP}
\DeclareMathOperator{\Disc}{Disc}
\usepackage{bbm}
\begin{document}
\input{preamble.tex}
\newtheorem{example}[theorem]{Example}
\theoremstyle{definition}
\newtheorem{defn}[theorem]{Definition}
\handout{CS 229r Information Theory in Computer Science}{Mar 5, 2019}{Instructor:
Madhu Sudan}{Scribe: Neekon Vafa}{Lecture 11}
\section{Bookkeeping}
\subsection{Outline}
Today: Communication Complexity
\begin{enumerate}
\item Upper Bounds
\item Lower Bounds for IP (Inner Product)
\begin{itemize}
\item Distributional Complexity
\item Discrepancy, Spectrum
\end{itemize}
\end{enumerate}
\subsection{Administrative Details}
\begin{itemize}
\item Problem set 3 is out, due Friday March 15
\item Professor Sudan has extra office hours this week on Friday from 1--3pm
\item List of topics for the project will come out shortly. Find partners! Use Piazza!
\end{itemize}
\section{Communication Complexity Review}
Recall that our model of communication is for Alice and Bob, given $x\in \{0,1\}^n$ and $y \in \{0,1\}^n$ respectively, to send binary strings to each other in rounds in order for Bob to compute $f : \{0,1\}^{2n} \to S$ on $(x,y)$, where $S$ is a finite set often chosen to be $\{0,1\}$. We can also add randomness to this model in two ways: either by {\it public} randomness, where a random string is available to both Alice and Bob simultaneously, or by {\it private} randomness, where a random string is available to Alice and not Bob and similarly a random string is available to Bob but not Alice.
As before, we have:
\begin{defn}[Communication Complexity]
We define the {\it communication complexity} of $f : \{ 0, 1\}^{2n} \to S$ to be
\begin{align*}
\CC(f) \triangleq \min_{\pi} \{ \# \text{ bits exchanged by } \pi \},
\end{align*}
where the $\min$ is taken over all protocols $\pi$ computing $f$. Similarly, we define the {\it private randomness communication complexity} of $f$ to be
\begin{align*}
\CCpriv(f) \triangleq \min_{\pi} \{ \# \text{ bits exchanged by } \pi \text{ with private randomness}\},
\end{align*}
and the {\it public randomness communication complexity} of $f$ to be
\begin{align*}
\CCpub(f) \triangleq \min_{\pi} \{ \# \text{ bits exchanged by } \pi \text{ with public randomness}\}.
\end{align*}
\end{defn}
Note that it's clear from these definitions that
\begin{align*}
\CCpub(f) \leq \CCpriv(f) \leq \CC(f),
\end{align*}
for any $f$. We also have the following inequalities in the other direction:
\begin{proposition}\label{prop:priv-pub}
For all $f : \{0,1\}^{2n} \to S$, we have $\CCpriv(f) \leq \CCpub(f) + O(\log(n))$.
\end{proposition}
\begin{proposition}\label{prop:cc-priv}
For all $f : \{0,1\}^{2n} \to S$, we have $\CC(f) \leq 2^{O(\CCpriv(f))}$.
\end{proposition}
Note that these two inequalities are tight for Equality$(x,y)$.
\section{Upper Bound Examples}
\subsection{Hamming Distance}
Consider the function
\begin{align*}
\text{HammingDist}_k(x,y) = \begin{cases} 1 & \text{if } \Delta(x,y) \leq k, \\ 0 &\text{if } \Delta(x,y) > k, \end{cases}
\end{align*}
for some parameter $k$, where $\Delta(x,y) = \# \{i : x_i \neq y_i\}$ is the Hamming distance between $x,y \in \{0,1\}^n$. It turns out that there is a $\Theta( k \log k + 1)$ bit protocol with shared randomness (does not depend on $n$). Today, we will see a $\Theta(k^2 + 1)$ bit protocol with shared randomness. Note that if $k = 0$, this is the Equality function, which we know has $\Theta(1)$ public randomness communication complexity, so this protocol is reasonably tight for small $k$.
\subsection{Small Set Disjointness}
Consider the Small Set Disjointness problem, where Alice gets $S \subseteq [n]$ and Bob gets $T \subseteq [n]$ (both represented as characteristic vectors) with the condition that $|S|, |T| \leq k$ for some parameter $k$. The goal is to output whether $S \cap T = \emptyset$. H{\aa}stad and Wigderson give a $\Theta(k)$ bit protocol, but we will see a $\Theta(k \log k)$ bit protocol today.
\subsection{Protocols using hash functions}
Both of these problems can be solved by protocols that publicly pick a completely random hash function $h : [n] \to [m]$, which can be shown to have the property that for all $W \subseteq [n]$ with $|W| \leq k$, we have
\begin{align*}
\Pr_{h} [\exists i \neq j \in W \text{ s.t. } h(i) = h(j) ] \leq \frac{1}{100}
\end{align*}
for some $m = O(k^2)$.
\begin{exercise}
Prove that a uniformly random function $h : [n] \to [m]$ satisfies the above property for some $m = O(k^2)$.
\end{exercise}
For Small Set Disjointness, we can apply this to $W = S \cup T$, and Alice can send $\{h(i)\}_{i \in S}$ to Bob, which takes $|S| \log m \leq O(k \log k)$ bits. Since $h$ has no collisions on $W = S \cup T$ with high probability, Bob can compare $\{h(i)\}_{i \in S}$ against $\{h(j)\}_{j \in T}$: a common hash value then corresponds exactly to a common element, so Bob can determine whether $S \cap T = \emptyset$ with high probability.
\\\\
For $\text{HammingDist}_k$, for all $j \in [m]$, Alice can compute
\begin{align*}
u_j = \bigoplus_{i \in h^{-1}(j)} x_i
\end{align*}
and send the message $\{u_j\}_{j \in [m]}$. Then, Bob can similarly compute
\begin{align*}
v_j = \bigoplus_{i \in h^{-1}(j)} y_i,
\end{align*}
and check whether $\Delta(u,v) \leq k$. If $\Delta(x,y) \leq k$, then the set of indices on which $x$ and $y$ differ is contained in some set $\{i_1, \dots, i_k\}$ of at most $k$ indices, which implies that $u$ and $v$ differ only on a subset of the indices $\{h(i_1), \dots, h(i_k)\}$, which implies $\Delta(u,v) \leq k$. If $\Delta(x,y) > k$, then one can show that $\Delta(u,v) > k$ with probability $\geq 2/3$, which completes the analysis of this $\Theta(k^2 + 1)$ bit protocol for $\text{HammingDist}_k$.
\subsection{``Distance'' problems in $\R^n$}
Here, Alice and Bob are given $x,y \in \R^n$ respectively with $\norm{x}_2 = \norm{y}_2 = 1$. First, consider the function
\begin{align*}
f(x,y) = \sum_{i=1}^n (x_i - y_i)
\end{align*}
where we allow an additive error of up to $\epsilon$.
\\\\
\begin{remark}
The requirement that $\norm{x}_2 = \norm{y}_2 = 1$ is only so that the error term $\epsilon$ makes sense, as otherwise, we could scale $x$ and $y$ up without any change in $\epsilon$, which would be too good to be true.
\end{remark}
For this function, the protocol is easy: Alice sends $(\sum x_i) \pm \epsilon$ in $O(\log 1/\epsilon)$ bits, and Bob can compute the rest.
What about the function
\begin{align*}
f(x,y) = \sum_{i=1}^n (x_i - y_i)^2
\end{align*}
with an additive error of up to $\epsilon$? Here, the cross-terms $x_i y_i$ cause us difficulty. However, with randomness, Alice and Bob can overcome this obstacle. Specifically, Alice can send $(\sum x_i^2) \pm \epsilon$, similar to before, and now she can also send $\sum R_i x_i$, where $R_1, \dots, R_n$ are ``bits'' drawn independently and uniformly from $\{-1, 1\}$. For Bob to decode this, note that
\begin{align*}
\E_{R} \left[ \left( \sum_{i} R_i x_i \right) \left( \sum_{j} R_j y_j \right) \right] &= \E_{R} \left[ \sum_{i} R_i^2 x_i y_i \right] + \E_{R} \left[ \sum_{i \neq j} R_i R_j x_i y_j \right]
\\&= \sum_{i} x_i y_i + 0,
\end{align*}
where the last equality comes from the fact that $R_i^2 = 1$ and $\E_R[R_i R_j] = 0$ for all $i \neq j$. Therefore, Bob can take $\left( \sum_{i} R_i x_i \right)$ from Alice and $\left( \sum_{j} R_j y_j \right)$ directly from his own input and multiply them to get an estimate for $\sum_{i} x_i y_i$. Given that Alice sends $\sum_i x_i^2$ and Bob can deduce $\sum_j y_j^2$, Bob can estimate $\sum_j (x_j - y_j)^2$. For this to work with high probability, we need to squash the variance of the random variable $\sum_{i \neq j} R_i R_j x_i y_j$. We can squash this variance successfully with $O(1/\epsilon^2)$ bits of communication. In fact, this is the best we can hope for:
\begin{exercise}
Prove that $\Omega(1/\epsilon^2)$ bits are required for any protocol to compute $f(x,y) = \sum_{i=1}^n (x_i - y_i)^2$ up to an additive error of $\epsilon$.
\end{exercise}
In summary, for the function $f(x,y) = (\sum_{i=1}^n (x_i - y_i)) \pm \epsilon$, there is a protocol that uses $O(\log 1/\epsilon)$ bits, but for the function $f(x,y) = (\sum_{i=1}^n (x_i - y_i)^2) \pm \epsilon$, the best protocol uses $\Theta( 1 / \epsilon^2)$ bits.
\section{Lower Bounds for Inner Product}
Recall that $\IP$ is defined as
\begin{align*}
\IP(x,y) \triangleq \sum_{i=1}^n x_i y_i \mod 2,
\end{align*}
for $x,y \in \{0,1\}^n$. How can we prove an $\Omega(n)$ lower bound on the communication complexity of $\IP$ with shared randomness? One avenue to pursue would be to look at the rank of the matrix $M_{\IP}$, but we saw for Equality that rank was not helpful in proving lower bounds for protocols with randomness. So, we need something new.
\subsection{Distributional Complexity}
{\bf Idea}: Put a distribution $\mu$ on $\{0,1\}^n \times \{0,1\}^n$. We can define
\begin{align*}
\delta_{\mu}(f, g) \triangleq \Pr_{(x,y) \sim \mu}[f(x,y) \neq g(x,y)]
\end{align*}
and
\begin{align*}
D_{\mu, \epsilon}(f) \triangleq \min_{g \text{ s.t. } \delta_\mu(f, g) \leq \epsilon} \CC(g).
\end{align*}
\subsection{Randomized Protocol $\implies$ Distributional Deterministic Protocol}
With this setup, we can prove distributional lower bounds by putting some distribution $\mu$ on $\{0,1\}^n \times \{0,1\}^n$, and prove that no deterministic protocol $\pi$ using $k$ bits achieves small error on $(x,y) \sim \mu$.
Why is this helpful?
\begin{proposition}\label{prop:rand-to-det}
For all functions $f : \{0,1\}^{2n} \to S$ and distributions $\mu$ over $\{0,1\}^{2n}$, we have
\begin{align*}
\CCpub(f) \geq \frac{D_{\mu, \epsilon}(f)}{O(\log 1/ \epsilon)}.
\end{align*} \end{proposition}
Thus, if we have a lower bound on $k$ for any {\it deterministic} protocol computing $f$ achieving small error for some distribution $\mu$, then we must have a lower bound for any {\it random} protocol with public randomness computing $f$.
\begin{proof}[Proof of Proposition \ref{prop:rand-to-det}]
Suppose we have some $k$-bit protocol $\pi$ that errs with probability less than $1/3$ for every $(x,y) \in \{0,1\}^{2n}$. By repeating this protocol $O(\log 1 / \epsilon)$ times and taking the majority of the outputs, we have a protocol $\tilde{\pi}$ using $O(k \log 1/ \epsilon)$ bits that errs with probability $\leq \epsilon$. That is, for all $(x,y)$, we have
\begin{align*}
\E_{R} [\1_{f(x,y) \neq \tilde{\pi}(x,y,R)} ] \leq \epsilon,
\end{align*}
where the randomness $R$ denotes the randomness of the protocol. Now, we can take the expectation over $\mu$ and switch the order to get
\begin{align*}
\epsilon &\geq \E_{(x,y) \sim \mu} \E_{R} [\1_{f(x,y) \neq \tilde{\pi}(x,y,R)} ]
\\&= \E_{R} \E_{(x,y) \sim \mu} [\1_{f(x,y) \neq \tilde{\pi}(x,y,R)} ] .
\end{align*}
This means that there exists some $R$ such that $\E_{(x,y) \sim \mu} [\1_{f(x,y) \neq \tilde{\pi}(x,y,R)} ] \leq \epsilon $, i.e. $\Pr_{(x,y) \sim \mu}[f(x,y) \neq \tilde{\pi}(x,y,R)] \leq \epsilon$. Now, we can hardcode $R$ into $\tilde{\pi}$ to get a {\it deterministic} protocol $\pi'$ using $O(k \log 1/ \epsilon)$ bits, where we have $\Pr_{(x,y) \sim \mu}[f(x,y) \neq \pi'(x,y)] \leq \epsilon$, i.e. $\delta_{\mu}(f, \pi') \leq \epsilon$, as desired.
\end{proof}
The idea here is that we can view randomized protocols as distributions over deterministic protocols.
\subsection{Discrepancy}
Now, we would like to show $D_{\mu, \epsilon}(\IP_n) \geq \Omega(n)$ for some distribution $\mu$, as from the proposition above, this would give an $\Omega(n)/ \log(1 / \epsilon)$ lower bound on the number of bits of any protocol computing $\IP_n$ with public randomness. In this case, thankfully choosing $\mu$ to be uniform will suffice, i.e. $\mu(x,y) = 4^{-n}$ for all $x,y \in \{0,1\}^n$.
Suppose $\pi$ is a protocol for $f$ using $k$ bits, with error probability $\leq \epsilon$ over $\mu$ (or equivalently, $D_{\mu, \epsilon}(f) \leq k$). Without loss of generality, we can assume that the final bit communicated by $\pi$ is the function value (as this adds at most 1 round and 1 bit). Considering the usual matrix $M_{\IP}$, we know that the $k$ bit protocol splits the matrix into $K =2^k$ rectangles $R_1, \dots, R_K$, where by a rectangle, we mean a Cartesian product $S \times T$ of some $S \subseteq \{0,1\}^n$ and $T \subseteq \{0,1\}^n$. Let $p_i$ denote the probability that $\pi$ is correct and ends up in rectangle $R_i$, and let $\epsilon_i$ denote the probability that $\pi$ is wrong and ends up in rectangle $R_i$. Then, we have
\begin{align*}
\sum_{i=1}^K p_i &\geq 1 - \epsilon,
\\\sum_{i=1}^K \epsilon_i &\leq \epsilon.
\end{align*}
Subtracting the second inequality from the first, we have $\sum_{i=1}^K p_i - \epsilon_i \geq 1 - 2 \epsilon$, which implies that for some $i \in [K]$, we have
\begin{align}\label{eq:some-rect}
p_i - \epsilon_i \geq \frac{1 - 2 \epsilon}{K} = \frac{1 - 2 \epsilon}{2^k}.
\end{align}
Now, we are ready for another definition. In addition to the matrix $M_f(x,y) = f(x,y) \in \{0,1\}$ as we saw in the last lecture, we can now define
\begin{align*}
M_{f, \mu}(x,y) \triangleq \mu(x,y) (-1)^{f(x,y)}.
\end{align*}
Translating equation \eqref{eq:some-rect} into this new notation, for rectangle $R_i$, which we can say is given by rectangle $S \times T$, we have
\begin{align*}
\left| \sum_{x,y \in \{0,1\}^n} \1_S(x) \1_T(y) M_{f, \mu}(x,y) \right| = |p_i - \epsilon_i| \geq \frac{1 - 2 \epsilon}{2^k}.
\end{align*}
This motivates the following definition:
\begin{defn}[Discrepancy]
We can define the {\it discrepancy} of $f$ with respect to $\mu$ to be
\begin{align*}
\Disc_\mu(f) \triangleq \max_{S, T \subseteq \{0,1\}^n} \left| \sum_{x,y} \1_S(x) \1_T(y) M_{f, \mu}(x,y) \right|.
\end{align*}
\end{defn}
We have just shown:
\begin{proposition}\label{prop:d-disc}
If $D_{\mu, \epsilon}(f) \leq k$, then we have
\begin{align*}
\Disc_\mu(f) \geq \frac{1 - 2 \epsilon}{2^k}.
\end{align*}
\end{proposition}
Our goal now is to show that $\Disc_\mu(\IP_n)$ is small, as this would imply $D_{\mu, \epsilon}(f)$ is big by (the contrapositive of) Proposition \ref{prop:d-disc}, which would imply that $\CCpub(f)$ is big by Proposition \ref{prop:rand-to-det}.
\subsection{Spectrum bounds Discrepancy}
We can bound $\Disc_\mu(\IP_n)$ directly, where we represent $S, T$ by characteristic column vectors $U, V \in \{0,1\}^{2^n}$. Recall that $\mu$ is uniform over $\{0,1\}^{2n}$. Then, we have
\begin{align}
\Disc_\mu(\IP_n) &= \max_{S, T \subseteq \{0,1\}^n} \left| \sum_{x,y} \1_S(x) \1_T(y) M_{\IP_n, \mu}(x,y) \right|
\\&= \max_{U, V \in \{0,1\}^{2^n}} \left| U^\top M_{\IP_n, \mu} V \right|
\\&\leq \max_{\substack{U, V \in \R^{2^n} \\ \norm{U}_2, \norm{V}_2 \leq 2^{n/2}}} \left| U^\top M_{\IP_n, \mu} V \right|
\\&= 2^n \max_{\substack{U, V \in \R^{2^n} \\ \norm{U}_2, \norm{V}_2 \leq 1}} \left| U^\top M_{\IP_n, \mu} V \right|
\\&= 2^n \lambda_{\max}(M_{\IP_n, \mu}).\label{eq:1}
\end{align}
Thankfully, $M_{\IP_n, \mu}$ has enough structure to make computing its maximum eigenvalue tractable. In fact,
\begin{exercise}
$M_{\IP_n, \mu_n} = (M_{\IP_1, \mu_1})^{\otimes n}$, where $\mu_i$ is uniform over $\{0,1\}^{i} \times \{0,1\}^i$.
\end{exercise}
\begin{corollary}
$\lambda_{\max}(M_{\IP_n, \mu_n}) = \left(\lambda_{\max}(M_{\IP_1, \mu_1}) \right)^n$.
\end{corollary}
We can explicitly write $M_{\IP_1, \mu_1}$ as
\begin{align*}
M_{\IP_1, \mu_1} &=
\begin{bmatrix}
1/4 & 1/4
\\1/4 & -1/4
\end{bmatrix}
\end{align*}
as $\mu_1(x,y) = 1/4$ for all inputs $(x,y)$, and $(-1)^{x y}$ is $-1$ if $x = y = 1$ and $1$ otherwise. A computation shows that $\lambda_{\max}(M_{\IP_1, \mu_1}) = 1/\sqrt{8}$, so $\lambda_{\max}(M_{\IP_n, \mu}) = (1/\sqrt{8})^n$. Thus, plugging back into \eqref{eq:1}, we get
\begin{align*}
\Disc_\mu(\IP_n) \leq 2^n \lambda_{\max}(M_{\IP_n, \mu}) = 2^{-n/2}.
\end{align*}
Thus, for $k = n/2- 1$ and $\epsilon < 1/4$, we can apply the contrapositive of Proposition \ref{prop:d-disc} to get that $D_{\mu, \epsilon}(\IP_n) \geq n/2 - 1$. For constant $\epsilon < 1/4$ and applying Proposition \ref{prop:rand-to-det}, we have $\CCpub(\IP_n) \geq \Omega(n)$.
\end{document}