\documentclass[11pt]{article}
\usepackage[latin9]{inputenc}
\usepackage{amsmath}
\usepackage{amssymb}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
\usepackage{amsthm}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 229r: Algorithms for Big Data } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{nelsonquotes}[theorem]{Quote}
\newtheorem{bootlemma}[theorem]{Bootstrapped Lemma}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\makeatother
\begin{document}
\lecture{10 --- October 3, 2013}{Fall 2013}{Prof.\ Jelani Nelson}{Yong Wook Kwon}
\section{Overview}
In the last lecture, we have used the concentration of Lipschitz functions of Gaussians and the decoupling lemma to prove Hanson-Wright inequality, which subsequently implied the distributional JL lemma, which then finally implied the Johnson-Lindenstrauss lemma.
The goal of this lecture is as follows:
\begin{itemize}
\item Prove Lipschitz-concentration and Decoupling lemma
\item Prove Alon's lower bound ($m = \Omega(\frac{1}{\eps^2 \log{\frac{1}{\eps}}}\log{N})$)
\item Try to circumvent Alon's lower bound by giving a more refined upper bound for JL which takes properties of the point set into account.
\end{itemize}
\section{Proof of Lipschitz Concentration}
We shall prove a slightly more general theorem that implies Lipschitz concentration. The theorem was first stated by Pisier {\cite{Pisier}}, but a more elegant proof, which is reproduced below, was later given by Maurey.
\begin{theorem}
Let $X = (x_1, x_2, \ldots, x_n)$ be i.i.d.\ $N(0, 1)$, $\Phi \colon \mathbb{R} \to \mathbb{R}$ convex, and $f\colon \mathbb{R}^n \to \mathbb{R}$ a function that has a gradient almost everywhere (i.e.\ except on a set of measure zero). Then, $\E_{X} \Phi(f(X) - \E f(X)) \le \E_{X, Y} \Phi(\frac{\pi}{2} \inprod{\nabla f(X), Y})$, where $Y = (y_1, \ldots, y_n)$ also has i.i.d.\ $N(0, 1)$ entries.
\end{theorem}
Note that this theorem implies Lipschitz concentration, if one chooses $\Phi(z) = |z|^p$. Then,
\[
\E_X |f(X) - \E f(X)|^p \le \left(\frac{\pi}{2}\right)^p \E_X \E_Y |\inprod{\nabla f(X), Y}|^{p}
\]
For a fixed $X$, the term inside the expectation is a linear combination of independent normal variables, so by the $2$-stability of the normal distribution it is distributed as $g \cdot \|\nabla f(X)\|_2$ for a standard normal $g$. Then, we use that the $p$-th moment of the normal is bounded above by $O(\sqrt{p})^p$ to obtain
\[
\left(\frac{\pi}{2}\right)^p \E_X \E_Y |\inprod{\nabla f(X), Y}|^{p} \le \left(\frac{c \pi}{2}\right)^{p} (\sqrt{p})^p \E_X\|\nabla f(X)\|^p_2
\]
But note that the $\ell_2$ norm of the gradient is at most the Lipschitz constant, since by moving an infinitesimal amount $\eps$ (in $\ell_2$ distance) in the direction of the gradient, we change $f$ by $\eps$ times the norm of the gradient. Thus, taking the $\frac{1}{p}$th power of both sides of the inequality gives Lipschitz concentration, as desired.
Let us now prove the original theorem.
\begin{proof}
\[
\E_X \Phi (f(X) - \E f(X)) = \E_X \Phi (\E_Y(f(X) - f(Y))) \le \E_{X, Y} \Phi(f(X)-f(Y)) \mbox{ (Jensen)}
\]
This technique of pulling out the expectation out of the convex function via Jensen and making the inner expression symmetric in terms of $X$ and $Y$ is a useful technique and usually called {\em symmetrization}.
But the proof only gets slicker from here.
Define $g(\theta) = X \sin(\theta) + Y \cos(\theta)$, and note $\frac{d}{d\theta} g(\theta) = X \cos( \theta) - Y \sin(\theta) \eqdef g'(\theta)$.
\begin{align*}
\E_{X, Y} \Phi(f(X)-f(Y))
{}&= \E_{X, Y} \Phi(f(g(\frac{\pi}{2})) -f(g(0))) \\
{}&= \E_{X, Y} \Phi (\int^{\frac{\pi}{2}}_0 \frac{d}{d\theta}(f(g(\theta))) d\theta) \mbox{ (fundamental theorem of calculus)}\\
{}&= \E_{X, Y} \Phi (\int^{\frac{\pi}{2}}_0 \inprod{\nabla f(g(\theta)), g'(\theta)} d\theta)\\
{}&= \E_{X, Y} \Phi(\int^{\frac{\pi}{2}}_0 \frac{2}{\pi} [ \frac{\pi}{2} \inprod{\nabla f(g(\theta)), g'(\theta)}] d\theta) \\
{}&= \E_{X, Y} \Phi(\E_{\theta \in [0, \frac{\pi}{2}]}(\frac{\pi}{2} \inprod{\nabla f(g(\theta)), g'(\theta)})) \\
{}& \le \E_{X, Y, \theta} \Phi (\frac{\pi}{2} \inprod{\nabla f(g(\theta)), g'(\theta)}) \mbox{ (Jensen)}\\
{}&= \E_{X, Y} \Phi(\frac{\pi}{2} \inprod{ \nabla f(X), Y}) \\
\end{align*}
which completes the proof. The last equality holds because for any fixed $\theta$, $[g(\theta), g'(\theta)]^T$ is formed by applying a two by two rotation matrix to $[X, Y]^T$. Thus since $X,Y$ are distributed as independent gaussians, then so are $g(\theta),g'(\theta)$ (drawing the point $(X,Y)$ in $\R^2$, we see that choosing two independent gaussians is equivalent to choosing a uniformly random point on a circle of radius $r$, where $r$ is chosen from the appropriate distribution; applying some fixed rotation still gives a uniformly random point on the circle).
\end{proof}
\section{Proof of De-coupling}
Recall the statement from {\cite{PenaGine}}: $\sigma_1,\sigma'_1, \ldots, \sigma_n,\sigma'_n$ are i.i.d.\ signs, $A = (a_{ij})$, then
\[
\|\sum_{i \neq j} a_{ij} \sigma_i\sigma_j \|_p \le 4 \|\sum_{i, j} a_{ij}\sigma_i\sigma'_j\|_p
\]
\begin{quotation}
``There are going to be lots of random variables in this proof'' - Jelani Nelson
\end{quotation}
Note that the $p$-norm here is the $p$-norm of a random variable, that is, $\|X\|_p = (\E|X|^p)^{\frac{1}{p}}$.
We will use the trick of inserting new random variables and pulling them out, in an opportune moment.
\begin{proof}
Let $y_1, \ldots, y_n \in \{0, 1\}$ be independent fair coin flips. Then,
\begin{align*}
\|\sum_{i \neq j} a_{ij} \sigma_i\sigma_j \|_p = 4 \|\E_{y} \sum_{i \neq j} a_{ij} \sigma_i y_i \sigma_j (1- y_j)\|_p
{}& \le 4 \|\sum_{j \neq i} a_{ij}\sigma_i y_i \sigma_j (1-y_j)\|_p \mbox{ (Jensen)}
\end{align*}
Since the inequality holds in expectation over $y$, it must hold for some fixed choice $y' \in \{0, 1\}^n$. Let $S = \{i \mid y'_i = 1\}$.
\[
\|\sum_{i \neq j} a_{ij} \sigma_i\sigma_j \|_p \le 4 \|\sum_{j \neq i} a_{ij}\sigma_i y'_i \sigma_j (1-y'_j)\|_p = 4 \|\sum_{i \in S} \sum_{j\notin S} a_{ij} \sigma_i \sigma_j\|_p
\]
But as the sets $S$ and $S^c$ are disjoint ($S^c$ representing the complement of $S$), the two groups can be viewed as separate, i.e.
\[
{}=4 \| \sum_{i\in S}\sum_{j \notin S} a_{ij} \sigma_i\sigma_j^{'} \|_p = 4 \|\E_{\sigma_{\bar{S}}, \sigma'_{S}} \sum_{i, j} a_{ij} \sigma_i \sigma_j^{'} \|_p
\]
where we added back the expectation of the missing terms, which is zero, and finally apply Jensen to obtain that the above is at most
\[
4 \|\sum_{i, j} a_{ij} \sigma_i \sigma_j^{'}\|_p
\]
\end{proof}
\section{Alon's Lower Bound}
\begin{theorem}[Alon {\cite{Alon}}] For every $N>1$ there exists a set of points $x_0,\ldots,x_N\in\R^N$ such that any embedding into $\ell_2^m$ with distortion at most $1+\eps$ for $\frac{1}{\sqrt{N}} < \eps <\frac{1}{2}$ must have $m = \Omega(\frac{1}{\eps^2 \log{\frac{1}{\eps}}}\log{N})$.
\end{theorem}
\begin{proof}
Let the points be $x_0 = 0$, and $x_i = e_i$ for $i>0$, and let $v_i$ be the image of $e_i$ in the $m$-dimensional space. Now consider $\Pi$, whose columns are $v_i$, and notice the following.
\begin{itemize}
\item $|1-\|v_i \|| < \eps$
\item $\|v_i - v_j\|^2 = \|v_i\|^2 + \|v_j\|^2 - 2\inprod{v_i, v_j} \implies |\inprod{v_i, v_j}| = O(\eps)$
\end{itemize}
Thus, another way of thinking about $\Pi$, once we rescale the $v_i$ to be unit vectors, is as an $\eps$-incoherent matrix; so this proof also gives a lower bound on the size of $\eps$-incoherent matrices.
Now note
\[
\Pi^{T} \Pi = \begin{bmatrix}1 & O(\eps) & \cdots & O(\eps) & O(\eps)\\
O(\eps) & 1 & \cdots & O(\eps) & O(\eps)\\
\vdots & \vdots & \ddots & \vdots & \vdots\\
O(\eps) & O(\eps) & \cdots & 1 & O(\eps)\\
O(\eps) & O(\eps) & \cdots & O(\eps) & 1
\end{bmatrix},
\]
or in other words, $\Pi^T \Pi$ is an $\eps$-near identity matrix, with rank at most $m$. We now use this lemma, also by Alon.
\begin{lemma}
Any $n \times n$ $\eps$-near identity matrix with rank $m$ must have $m \gtrsim \frac{1}{\eps^2 \log{\frac{1}{\eps}}}\log{n}$.
\end{lemma}
Clearly, the proof of Alon's lower bound follows immediately from the lemma. Proving this lemma requires another lemma.
\begin{lemma}
If $A$, an $\eps$-near identity, is symmetric (note that $\Pi^T \Pi$ is symmetric, so this is all we need), then $m \ge \frac{n}{1+ \eps^2(n-1)}$.
\end{lemma}
\begin{proof}
Note that as the matrix is symmetric, it has exactly $m$ non-zero eigenvalues (counted with multiplicity), $\lambda_1, \ldots, \lambda_m$. We notice the following.
\begin{itemize}
\item $\sum_i \lambda_i^2 = \|A\|_{F}^2 \le n + n(n-1)\eps^2$
\item $(\sum_i \lambda_i)^2 = (\mathrm{tr}(A))^2 = n^2$
\item (Cauchy-Schwarz) $(\sum_{i} \lambda_i)^2 \le m \sum_{i} \lambda_i^2$
\item $\implies n^2 \le m(n + n(n-1)\eps^2)$
\end{itemize}
Rearranging gives the desired inequality.
\end{proof}
We still do not have a good enough bound, however, so we bootstrap this lemma.
\begin{bootlemma}
Let $A = (a_{ij})$ be of rank $m$, and let $p\colon \mathbb{R} \to \mathbb{R}$ be a degree $k$ polynomial, where we define $p(A) = (p(a_{ij}))$. Then, $\mathrm{rank}(p(A)) \le \binom{m+k}{k}$.
\end{bootlemma}
\begin{proof}
Suppose $v_1, \ldots, v_m$ is a basis for the row space of $A$, so that the $i$-th row of $A$ can be written as $\sum_r \alpha_{i,r} v_r$, i.e.\ $a_{ij} = \sum_r \alpha_{i,r} v_{r,j}$. If $p(z) = \sum_{q =0}^{k} \beta_q z^q$, then $p(a_{ij}) = \sum_{q = 0}^{k} \beta_q (\sum_r \alpha_{i,r} v_{r, j})^{q}$. Expanding this expression, one can see that every row of $p(A)$ lies in the span of the vectors whose $t$-th entry ($1 \le t \le n$) is $\prod_{r=1}^m v_{r, t}^{d_r}$, where $\sum_r d_r \le k$. The total number of such vectors is $\binom{m+k}{k}$, by a well-known combinatorial identity (the number of ways to place at most $k$ balls into $m$ bins).
\end{proof}
Given this lemma, now set $k = \frac{1}{2} \frac{\log{n}}{\log{\frac{1}{\eps}}}$ (which is at least $1$ for $\eps > 1/\sqrt{n}$) and $p(z) = z^k$. Now note that $p(\Pi^T \Pi)$ has unit diagonal and off-diagonal entries of magnitude $O(\eps)^k = O(\frac{1}{\sqrt{n}})$, i.e.\ it is an $O(\frac{1}{\sqrt{n}})$-near identity, so the symmetric-case lemma applies with $\eps = O(\frac{1}{\sqrt{n}})$ and gives rank at least roughly $\frac{n}{2}$. Putting the previous two lemmas together, we have the following.
\[
\frac{n}{2} \le rank(p(\Pi^T \Pi)) \le \binom{m+k}{k}
\]
Note we have $\binom{a}{b} \le (e \cdot \frac{a}{b})^{b}$, so if we take the log of both sides, we obtain:
\[
\log{\frac{n}{2}} \le k \log{\frac{e(m+k)}{k}}
\]
Rearranging gives the desired proof of the lemma, and hence the proof of Alon's lower bound, as desired.
\end{proof}
\section{Improving the JL upper bound}
Recall: if $T$ is a set of unit $\ell_2$ vectors, then a random sign matrix $\Pi$ with $m \gtrsim \frac{\log{|T|}}{\eps^2}$ rows preserves all vectors in $T$ simultaneously up to $\eps$ error.
According to Gordon{\cite{Gordon}}, who proved this result for random gaussian matrices, and Klartag and Mendelson{\cite{KlartagMendelson05}}, who proved it for random sign matrices (and other matrices with independent subgaussian entries), the bound can be improved to the following:
\[
m \gtrsim \frac{g^2(T) + 1}{\eps^2}, g(T) = \E \sup_{x\in T} \inprod{g, x}
\]
where $g$ is a random gaussian vector. This is a generalization of the JL lemma, since for any $x\in T$ it holds that $\inprod{g,x}$ is a gaussian with unit variance. Thus
\[
\E \sup_{x\in T} \inprod{g,x} \eqdef \E \sup_{x\in T} g_x = \int_0^\infty \Pr(\sup_{x\in T} g_x > t) dt \le \sqrt{\log |T|} + \sum_{x\in T} \int_{\sqrt{\log |T|}}^\infty \Pr(g_x > t) dt
\]
is at most $O(\sqrt{\log |T|})$ (the last inequality was by the union bound), and thus $g^2(T) = O(\log |T|)$. However the bound can be much better than $O(\log|T|)$ depending on $T$, e.g.\ if the vectors in $T$ fall into a small number of well-clustered sets (in which case the union bound is suboptimal).
\begin{thebibliography}{1}
\bibitem{Pisier} Gilles Pisier. Probabilistic methods in the geometry of Banach spaces. {\em Probability and Analysis}, Varenna (Italy) 1985. {\em Lecture Notes in Math.}, 1206:167--241, 1986.
\bibitem{JohnsonL84}William B. Johnson and Joram Lindenstrauss. Extensions
of Lipschitz mappings into a Hilbert space. {\em Contemporary Mathematics}, 26:189--206, 1984.
\bibitem{Alon} Noga Alon. Problems and results in extremal combinatorics I. {\em Discrete Math.}, vol. 273, pp. 31\textendash{}53, 2003.
\bibitem{PenaGine} Victor Hugo de la Pe\~{n}a, Evarist Autor Gine. Decoupling: from dependence to independence. {\em Probability and its Applications} (New
York). Springer-Verlag, New York, 1999.
\bibitem{Gordon} Yehoram Gordon. On Milman's inequality and random subspaces which escape through a mesh in $\mathbb{R}^n$.
{\em Geom. Aspects of Funct. Anal.}, Israel seminar, Lecture Notes in Math., 1317, Springer-Verlag, 84--106, 1988.
\bibitem{KlartagMendelson05} Bo'az Klartag and Shahar Mendelson. Empirical processes and random projections. {\em J. Funct. Anal.}, vol.225, pp. 229\textendash{}245, 2005.
\end{thebibliography}
\end{document}