\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 229r: Algorithms for Big Data } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\begin{document}
\lecture{14 --- Thursday, Oct 17 2013}{Fall 2013}{Prof.\ Jelani Nelson}{Aleksandar Makelov}
\section{Overview}
Today, we'll be looking at subspace embeddings, and how to use them to get fast algorithms for least squares regression. Next time we'll see how to use them for low-rank approximation as well.
\section{Subspace embeddings}
Recall from the end of last lecture that a \emph{subspace embedding} is a linear transformation that has the Johnson-Lindenstrauss property for all vectors in the subspace:
\begin{definition}
Given $W\subset\R^n$ a linear subspace and $\eps\in(0,1)$, an $\eps$-\textbf{subspace embedding} is a matrix $\Pi\in\R^{m\times n}$ for some $m$ such that
\begin{align*}
\forall x\in W: (1-\eps)\|x\|_2\leq \|\Pi x\|_2\leq (1+\eps)\|x\|_2
\end{align*}
\end{definition}
\begin{claim}
For any linear subspace $W\subset\R^n$ with $\dim W=d$, there exists a $0$-subspace embedding $\Pi\in\R^{d\times n}$, but no $\eps$-subspace embedding $\Pi\in\R^{m\times n}$ with $\eps<1$ if $m0$, contradiction.
\end{proof}
How can we find the orthogonal matrix used in the proof efficiently? Suppose $W$ is given to us in terms of a matrix $A\in\R^{n\times d}$ that spans its columns. Then recall from linear algebra that
\begin{theorem}[Singular value decomposition]
\label{thm:svd}
Every $A\in\R^{n\times d}$ has a ``singular value decomposition"
\begin{align*}
A=U\Sigma V^T
\end{align*}
where $U\in\R^{n\times r}$ has orthonormal columns, $r=\textup{rank}(A)$, $\Sigma\in\R^{r\times r}$ is diagonal with strictly positive entries on the diagonal, and $V\in\R^{d\times r}$ has orthonormal columns.
\end{theorem}
Imagine $W$ is the column span of $A$ with $r=d$. By completing $U$ to a square orthogonal matrix, $U^{-1}=U^T$ is the rotation we need. As for efficiently finding the SVD, we have
\begin{theorem}[Demmel, Dumitru, Holtz {\cite{DDH07}}]
In the setting of the previous theorem, we can approximate SVD well in time $\tilde{O}(nd^{\omega-1})$ where $\omega$ is the constant in the exponent of the complexity of matrix multiplication. Here the tilde hides logarithmic factors in $n$.
\end{theorem}
We shall assume henceforth in this lecture that we can in fact find the SVD exactly in $\tilde{O}(nd^{\omega-1})$ time (to avoid talking about numerical analysis).
\section{Least squares regression}
\subsection{Definition and motivation}
\begin{definition}
\label{prob:}
Suppose we're given $A\in\R^{n\times d}$, $b\in\R^n$ where $n\gg d$. We want to solve $Ax=b$; however, since the system is over-constrained, an exact solution does not exist in general. In the \textbf{least squares regression} problem, we instead want to solve the equation in a specific approximate sense: we want to compute
\begin{align*}
x^* = \textup{argmin}_x \|Ax-b\|
\end{align*}
\end{definition}
The choice of the function to be optimized is not arbitrary. For example, assume that we have some system, and one of its parameters is a linear function of $d$ other parameters. How can we find the coefficients of that linear function? In reality, we experimentally observe a linear function + some random error. Under certain assumptions - errors have mean 0, same variance, and are independent, then least squares regression is provably the best estimator out of a certain class of estimators (see the Gauss-Markov theorem).
\subsection{How do we solve least squares in general?}
What's the best $x$ to choose? Notice that $\{Ax: x\in \R^d\}$
is the column span of $A$. Then, the $x^*$ we need is the projection of $b$ on that column span. Let $A=U\Sigma V^T$ be the SVD of $A$. Then the projection of $b$ satisfies
\begin{align*}
\text{Proj}_{\textup{Col}(A)}b=UU^Tb
\end{align*}
hence we can set $x^*=V\Sigma^{-1}U^Tb$ since then we have $Ax^*=U\Sigma V^TV\Sigma^{-1}U^Tb=UU^Tb$. Thus, we can solve LSR in SVD$(n,d)$ time.
\subsection{Using subspace embeddings}
We want to be faster than the above algorithm. Let $\Pi$ be an $\eps$-subspace embedding for the span of $b$ and columns of $A$, which has dimension $\leq d+1$. We'll compute $\tilde{x}^*=\textup{argmin}_x\|\Pi Ax-\Pi b\|$, and want to show that $\tilde{x}^*$ is an approximate solution. Indeed,
\begin{align*}
(1-\eps)\|A\tilde{x}^*-b\|_2\leq\|\Pi A\tilde{x}^* - \Pi b\|_2\leq \|\Pi Ax^*-\Pi b\|_2\leq(1+\eps)\|Ax^*-b\|_2
\end{align*}
and thus $\|A\tilde{x}^*-b\|_2\leq\frac{1+\eps}{1-\eps}\|Ax^*-b\|_2$, so we are within a $1+O(\eps)$ factor of the optimal solution. Thus, provided we have an efficient subspace embedding, we might be able to have a fast approximate algorithm for least squares regression.
\subsection{The Fast JL Transform approach}
How can we get such an efficient $\eps$-subspace embedding? Recall that in Problem 1 of Problem set 5, we showed there is an $\eps$-subspace embedding $\Pi\in\R^{m\times n}$ with $m=O(d/\eps^2)$ that works with probability $1-1/e^{cd}$. This embedding may be very good, but getting there costs a lot: we have to multiply $\Pi A$, which can be done in $O(mnd^{\omega-2})$ (when we break the matrices in $d\times d$ blocks) time, which is worse than $O(nd^{\omega-1})$ --- the complexity of SVD on the original instance!
Sarlos \cite{sarlos2006improved} was the first to investigate how we can speed up the slow part of the algorithm discussed in the previous paragraph, and he used the fast Johnson-Lindenstrauss transform. The time to compute $\Pi x$ in FJLT is $O(n\log n + m^3)$, and $m\approx d$, ignoring $\eps$, so $\approx O(n\log n + d^3)$. So, $\Pi A$ can be found in time $O(nd\log n + d^4)$. We can use other results (Ailon, Liberty) on the FJLT to get rid of the additive $d^4$ for a slightly worse $m$.
\subsection{Sparse embedding approach}
Clarkson et al \cite{clarkson2013low} showed we can get $m=\frac{d\log^6(d/\eps)}{\eps^2}$ with a matrix $\Pi$ that has $s=1$ non-zeroes per column.
Mahoney and Meng \cite{meng2013low}, and Nelson and Nguy$\tilde{\hat{\mbox{e}}}$n \cite{nelson2013lower} show that $m=O(d^2/\eps^2)$ with $s=1$ suffices. A possible approach to show this is the following: observe that $\Pi$ being an $\eps$-subspace embedding for a $d$-dimensional linear subspace $W\subset\R^n$ spanned by the columns of a matrix $A=U\Sigma V^T$ in singular value decomposition is equivalent to $\Pi U^T \Pi U - I_d$ having small operator norm:\begin{align*}
\|\Pi U^T \Pi U - I_d\|\leq \eps
\end{align*}
up to changing $\eps$ by a factor of at most $2$. To show this,we first do some linear algebra review.
\subsubsection{Linear algebra review}
\begin{definition}
For a matrix $A$, the \bf{operator norm} is $\|A\|=\sup_{\|x\|=1} \|Ax\|$.
\end{definition}
\begin{claim}
If $A=U\Sigma V^T$ in SVD, the operator norm is the largest entry in $\Sigma$, and also equal to $\sqrt{\lambda_{max}(A^TA)}$. The vector realizing it is the corresponding column of $V$. The diagonal entries of $\Sigma$ are called the {\em singular values} of $A$, and the least singular value $\sigma_{min}(A)$ satisfies
$$
\sigma_{min}(A) = \inf_{\|x\| = 1} \|Ax\| .
$$
This inf is also realized by the appropriate column of $V$. Also,
\begin{align*}
\|A\| = \sup_{\|x\|=1}|x^T Ax|\text{ if }A\text{ is symmetric}
\end{align*}
\end{claim}
\subsubsection{Proving the equivalence.}
Applying the above facts to our case for the symmetric $\Pi U^T \Pi U - I_d$, if the operator norm is
\begin{align*}
\eps>\|\Pi U^T \Pi U - I_d\|=\sup_{\|x\|=1} |x^T(\Pi U)^T(\Pi U)x-x^Tx|
\\
= \sup_{\|x\|=1} \||\Pi Ux\|^2-1|
\end{align*}
then the lengths of all unit norm vectors in $W$ are preserved up to error $\eps$, and thus the lengths of all vectors in $W$ are preserved up to error $\eps$.
This might remind you of approximate matrix multiplication (AMM). Recall from last lecture that approximate matrix multiplication has a guarantee of the form
\begin{align*}
\Pr\left(\|\Pi U^T \Pi U - U^T U\|_F>\eps\|U\|_F^2\right)<\delta .
\end{align*}
Since the operator norm is at most the Frobenius norm, this implies
\begin{align*}
\Pr\left(\|\Pi U^T \Pi U - U^T U\|>\eps\|U\|_F^2\right)<\delta .
\end{align*}
By orthonormality of the columns of $U$, we have $\|U\|_F^2=d$. Hence, we can apply AMM with error $\eps/d$. Then the sparse Thorup-Zhang sketch \cite{TZ12} from Problem 3, Problem set 1 uses $m=O(d^2/\eps^2)$, so we're done. The fact that AMM already implies that the Thorup-Zhang sketch gives sparse subspace embeddings was observed by Huy L.\ Nguy$\tilde{\hat{\mbox{e}}}$n. $\Pi$ has one nonzero per column, so $\Pi A$ can be multiplied very quickly (in time linear in the number of non-zeroes of $A$); on the other hand, the SVD takes more time since we're doing it for $d^2/\eps^2$ (there are some tricks around this by applying the FJLT after the Thorup-Zhang sketch; see \cite{clarkson2013low}). It's also possible to get $m=d^{1.01}/\eps^2$ by increasing $s$ to $s=O(1/\eps)$ \cite{nelson2013lower}.
\section{Other ways to use subspace embeddings}
\subsection{Iterative algorithms}
This idea is due to Tygert and Rokhlin \cite{rokhlin2008fast} and Avron et al.\ \cite{avron2010blendenpik}. The idea is to use gradient descent. The performance of the latter depends on the \emph{condition number} of the matrix:
\begin{definition}
For a matrix $A$, the \textbf{condition number} of $A$ is the ratio of its largest and smallest singular values.
\end{definition}
Let $\Pi$ be a $1/4$ subspace embedding for the column span of $A$. Then let $\Pi A = U\Sigma V^T$ (SVD of $\Pi A$). Let $R=V\Sigma^{-1}$. Then by orthonormality of $U$
\begin{align*}
\forall x: \|x\| = \|\Pi ARx\|=(1\pm1/4)\|ARx\|
\end{align*}
which means $AR=\tilde{A}$ has a good condition number. Then our algorithm is the following
\begin{enumerate}
\item Pick $x^{(0)}$ such that
\begin{align*}
\|\tilde{A}x^{(0)}-b\|\leq 1.1\|\tilde{A}x^*-b\|
\end{align*}
(which we can get using the previously stated reduction to subspace embeddings with $\eps$ being constant).
\item Iteratively let $x^{(i+1)} = x^{(i)} + \tilde{A}^T(b-\tilde{A}x^{(i)})$ until some $x^{(n)}$ is obtained.
\end{enumerate}
We will give an analysis following that in \cite{clarkson2013low} (though analysis of gradient descent can be found in many standard textbooks). Observe that
\begin{align*}
\tilde{A}(x^{(i+1)}-x^*)=\tilde{A}(x^{(i)}+\tilde{A}^T(b-\tilde{A}x^{(i)})-x^*)=(\tilde{A}-\tilde{A}\tilde{A}^T\tilde{A})(x^{(i)}-x^*),
\end{align*}
where the last equality follows by expanding the RHS. Indeed, all terms vanish except for $\tilde{A}\tilde{A}^Tb$ vs $\tilde{A}\tilde{A}^T\tilde{A}x^*$, which are equal because $x^*$ is the optimal vector, which means that $x^*$ is the projection of $b$ onto the column span of $\tilde{A}$.
Now let $AR = U'\Sigma'V'^T$ in SVD, then
\begin{align*}
\|\tilde{A}(x^{(i+1)-x^*})\|&=\|(\tilde{A}-\tilde{A}\tilde{A}^T\tilde{A})(x^{(i)}-x^*)\|
\\
&=\|U'(\Sigma'-\Sigma'^3)V'^T(x^{(i)}-x^*)\|
\\
&=\|(I-\Sigma'^2)U'\Sigma'V'^T(x^{(i)}-x^*)\|
\\
&\leq\|I-\Sigma'^2\| \cdot \|U'\Sigma'V'^T(x^{(i)}-x^*)\|
\\
&=\|I-\Sigma'^2\| \cdot \|\tilde{A}(x^{(i)}-x^*\|
\\
&\leq \frac 12 \cdot \|\tilde{A}(x^{(i)}-x^*)\|
\end{align*}
by the fact that $\tilde{A}$ has a good condition number. So, $O(\log1/\eps)$ iterations suffice to bring down the error to $\eps$. In every iteration, we have to multiply by $AR$; multiplying by $A$ can be done in time proportional to the number of nonzero entries of $A$, $\|A\|_0$, and multiplication by $R$ in time proportional to $d^2$. So the dominant term in the time complexity is $\|A\|_0\log(1/\eps)$, plus the time to find the SVD.
\subsection{Yet another approach}
This approach is due to Sarl\'{o}s \cite{sarlos2006improved}. First, a bunch of notation: let
\begin{align*}
x^*&=\textup{argmin}\|Ax-b\|
\\
\tilde{x}^*&=\textup{argmin}\|\Pi Ax-\Pi b\|.
\\
A&=U\Sigma V^T\text{ in SVD}
\\
Ax^*&=U\alpha \text{ for }\alpha\in\R^d
\\
Ax^*-b&=-w
\\
A\tilde{x}^*-Ax^*&=U\beta
\end{align*}
Then, $OPT = \|w\|=\|Ax^*-b\|$. We have
\begin{align*}
\|A\tilde{x}^*-b\|^2&=\|A\tilde{x}^*-Ax^*+Ax^*-b\|^2
\\
&=\|A\tilde{x}^*-Ax^*\|^2+\|Ax^*-b\|^2\text{ (they are orthogonal)}
\\
&=\|A\tilde{x}^*-Ax^*\|^2+OPT^2=OPT^2+\|\beta\|^2
\end{align*}
We want $\|\beta\|^2\leq 2\eps OPT^2$. Since $\Pi A,\Pi U$ have same column span,
\begin{align*}
\Pi U(\alpha +\beta)&=\Pi A\tilde{x}^*=\textup{Proj}_{\Pi A}(\Pi b)=\textup{Proj}_{\Pi U}(\Pi b)
\\
&=\textup{Proj}_{\Pi U}(\Pi(U\alpha+w))=\Pi U\alpha + \textup{Proj}_{\Pi U}(\Pi w)
\end{align*}
so $\Pi U \beta= \text{Proj}_{\Pi U}(\Pi w)$, so $(\Pi U)^T(\Pi U)\beta = (\Pi U)^T\Pi w$. Now, let $\Pi$ be a $(1 - 1/\sqrt[4]{2})$-subspace embedding -- then $\Pi U$ has smallest singular value at least $1/\sqrt[4]{2}$. Therefore
\begin{align*}
\|\beta\|^2/2\leq \|(\Pi U)^T(\Pi U)\beta\|^2=\|(\Pi U)^T\Pi w\|^2
\end{align*}
Now suppose $\Pi$ also approximately preserves matrix multiplication. Notice that $w$ is orthogonal to the columns of $A$, so $U^Tw=0$. Then, by the general approximate matrix multiplication property,
\begin{align*}
\Pr_\Pi\left(\|(\Pi U)^T\Pi w-U^Tw\|_2^2>\eps'^2\|U\|_F^2\|w\|_2^2\right)<\delta
\end{align*}
We have $\|U\|_F = \sqrt{d}$, so set error parameter $\eps'=\sqrt{\eps/d}$ to get
\begin{align*}
\Pr\left(\|(\Pi U)^T\Pi w\|^2>\eps\|w\|^2\right)<\delta
\end{align*}
so $\|\beta\|^2\leq 2\eps \|w\|^2=2\eps OPT^2$, as we wanted.
So in conclusion, we don't need $\Pi$ to be an $\eps$-subspace embedding. Rather, it suffices to simply be a $c$-subspace embedding for some fixed constant $c = 1 - 1/\sqrt{2}$, while also providing approximate matrix multiplication with error $\sqrt{\eps/d}$. Thus for example using the Thorup-Zhang sketch, using this reduction we only need $m=O(d^2+d/\eps)$ and still $s=1$, as opposed to the first reduction in these lecture notes which needed $m = \Omega(d^2/\eps^2)$.
\bibliographystyle{alpha}
\begin{thebibliography}{42}
\bibitem{avron2010blendenpik}
Haim Avron and Petar Maymounkov and Sivan Toledo.
\newblock Blendenpik: Supercharging LAPACK's least-squares solver
\newblock {\em SIAM Journal on Scientific Computing}, 32(3) 1217--1236, 2010.
\bibitem{clarkson2013low}
Kenneth L. Clarkson and David P. Woodruff.
\newblock Low rank approximation and regression in input sparsity time
\newblock {\em Proceedings of the 45th Annual ACM Symposium on the Theory of Computing (STOC)}, 81--90, 2013.
\bibitem{DDH07}
James Demmel, Ioana Dumitriu, and Olga Holtz.
\newblock Fast linear algebra is stable.
\newblock {\em Numer. Math.}, 108(1):59–-91, 2007.
\bibitem{meng2013low}
Xiangrui Meng and Michael W. Mahoney.
\newblock Low-distortion subspace embeddings in input-sparsity time and applications to robust linear regression
\newblock {\em Proceedings of the 45th Annual ACM Symposium on the Theory of Computing (STOC)}, 91--100, 2013.
\bibitem{nelson2013lower}
Jelani Nelson and Huy L. Nguy$\tilde{\hat{\mbox{e}}}$n.
\newblock OSNAP: Faster numerical linear algebra algorithms via sparser subspace embeddings.
\newblock {\em Proceedings of the 54th Annual IEEE Symposium on Foundations of Computer Science (FOCS)}, 2013.
\bibitem{rokhlin2008fast}
Vladimir Rokhlin and Mark Tygert.
\newblock A fast randomized algorithm for overdetermined linear least-squares regression.
\newblock {\em Proceedings of the National Academy of Sciences}, 105 (36) 13212--13217, 2008.
\bibitem{sarlos2006improved}
Tamas Sarl\'{o}s.
\newblock Improved approximation algorithms for large matrices via random projections.
\newblock {\em 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS)}, 143--152, 2006.
\bibitem{TZ12}
Mikkel Thorup, Yin Zhang.
\newblock Tabulation-Based 5-Independent Hashing with Applications to Linear Probing and Second Moment Estimation.
\newblock {\em SIAM J. Comput.} 41(2): 293--331, 2012.
\end{thebibliography}
\end{document}