\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{url}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
% \handout{lecture-id}{date}{lecturer}{scribe-line}{title}:
% typesets the framed header box at the top of the notes.
% Note: #1 is not used in the body; it is kept in the signature for the
% \lecture wrapper defined below.
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
% Row 1: course name flush left, date (#2) flush right.
\hbox to 5.78in { {\bf CS 229r: Algorithms for Big Data } \hfill #2 }
\vspace{4mm}
% Row 2: centered large title (#5).
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
% Row 3: lecturer (#3) flush left, scribe credit (#4) flush right.
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\begin{document}
\lecture{1 --- September 3, 2013}{Fall 2013}{Prof.\ Jelani Nelson}{Andrew Wang and Andrew Liu}
\section{Course Logistics}
\begin{itemize}
\item{The problem sets can be found on the course website:\\ \url{http://people.seas.harvard.edu/~minilek/cs229r/index.html}}
\item{Guest lecturer on the 19th of September}
\end{itemize}
\subsection{Collaboration Policy}
\begin{itemize}
\item{You can work with others but please cite others when you do and write your own problem set solutions.}
\end{itemize}
\subsection{Prerequisites}
\begin{itemize}
\item{Algorithms (i.e. CS124)}
\item{Discrete Math}
\item{Discrete Probability (e.g. Linear Expectation)}
\item{Linear Algebra}
\end{itemize}
\subsection{Grading}
\begin{itemize}
\item{40\% of your grade is problem sets.}
\item{10\% is from scribing a lecture.}
\item{40\% is from the final project paper.}
\item{10\% is from the final project presentation.}
\end{itemize}
\subsection{Problem Set Information}
\begin{itemize}
\item{Problem sets are assigned every week (or week and a half).}
\item{All problem sets must be in \LaTeX\ and emailed.}
\end{itemize}
\subsection{Final Project Information}
\begin{itemize}
\item{You may work with one partner at most.}
\item{For the final project, you should first try to make a new theoretical research contribution. The contribution does not have to involve solving an extremely hard problem, since there are so many things out there to solve. This final project can be done in pairs.}
\item{If new research doesn't work out, you can write a survey covering many related areas in the field. Try many different things before falling back on a survey.}
\item{Or do some work on a synthetic data set and analyze that.}
\end{itemize}
\subsection{Scribe Notes Information}
\begin{itemize}
\item{Scribe notes have to be emailed to Prof.~Nelson by 9pm on the night after the lecture.}
\end{itemize}
\subsection{TF Information}
\begin{itemize}
\item{There is currently no TF.}
\item{The TF (whoever that may be) will grade psets and final projects.}
\end{itemize}
\section{Course Content/Topics for the semester}
\begin{enumerate}
\item{Sketching/Streaming}
\begin{itemize}
\item{A \textit{sketch of the data set} is a compression of the data set. Example: An algorithm can approximate document similarity between two documents by applying the similarity function to their sketches.}
\item{\textit{Streaming} is creating a sketch for online data that is continually updated. Example: Consider a router with packets flying at you. You may want to keep an updated sketch.}
\end{itemize}
\item{Dimensionality Reduction}
\begin{itemize}
\item{Example: You may want to run an algorithm on a data set but it scales poorly with the dimension of data, so you need to find a structure-preserving lower-dimensional representation.}
\end{itemize}
\item{Numerical Linear Algebra}
\begin{itemize}
\item{Motivation: you may want to solve some linear algebra problems algorithmically.}
\item{Example: matrix completion for the Netflix Prize --- you have a products-by-customers matrix of customer ratings of certain products. The matrix is sparse (i.e.\ mostly empty) because not every user is going to rate everything. Based on limited information, you want to guess the rest of the matrix to do product suggestions, and you can do so by making assumptions on matrix structure.}
\end{itemize}
\item{Compressed Sensing}
\begin{itemize}
\item{Motivation: You want to recover a high-dimensional signal with structure (sparse or approximately sparse) from a small number of measurements.}
\item{Example: Consider images that are pixelated (m by n) and every entry has a value corresponding to pixel color. These are usually not sparse, but if you think about these images in another representation, they could be sparse. How can you acquire such signals very quickly and recover them from that compressed acquisition?}
\end{itemize}
\item{External Memory Model}
\begin{itemize}
\item{Motivation: In CS124, we measure running time by simple steps (like arithmetic operations) to predict performance of an algorithm.}
\item{But sometimes this isn't accurate, because accesses to memory have significantly different time costs (6 orders of magnitude).}
\item{We want to use a model that takes this into account.}
\item{The \textit{external memory model} assumes bounded memory has size M and an infinite disk, where touching data in memory is free and data on disk costs 1.}
\item{The cost is primarily the seeking time, not the reading time, because surrounding blocks are easy to read but seeking is expensive.}
\end{itemize}
\item{Mapreduce/Hadoop}
\begin{itemize}
\item{MapReduce and Hadoop are technologies dealing with parallel computing on massive datasets that go beyond single machine capabilities.}
\end{itemize}
\end{enumerate}
\section{Probability Review}
Let $X_1, \ldots ,X_n$ be discrete r.v.s on $S \subseteq \mathbb{R}$.
\begin{definition}
(Expectation). $\E X = \sum\limits_{j \in S} \Pr(X=j) \cdot j$.
\end{definition}
\begin{definition}
(Variance). $Var(X) = \E (X - \E X)^2 = \E X^2 - (\E X)^2$.
\end{definition}
\begin{lemma}
(Markov's inequality). If $X$ is a non-negative r.v., then $\Pr(X > \lambda) < \frac{\E X}{\lambda}$, for any $\lambda > 0$.
\end{lemma}
\begin{proof} If not (if $\exists$ bad $\lambda$), then $\E X$ would be too big (check this for yourself).\end{proof}
\begin{lemma}
(Chebyshev's inequality). $\forall \lambda > 0$, $\Pr(|X- \E X| > \lambda) < \frac{Var(X)}{\lambda^2}$.
\end{lemma}
\begin{proof} $|X - \E X| > \lambda \Leftrightarrow (X - \E X)^2 > \lambda^2.$ By Markov, $$\Pr(|X - \E X| > \lambda) = \Pr((X - \E X)^2 > \lambda^2) < \frac{\E(X- \E X)^2}{\lambda^2}$$\end{proof}
\begin{lemma}
(Chernoff bound). Suppose we have $n$ independent Bernoulli r.v.s $X_1, ..., X_n$ with $X_i \sim$ Bernoulli$(p_i)$. Also let $X := \sum_i X_i$ and $\mu = \E X$. Then for constants $k, c > 0$, $$\Pr(|X- \E X| > \lambda \mu) \le ke^{-c \lambda^2 \mu}$$
\end{lemma}
\begin{proof}
By the union bound, $$\Pr(|X-\E X| > \lambda \mu) \le \Pr(X > (\lambda + 1) \mu) + \Pr(X < (1-\lambda )\mu)$$
Let's first bound $\Pr(X > (\lambda + 1) \mu)$, denoted by \textbf{(*)}. Because $X > (1 + \lambda)\mu \Leftrightarrow e^{tX} > e^{t(1+\lambda)\mu}$ for positive $t$, we apply Markov's inequality to $e^{tX}$ (a positive r.v.) to get
$$\textbf{(*)} = \Pr(e^{tX} > e^{t(1+\lambda)\mu}) < \frac{\E e^{tX}}{e^{t(1+\lambda)\mu}} \forall t > 0$$
Bounding the numerator, we have $$\E e^{tX} = \E \prod_i e^{tX_i} = \prod_i \E e^{tX_i}$$
by independence of the $X_i$. Because the $X_i$ are Bernoulli, this becomes $$\E e^{tX} = \prod_i (p_i e^t + (1-p_i)) = \prod_i (p_i(e^t-1) + 1) \le \prod_i e^{p_i(e^t - 1)} = e^{(e^t-1)\mu}$$
with the last inequality given by $1 + x \le e^x$ by Taylor's Theorem. Setting $t = \ln(1+\lambda)$, we get $$\frac{\E e^{tX}}{e^{t(1+\lambda)\mu}} \le \frac{e^{\lambda\mu}}{e^{\ln(1+\lambda)(1+\lambda)\mu}} = \left(\frac{e^\lambda}{e^{\ln(1+\lambda)(1+\lambda)}}\right)^\mu \le \left(\frac{e^\lambda}{e^{(1+\lambda)(\lambda - \frac{\lambda^2}{2} + O(\lambda^3))}}\right)^\mu = e^{(\lambda - \lambda - \frac{\lambda^2}{2})\mu} = e^{-c\lambda^2\mu}$$
We can do a similar proof to bound the other half of the union bound, $\Pr(X < (1-\lambda)\mu)$. This time, we have $\Pr(X < (1-\lambda )\mu) = \Pr(e^{-tX} > e^{-t(1-\lambda)\mu})$ for any positive $t$. So we can apply Markov's inequality: $$\Pr(e^{-tX} > e^{-t(1-\lambda)\mu}) < \frac{\E e^{-tX}}{e^{-t(1-\lambda)\mu}} \le \frac{e^{(e^{-t}-1)\mu}}{e^{-t(1-\lambda)\mu}}$$ by very similar steps as before (we're essentially substituting $-t$ for $t$). Choosing $t=-\ln(1-\lambda)$ gives $$\Pr(X < (1-\lambda )\mu) < \left(\frac{e^{-\lambda}}{e^{\ln(1-\lambda)(1-\lambda)}}\right)^\mu \le \left(\frac{e^{-\lambda}}{e^{(1-\lambda)(-\lambda - \frac{\lambda^2}{2} - O(\lambda^3))}}\right)^\mu = e^{(-\lambda + \lambda - \frac{\lambda^2}{2})\mu} = e^{-c\lambda^2\mu}$$ This gives the desired bound.
\end{proof}
\section{Algorithms for Big Data Example 1}
Motivating question: How do you maintain an approximate counter for the number of elements $n$ seen in a data stream that can be stored in fewer than $\log n$ bits? ($\log n$ bits can be done by just incrementing with every new stream object.)
\subsection{Preliminary Solution: Morris Algorithm}
\begin{itemize}
\item{Maintains a counter using $\log \log n$ bits.}
\item{Algorithm Steps: Have counter $X$.}
\begin{enumerate}
\item{Initialize $X$ to 0.}
\item{If asked to increment, then do so with probability $\frac{1}{2^{X}}$. Else, do nothing.}
\item{When done, output $2^X -1$.}
\end{enumerate}
\end{itemize}
\subsection{Proof/Analysis}
\begin{itemize}
\item{Compute expectation to show that Morris provides an unbiased estimate. Then check our estimator's variance.}
\end{itemize}
\begin{claim}
$\E 2^X = n+1$
\end{claim}
\begin{proof}
Let the counter's state after seeing $n$ items be $X_n$. Then $\E 2^{X_n} = \sum\limits_{j=0}^\infty \Pr(X_{n-1} = j)\E (2^{X_n}|X_{n-1} = j) = \sum\limits_{j=0}^\infty \Pr(X_{n-1} = j)(\frac{1}{2^j}2^{j+1} + (1-\frac{1}{2^j})2^j) = \sum\limits_{j=0}^\infty \Pr(X_{n-1} = j)(2^j +1) = 1 + \E 2^{X_{n-1}}$. The claim then follows by induction.
\end{proof}
\begin{lemma}
$\E 2^{2X} =\frac{3}{2}n^2 + \frac{3}{2}n +1$
\end{lemma}
\begin{proof}
The proof is by induction. For the inductive step,
\begin{align*}
\E 2^{2X_n} &= \sum_{j=0}^{\infty} \Pr(2^{X_{n-1}} = j) \cdot \E(2^{2X_n} | 2^{X_{n-1}} = j)\\
{}& = \sum_{j=0}^{\infty} \Pr(2^{X_{n-1}} = j)\cdot \left[\frac 1j \cdot 4j^2 + \left(1 - \frac 1j\right)\cdot j^2\right]\\
{}& = \sum_{j=0}^{\infty} \Pr(2^{X_{n-1}} = j)\cdot (j^2 + 3j)\\
{}& = \E 2^{2X_{n-1}} + 3\cdot \E 2^{X_{n-1}} \\
{}& = 3(n-1)^2/2 + 3(n-1)/2 + 1 + 3n .
\end{align*}
The lemma now follows by rearranging terms.
\end{proof}
\subsection{Revised Morris's Algorithm}
We've shown our estimator is unbiased and the above lemma shows that its variance is $O(n^2)$. We can lower the variance by having $t$ counters run in parallel and averaging them.
\begin{itemize}
\item{We have $t$ counters, $X_1, \ldots, X_t$, and we will output $\frac{1}{t}\sum\limits_{j=1}^t(2^{X_j}-1)$}
\item{Then the new variance satisfies $Var\left(\frac{1}{t}\sum\limits_{j=1}^t(2^{X_j}-1)\right) \leq O(\frac{n^2}{t})$ due to independence of the parallel counters. The new estimator is still unbiased.}
\end{itemize}
\begin{claim}If $t \geq \frac{c}{\epsilon^2}$ then $\Pr(|\hat{n} - n| > \epsilon n) < \frac{1}{3}$ (where $\hat{n}$ is the average of the $t$ trials).
\end{claim}
\begin{proof} $\Pr(|\hat{n} - n| > \epsilon n) < O(\frac{n^2}{t})\frac{1}{\epsilon^2 n^2}$ by Chebyshev, and we can set $t = O(\frac{1}{\epsilon^2})$ for the constant in the big-Oh ``big enough'' to make the final expression less than or equal to $\frac{1}{3}$.\end{proof}
\subsection{Final Morris(ish) Algorithm}
\begin{itemize}
\item{\begin{enumerate}
\item{Initialize $X_1,\ldots,X_t$, where $t=O(\frac{1}{\epsilon^2})$, each to 0.}
\item{Upon incrementing, run each step of $X_j$ independently.}
\item{Output $\frac{1}{t}\sum\limits_j (2^{X_j}-1)$.}
\end{enumerate}}
\item{Do these three steps $m=O(\log \frac{1}{\delta})$ times independently in parallel and output the median result. Let this median result be $n_{tw}$.}
\end{itemize}
\subsection{Analysis}
\begin{claim}
$\Pr(| n_{tw} - n| > \epsilon n) < \delta$
\end{claim}
\begin{proof}
Let $Y_i$ be an indicator r.v. for the event $| \hat{n}_i - n| \leq \epsilon n$, where $\hat{n}_i$ is the estimate from the $i$th trial. Let $Y = \sum_i Y_i$. $\Pr(|n_{tw} - n| > \epsilon n) \leq \Pr(Y \leq \frac{m}{2}) \leq \Pr(|Y-\E Y| > 2\frac{m}{3} - \frac{m}{2}) = \Pr(|Y - \E Y| \geq \frac{m}{6}) \leq \Pr(|Y-\E Y| \geq \frac{\mu}{4}) < e^{-c(\frac{1}{4})^2\frac{2m}{3}} < e^{-c\log\frac{1}{\delta}} < \delta$, with the second-to-last inequality given by the stipulated $m=O(\log \frac{1}{\delta})$, and the last few inequalities holding up to a constant.
\end{proof}
\bibliographystyle{alpha}
\begin{thebibliography}{42}
\bibitem{Morris78}
Robert Morris.
\newblock Counting Large Numbers of Events in Small Registers.
\newblock {\em Commun. ACM}, 21(10): 840-842, 1978.
\end{thebibliography}
\end{document}