\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 229r: Algorithms for Big Data } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\begin{document}
\lecture{2 --- Sept. 5, 2013}{Fall 2013}{Prof.\ Jelani Nelson}{William Chen and Sebastian Chiu}
\section{Overview}
In this lecture we will talk about some problems in Streaming, including \ldots
\begin{enumerate}
\item \ldots the Distinct Elements Problem (aka the $F_0$ Problem), whereby you have a stream of $m$ integers $i_1, i_2,...,i_m$, and you want to output the number of distinct elements in the stream. We will have an approximate algorithm that can probabilistically estimate the number of distinct elements. A simple motivating example is identifying the number of distinct IP addresses that you encounter.
\item \ldots a ``real'' Distinct Elements Algorithm. In the first problem, we will assume pure randomness. Here, we will show an impossibility result for streaming algorithms. To solve the $F_0$ Problem, we require both an approximation and randomization.
\item \ldots Turnstile Streams, $F_2$-estimation
\end{enumerate}
\section{The Distinct Elements Problem}
There are a number of simple ways we can count the number of distinct elements in a stream. First, we can maintain a bit vector of length $n$, and whenever we see the integer $i$, we set the $i$th bit to 1. Alternatively, we can record all of the integers in the stream as we see them. The simple approach thus takes $\min\{n, O(m\log n)\}$ bits: $n$ refers to the approach using a bit vector, and $O(m\log n)$ refers to the approach whereby we remember the whole stream of integers (each integer in $[n]$ takes $O(\log n)$ bits to store).
\subsection{Idealized Streaming Algorithm (ISA)}
\begin{enumerate}
\item Pick random hash function $h:[n]\rightarrow [0,1]$
\item Calculate $z = \displaystyle\min_{i \in \mbox{stream}} h(i)$
\item Output $\frac{1}{z}-1$
\end{enumerate}
This is idealized because we don't have perfect precision computers, so we cannot actually work with arbitrary real numbers between 0 and 1. Moreover, we have to make sure we use the same $h$ throughout the stream, so we have to remember the randomness associated with each $i$. That's already at least $n$ bits, and this does not perform better than our simple approach.\\
Why does this work? We will show that it is an unbiased estimator and has bounded variance. Then we can just run this multiple times independently, take the average of the many estimators we get, and hence reduce the variance of our overall estimate.\\
Let's say that $\mathcal{S}$ is the set of $i \in \mbox{stream}$:
\[\mathcal{S} = \{j_1,...,j_t\}\]
Let
\begin{enumerate}
\item $h(j_1), ...,h(j_t) = X_1,...,X_t$ be independent $\mbox{Unif}[0,1]$
\item $z = \min\{X_i\}_{i=1}^t$
\end{enumerate}
\begin{claim}
$\E Z = \frac{1}{t+1}$
\begin{proof}
\begin{align*}
\E Z &=\int_0^\infty \Pr(Z>\lambda)d\lambda\\ %if z is nonneg\\
&=\int_0^1\Pr(\forall i X_i > \lambda) d\lambda\\
&=\int_0^1 \prod_{i=1}^t \Pr(X_i > \lambda) d \lambda\\
&= \int_0^1 (1-\lambda)^t d\lambda \\ %u sub u = 1-\lambda
&=\int_0^1 u^tdu\\
&= \bigg[\frac{u^{t+1}}{t+1}\bigg]_0^1 = \frac{1}{t+1}
\end{align*}
\end{proof}
\end{claim}
\begin{claim}
$\E Z^2 = \frac{2}{(t+1)(t+2)}$
\begin{proof}
\begin{align*}
\E Z^2 &= \int_0^1 \Pr(Z^2 > \lambda) d\lambda\\
&=\int_0^1\Pr(Z > \sqrt{\lambda}) d\lambda\\
&=\int_0^1(1-\sqrt{\lambda})^td\lambda \\ %u sub let u = 1-\sqrt{\lambda}
&=2 \int_1^0u^t(u-1)du\\
&=2\int_0^1u^t(1-u)du\\
&=2\left(\frac{1}{t+1} - \frac{1}{t+2}\right) = \frac{2}{(t+1)(t+2)}
\end{align*}
\end{proof}
\end{claim}
%
%Average Algorithm
%
%1) Run $q = O(\frac{1}{\epsilon^2})$ ISA's in parallel
%2) $\bar{z} = \frac{1}{q} \sum_{i=1}^q z_i$
%3) Output $\frac{1}{\bar{z}}-1$
%
%Claim is that $P(|(\frac{1}{z}-1)-t|>\epsilon t) < \frac{1}{3}$
%
%Proof: Will show $P(|\bar{z} - \frac{1}{t+1}| > \frac{\epsilon}{t+1}) < \frac{1}{3}$
%
%We know that, by Chebyshev's Inequality
%\[P(|\bar{z} - \frac{1}{t+1}| > \frac{\epsilon}{t+1}) < \mbox{Var}(\bar{z}\frac{(t+1)^2}{\epsilon^2} = O\left(\frac{1}{t^2q}\frac{(t+1)^2}{\epsilon^2}\right) < \frac{1}{3}\]
%% q is equalty to constant over epislon suqared
\textbf{Averaging Algorithm}
\begin{enumerate}
\item Run $q = O(\frac{1}{\epsilon^2})$ ISA's in parallel
\item $\bar{z} = \frac{1}{q} \sum_{i=1}^q z_i$
\item output $\frac{1}{\bar{z}} - 1$
\end{enumerate}
\begin{claim}
\[\Pr\left(\left|\left(\frac{1}{\bar{z}} - 1\right) - t\right| > \epsilon t\right) < \frac{1}{3}\]
\end{claim}
\begin{proof}
It suffices to show that $\Pr\left(|\bar{z} - \frac{1}{t+1}| > \frac{\epsilon}{t+1}\right) < \frac{1}{3}$, since $|\bar{z} - \frac{1}{t+1}| \le \frac{\epsilon}{t+1}$ implies that $\frac{1}{\bar{z}} - 1$ is within $O(\epsilon) \cdot t$ of $t$. By Chebyshev's Inequality,
\begin{align*}
\Pr\left(\left|\bar{z} - \frac{1}{t+1}\right| > \frac{ \epsilon }{t + 1}\right) &< \mathrm{Var}(\bar{z}) \cdot \frac{(t+1)^2}{\epsilon^2} \\
&= O\left(\frac{1}{t^2q}\right) \cdot \frac{(t+1)^2}{\epsilon^2} < \frac{1}{3}
\end{align*}
\end{proof}
In practice, to remove ourselves from the idealized case, we will still use hash functions, but we need one that doesn't take $n$ bits or more to represent (ie. doesn't use floats). So we will use $k$-wise independent hash functions. What are $k$-wise independent hash functions?\\ \\ Let's say that $\mathcal{H}$ is a set of functions that map $[a] \rightarrow [b]$.
\begin{definition}
$\mathcal{H}$ is a $k$-wise independent hash family if
\[\forall i_1 \ne i_2\ne...\ne i_k \in [a] \ \mbox{and} \ \forall j_1,...,j_k \in [b] ,\]
\[\Pr_{h\sim \mathcal{H}} (h(i_1) = j_1 \wedge ... \wedge h(i_k) =j_k) = \frac{1}{b^k}\]
\end{definition}
An example is as follows. The set of all functions from $[a]$ to $[b]$ is $k$-wise independent $\forall k$. The size of $\mathcal{H}$ is $b^a$ ($h \leftarrow \mathcal{H}$ can be specified using $\log |\mathcal{H}|$ bits). The goal now is to come up with small $\mathcal{H}$.
Fulfilled goal: $a = b = q = \ \mbox{prime power}$. $\mathcal{H}$ will be the set of all degree $k-1$ polynomials in $\mathbb{F}_q[x]$
We will show that this is a $k$-wise independent family.
\begin{claim}
$\mathcal{H}_{\mbox{poly}-k}$ is a $k$-wise independent family.
\end{claim}
\begin{proof}
Lagrange interpolation. If we know $i_1,...,i_k$ and $j_1,...,j_k$ and that no $i$'s repeat, it follows
\[p(x) = \sum_{r=1}^k \left(\frac{\prod_{y=1; y\ne r}^k (x-i_y)}{\prod_{y=1;y\ne r}^k (i_r - i_y)}\right) \cdot j_r\]
satisfies
\[\forall r\ p(i_r) = j_r\]
and this polynomial is unique since $\mathbb{F}_q$ is a field. It follows that
$|\mathcal{H}_{\mbox{poly}-k}| = q^k \Rightarrow h \in \mathcal{H}_{\mbox{poly}-k}$ representable using $k\log q$ bits
\end{proof}
This raises a new algorithm we can consider.
\begin{enumerate}
\item Assume we know $t$ (the answer) up to some constant factor $c$
\item Pick $h:[n] \rightarrow [n]$ from a $2$-wise family
\item We will imagine there are $\log_2 n$ different streams and we put $i$ in stream $\mbox{lsb}(h(i))$ (where lsb stands for least significant bit)
\item When $i$ comes in the stream, write down $i$ in the stream it hashed to (but stop keeping track of a stream if more than, say, $\frac{1000}{\epsilon^2}$ different indices hashed into it)
\end{enumerate}
\textbf{Output} - Look at stream $j$ s.t. $\frac{1}{c} \leq \frac{t}{2^{j+1}} \leq c$ and output its size times $2^{j+1}$
\subsection{Necessity of approximation and randomization}
\begin{claim}
Deterministic exact algorithm is impossible using $o(n)$ bits.
\begin{proof}Suppose that the space of our algorithm $A$ is $s$ bits. We will show that this implies the existence of an injection $f:\{0,1\}^n \rightarrow \{0,1\}^s$, which forces $s \ge n$.
Let's define the injection: given
\[x \in \{0,1\}^n,\]
\begin{enumerate}
\item Define a stream containing the $i$ for which $x_i = 1$
\item Run $A$ on this stream
\item $f(x) = $memory content of $A$ at the end of the stream
\end{enumerate}
We can recover $x$ bit by bit, given only $f(x)$. To recover the bit $x_i$, we run $A$ starting with memory contents $f(x)$, append $i$ to the stream, and check whether $F_0$ increased. If $F_0$ did not increase, then $i$ must already have appeared in the stream, so $x_i = 1$; otherwise $x_i = 0$. Since knowing the compression $f(x)$ alone suffices to determine $x$, the map $f$ has to be an injection, and therefore $s \ge n$.\end{proof}
\end{claim}
\begin{claim}
A deterministic approximation algorithm for $F_0$ providing a $(1\pm 1/1000)$-approximation using $o(n)$ bits is impossible.
\begin{proof}
It is a standard fact that there exist $N = 2^{cn}$ bitvectors (for some constant $c \in (0,1)$) such that for all $i \ne j$, $x_i$ and $x_j$ differ on at least $\frac{n}{3}$ bits. We then define
\[T = \{x_1,...,x_N\}\]
The compression can then be defined as
\[f: T \rightarrow \{0,1\}^S\]
using the same $f$ as above in the case of a deterministic exact algorithm.
To show $f$ is an injection on $T$, we will show a recovery algorithm that, given $f(x)$ for $x\in T$, can determine $x$. We want to know which $x \in T$ was compressed. So we try all $y \in T$ and see if it was $y$, by appending all $i$ for which $y_i = 1$ into the stream then asking for the new estimate of the number of distinct elements. Doing so yields two possible cases. The first case is that $y$ was the thing that was compressed, and so the number of distinct elements does not increase. The second case is that it wasn't the thing that was compressed, which implies that the number of distinct elements increased by at least $n/3$, so a $(1\pm 1/1000)$-approximation algorithm will notice this gap. Thus $f$ is an injection implying that the space $s$ must satisfy
\[s = \Omega(\log|T|) \ge cn\]
\end{proof}
\end{claim}
\section{AMS sketch ($F_2$ estimation)}
We will study the AMS sketch for second moment estimation \cite{AMS99}. In the problem of {\em second moment estimation}, we imagine we have a vector $x\in\R^n$ that starts as the $0$ vector. Every update in the stream is of the form ``add $v$ to $x_i$'' for some $(i,v)$. This model for updating vectors in a data stream, where arbitrary amounts can be added to arbitrary coordinates of a high-dimensional vector, is known as the {\em turnstile model} of streaming. At the end of the stream, we would like to output an estimate of
$$F_2 \eqdef \sum_{i=1}^n x_i^2 = \|x\|_2^2 .$$
In general we have the definition of the $p$th frequency moment:
$$ F_p \eqdef \sum_{i=1}^n |x_i|^p = \|x\|_p^p ,$$
and this is why the distinct elements problem is usually referred to as $F_0$-estimation (treating $0^0$ as $0$). This is because in the distinct elements problem where the stream indices are in $[n]\eqdef\{1,\ldots,n\}$, we treat $i$ being in the stream as an update $(i,1)$ to an $n$-dimensional vector $x$ (so that $x_i$ keeps track of how many times $i$ appeared in the stream). Thus $F_0$ counts the number of non-zero entries in $x$, i.e.\ the number of distinct integers in the stream.
To solve $F_2$ estimation we will use the strategy of designing a {\em linear sketch}, i.e.\ a matrix $\Pi\in\R^{t\times n}$. Our streaming algorithm will simply maintain $\Pi x$ in its memory, and then our output to estimate $F_2$ will be some function of $\Pi x$. Note that $\Pi x$ can easily be maintained in turnstile streams, since $x\rightarrow x + v\cdot e_i$ (where $e_i$ corresponds to the $i$th standard basis vector) causes $\Pi x$ to change in the following way: $v$ times the $i$th column of $\Pi$ should be added to $\Pi x$. Of course $\Pi$ is a huge matrix, it is $t\times n$, so to obtain a small-space algorithm we should not maintain $\Pi$ explicitly in memory, but rather have it implicitly defined (e.g.\ as we shall see in this example, by using $k$-wise independent hash functions).
\subsection{AMS sketch presentation and analysis}
Recall the usual approach for designing a streaming algorithm:
\begin{enumerate}
\item Find a way to maintain a random variable $X$ consuming little memory which gives an unbiased estimator of the thing you want to compute, and also bound its variance.
\item Keep many independent random variables $X_1,\ldots,X_{q_1}$ in memory, each distributed as $X$, and compute the average $\hat{X} = (\sum_{i=1}^{q_1} X_i)/q_1$.
\item Keep many independent random variables $\hat{X}_1,\ldots,\hat{X}_{q_2}$ in memory, each distributed as in the last step, and at the end output the median $\overline{X}$ of the $\hat{X}$'s.
\end{enumerate}
In the second step, if the variance is on the order of the square of the expectation, it suffices to set $q_1 = O(1/\eps^2)$ for $\hat{X}$ to be a $(1+\eps)$-approximation with $2/3$ probability. It then suffices to set $q_2 = O(\log(1/\delta))$ for $\overline{X}$ to be a $(1+\eps)$-approximation with $1-\delta$ probability.
So, it all boils down to the first step: designing an unbiased estimator. The AMS sketch of \cite{AMS99} does exactly this for $F_2$ estimation. Here is the AMS sketch:
\begin{enumerate}
\item Pick a random hash function $\sigma:[n]\rightarrow\{-1,1\}$ from a $4$-wise independent family. Abusing notation, let $\sigma$ be the vector with $\sigma_i = \sigma(i)$.
\item Maintain $X = \inprod{\sigma,x}$ in memory (so when $v$ is added to $x_i$, add $v\cdot \sigma_i$ to $X$).
\item \textbf{Output:} $X^2$
\end{enumerate}
It should be noted that in class we talked about how to design a $k$-wise independent hash family $\mathcal{H}_{poly-k}$ of functions mapping $[n]$ into $[n]$ of size $n^k$ for $n$ a power of $2$ (since $\mathbb{F}_n$ is a finite field for $n$ a power of $2$). Such families also imply families mapping from $[n]$ into $\{-1,1\}$. We make a set $\mathcal{H}_{poly-k}'$ of the same size such that for any $h\in\mathcal{H}_{poly-k}$, we define an $h'\in\mathcal{H}_{poly-k}'$ defined as follows: for any $i$, if $h(i)\in[n]$ ends in a $0$ when written in binary, then we set $h'(i) = 1$; else we set $h'(i)$ to $-1$.
\begin{lemma}
$\E X^2 = F_2 \eqdef \|x\|_2^2$.
\end{lemma}
\begin{proof}
\begin{align}
\nonumber \E X^2 &= \E \inprod{\sigma,x}^2 \\
\nonumber {}& = \E \left(\sum_{i=1}^n \sigma_i^2 x_i^2 + \sum_{i\neq j}\sigma_i \sigma_j x_i x_j\right)\\
\nonumber {}& = \underbrace{\sum_{i=1}^n x_i^2}_{F_2} + \sum_{i\neq j} (\E \sigma_i \sigma_j) x_i x_j\\
{}& = F_2 + \sum_{i\neq j} \underbrace{(\E \sigma_i)}_{=0}\underbrace{(\E \sigma_j)}_{=0} x_i x_j \label{eqn:pairwise}\\
\nonumber {}& = F_2
\end{align}
where Equation~\ref{eqn:pairwise} used that $\sigma$ is $2$-wise independent (since $4$-wise independence implies $2$-wise independence) to give that the expectation of the product is the product of expectations.
\end{proof}
\begin{lemma}
For the variance, $\E (X^2 - \E X^2)^2 \le 2F_2^2$.
\end{lemma}
\begin{proof}
\begin{align}
\nonumber \E (X^2& - \E X^2)^2 = \E \left(\sum_{i\neq j} \sigma_i \sigma_j x_i x_j\right)^2\\
\nonumber {}&= \E\left(2\sum_{i\neq j}\sigma_i^2 \sigma_j^2 x_i^2 x_j^2 + 4\sum_{i\neq j\neq k}\sigma_i^2\sigma_j\sigma_k x_i^2x_jx_k + 24\sum_{i