% chapters/chap1.tex
\documentclass[../main.tex]{subfiles}

% If this chapter is compiled *by itself*, we must load only its own .bib
% and print its bibliography at the end of the chapter.
\ifSubfilesClassLoaded{
  \addbibresource{../main.bib}
}

\usepackage{amsmath, amsfonts, amsthm}
\usepackage{fancyhdr,parskip}
\usepackage{fullpage}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% add special notation supports
\usepackage[mathscr]{euscript}
\usepackage{mathtools}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% add image package and directory
\usepackage{graphicx}
\usepackage{tikz}
\graphicspath{{../images/}}


\begin{document}


\chapter{Concentration of Measure And Quantum Entanglement}

As the future version of me might forgot everything we have over the summer, as I did for now, I will make a review again from the simple definition to recall the necessary information to tell you why we are here and how we are going to proceed.

First, we will build the mathematical model describing the behavior of quantum system and why they makes sense for physicists and meaningful for general publics.

\section{Motivation}

First, we introduce a motivation for introducing non-commutative probability theory to the study of quantum mechanics. This section is mainly based on the book~\cite{kummer1998elements}.

\subsection{Light polarization and the violation of Bell's inequality}

The light which comes through a polarizer is polarized in a certain direction. If we fix the first filter and rotate the second filter, we will observe the intensity of the light will change.

The light intensity decreases with $\alpha$ (the angle between the two filters). The light should vanish when $\alpha=\pi/2$.

However, for a system of 3 polarizing filters $F_1,F_2,F_3$, having directions $\alpha_1,\alpha_2,\alpha_3$, if we put them on the optical bench in pairs, then we will have three random variables $P_1,P_2,P_3$.

\begin{figure}[H]
	\centering
	\includegraphics[width=0.7\textwidth]{Filter_figure.png}
	\caption{The light polarization experiment, image from \cite{kummer1998elements}}
	\label{fig:Filter_figure}
\end{figure}

\begin{theorem}
	\label{theorem:Bell's_3_variable_inequality}
	Bell's 3 variable inequality:

	For any three random variables $P_1,P_2,P_3$ in a classical probability space, we have

	$$
	\operatorname{Prob}(P_1=1,P_3=0)\leq \operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
	$$
\end{theorem}

\begin{proof}
	By the law of total probability there are only two possibility if we don't observe any light passing the filter pair $F_i,F_j$, it means the photon is either blocked by $F_i$ or $F_j$, it means

    $$
    \begin{aligned}
    \operatorname{Prob}(P_1=1,P_3=0)&=\operatorname{Prob}(P_1=1,P_2=0,P_3=0)\\
    &+\operatorname{Prob}(P_1=1,P_2=1,P_3=0)\\
    &\leq\operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
    \end{aligned}
    $$
\end{proof}

However, according to our experimental measurement, for any pair of polarizers $F_i,F_j$, by the complement rule, we have
$$
\begin{aligned}
\operatorname{Prob}(P_i=1,P_j=0)&=\operatorname{Prob}(P_i=1)-\operatorname{Prob}(P_i=1,P_j=1)\\
&=\frac{1}{2}-\frac{1}{2}\cos^2(\alpha_i-\alpha_j)\\
&=\frac{1}{2}\sin^2(\alpha_i-\alpha_j)
\end{aligned}
$$

This leads to a contradiction if we apply the inequality to the experimental data.

$$
\frac{1}{2}\sin^2(\alpha_1-\alpha_3)\leq\frac{1}{2}\sin^2(\alpha_1-\alpha_2)+\frac{1}{2}\sin^2(\alpha_2-\alpha_3)
$$

If $\alpha_1=0,\alpha_2=\frac{\pi}{6},\alpha_3=\frac{\pi}{3}$, then

$$
\begin{aligned}
\frac{1}{2}\sin^2(-\frac{\pi}{3})&\leq\frac{1}{2}\sin^2(-\frac{\pi}{6})+\frac{1}{2}\sin^2(\frac{\pi}{6}-\frac{\pi}{3})\\
\frac{3}{8}&\leq\frac{1}{8}+\frac{1}{8}\\
\frac{3}{8}&\leq\frac{1}{4}
\end{aligned}
$$

Other revised experiments (e.g., Aspect's experiment, calcium entangled photon experiment) are also conducted and the inequality is still violated.

\subsection{The true model of light polarization}
    
The full description of the light polarization is given below:

State of polarization of a photon: $\psi=\alpha|0\rangle+\beta|1\rangle$, where $|0\rangle$ and $|1\rangle$ are the two orthogonal polarization states in $\mathbb{C}^2$.

Polarization filter (generalized 0,1 valued random variable): orthogonal projection $P_\alpha$ on $\mathbb{C}^2$ corresponding to the direction $\alpha$ (operator satisfies $P_\alpha^*=P_\alpha=P_\alpha^2$).

The matrix representation of $P_\alpha$ is given by

$$
P_\alpha=\begin{pmatrix}
\cos^2(\alpha) & \cos(\alpha)\sin(\alpha)\\
\cos(\alpha)\sin(\alpha) & \sin^2(\alpha)
\end{pmatrix}
$$

Probability of a photon passing through the filter $P_\alpha$ is given by $\langle P_\alpha\psi,\psi\rangle$; this is $\cos^2(\alpha)$ if we set $\psi=|0\rangle$.

Since the probability of a photon passing through the three filters is not commutative, it is impossible to discuss $\operatorname{Prob}(P_1=1,P_3=0)$ in the classical setting.


The main vector space we are interested in is $\mathbb{C}^n$; therefore, all the linear operators we defined are from $\mathbb{C}^n$ to $\mathbb{C}^n$.

We denote a vector in vector space as $\ket{\psi}=(z_1,\ldots,z_n)$ (might also be infinite dimensional, and $z_i\in\mathbb{C}$).

A natural inner product space defined on $\mathbb{C}^n$ is given by the Hermitian inner product:

$$
\langle\psi|\varphi\rangle=\sum_{i=1}^n z_iz_i^*
$$

This satisfies the following properties:

\begin{enumerate}
  \item $\bra{\psi}\sum_i \lambda_i\ket{\varphi}=\sum_i \lambda_i \langle\psi|\varphi\rangle$ (linear on the second argument. Note that in physics \cite{Nielsen_Chuang_2010} we use linear on the second argument and conjugate linear on the first argument. But in math, we use linear on the first argument and conjugate linear on the second argument \cite{Axler_2024}. As promised in the beginning, we will use the physics convention in this report.)
  \item $\langle\varphi|\psi\rangle=(\langle\psi|\varphi\rangle)^*$
  \item $\langle\psi|\psi\rangle\geq 0$ with equality if and only if $\ket{\psi}=0$
\end{enumerate}

Here $\psi$ is just a label for the vector, and you don't need to worry about it too much. This is also called the ket, where the counterpart:

\begin{itemize}
\item $\langle\psi\rangle$ is called the bra, used to denote the vector dual to $\psi$; such an element is a linear functional if you really want to know what that is.
\item $\langle\psi|\varphi\rangle$ is the inner product between two vectors, and $\bra{\psi} A\ket{\varphi}$ is the inner product between $A\ket{\varphi}$ and $\bra{\psi}$, or equivalently $A^\dagger \bra{\psi}$ and $\ket{\varphi}$.
\item Given a complex matrix $A=\mathbb{C}^{n\times n}$,
\begin{enumerate}
  \item  $A^*$ is the complex conjugate of $A$.
 i.e., 
    $$
 A=\begin{bmatrix}
        1+i & 2+i & 3+i\\
        4+i & 5+i & 6+i\\
        7+i & 8+i & 9+i\end{bmatrix}, 
 A^*=\begin{bmatrix}
        1-i & 2-i & 3-i\\
        4-i & 5-i & 6-i\\
        7-i & 8-i & 9-i
      \end{bmatrix}
    $$
  \item  $A^\top$ denotes the transpose of $A$.
 i.e., 
     $$
 A=\begin{bmatrix}
        1+i & 2+i & 3+i\\
        4+i & 5+i & 6+i\\
        7+i & 8+i & 9+i
      \end{bmatrix},
 A^\top=\begin{bmatrix}
        1+i & 4+i & 7+i\\
        2+i & 5+i & 8+i\\
        3+i & 6+i & 9+i
      \end{bmatrix}
     $$
  \item $A^\dagger=(A^*)^\top$ denotes the complex conjugate transpose, referred to as the adjoint, or Hermitian conjugate of $A$.
 i.e., 
    $$
 A=\begin{bmatrix}
      1+i & 2+i & 3+i\\
      4+i & 5+i & 6+i\\
      7+i & 8+i & 9+i
    \end{bmatrix},
 A^\dagger=\begin{bmatrix}
      1-i & 4-i & 7-i\\
      2-i & 5-i & 8-i\\
      3-i & 6-i & 9-i
    \end{bmatrix}
    $$
  \item  $A$ is unitary if $A^\dagger A=AA^\dagger=I$.
  \item  $A$ is hermitian (self-adjoint in mathematics literature) if $A^\dagger=A$.
  \end{enumerate}
\end{itemize}

\subsubsection{Motivation of Tensor product}

Recall from the traditional notation of product space of two vector spaces $V$ and $W$, that is, $V\times W$, is the set of all ordered pairs $(\ket{v},\ket{w})$ where $\ket{v}\in V$ and $\ket{w}\in W$.

The space has dimension $\dim V+\dim W$.

We want to define a vector space with the notation of multiplication of two vectors from different vector spaces.

That is

$$
(\ket{v_1}+\ket{v_2})\otimes \ket{w}=(\ket{v_1}\otimes \ket{w})+(\ket{v_2}\otimes \ket{w})
$$
$$
\ket{v}\otimes (\ket{w_1}+\ket{w_2})=(\ket{v}\otimes \ket{w_1})+(\ket{v}\otimes \ket{w_2})
$$

and enables scalar multiplication by

$$
\lambda (\ket{v}\otimes \ket{w})=(\lambda \ket{v})\otimes \ket{w}=\ket{v}\otimes (\lambda \ket{w})
$$

And we wish to build a way to associate the basis of $V$ and $W$ with the basis of $V\otimes W$. That makes the tensor product a vector space with dimension $\dim V\times \dim W$.

\begin{defn}
Definition of linear functional

A linear functional is a linear map from $V$ to $\mathbb{F}$.

\end{defn}

Note the difference between a linear functional and a linear map.

A generalized linear map is a function $f: V\to W$ satisfying the condition.

\begin{itemize}
\item $f(\ket{u}+\ket{v})=f(\ket{u})+f(\ket{v})$
\item $f(\lambda \ket{v})=\lambda f(\ket{v})$
\end{itemize}


\begin{defn}

A bilinear functional is a bilinear function $\beta:V\times W\to \mathbb{F}$ satisfying the condition that $\ket{v}\to \beta(\ket{v},\ket{w})$ is a linear functional for all $\ket{w}\in W$ and $\ket{w}\to \beta(\ket{v},\ket{w})$ is a linear functional for all $\ket{v}\in V$.

\end{defn}

The vector space of all bilinear functionals is denoted by $\mathcal{B}(V, W)$.


\begin{defn}

Let $V, W$ be two vector spaces.

Let $V'$ and $W'$ be the dual spaces of $V$ and $W$, respectively, that is $V'=\{\psi:V\to \mathbb{F}\}$ and $W'=\{\phi:W\to \mathbb{F}\}$, $\psi, \phi$ are linear functionals.

The tensor product of vectors $v\in V$ and $w\in W$ is the bilinear functional defined by $\forall (\psi,\phi)\in V'\times W'$ given by the notation

$$
(v\otimes w)(\psi,\phi)\coloneqq\psi(v)\phi(w)
$$

The tensor product of two vector spaces $V$ and $W$ is the vector space $\mathcal{B}(V',W')$

Notice that the basis of such vector space is the linear combination of the basis of $V'$ and $W'$, that is, if $\{e_i\}$ is the basis of $V'$ and $\{f_j\}$ is the basis of $W'$, then $\{e_i\otimes f_j\}$ is the basis of $\mathcal{B}(V', W')$.

That is, every element of $\mathcal{B}(V', W')$ can be written as a linear combination of the basis.

Since $\{e_i\}$ and $\{f_j\}$ are bases of $V'$ and $W'$, respectively, then we can always find a set of linear functionals $\{\phi_i\}$ and $\{\psi_j\}$ such that $\phi_i(e_j)=\delta_{ij}$ and $\psi_j(f_i)=\delta_{ij}$.

Here $\delta_{ij}=\begin{cases}
1 & \text{if } i=j \\
0 & \text{otherwise}
\end{cases}$ is the Kronecker delta.

\end{defn}

$$
V\otimes W=\left\{\sum_{i=1}^n \sum_{j=1}^m a_{ij} \phi_i(v)\psi_j(w): \phi_i\in V', \psi_j\in W'\right\}
$$

Note that $\sum_{i=1}^n \sum_{j=1}^m a_{ij} \phi_i(v)\psi_j(w)$ is a bilinear functional that maps $V'\times W'$ to $\mathbb{F}$.

This enables basis-free construction of vector spaces with proper multiplication and scalar multiplication.

This vector space is equipped with the unique inner product $\langle v\otimes w, u\otimes x\rangle_{V\otimes W}$ defined by

$$
\langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle_V\langle w,x\rangle_W
$$

In practice, we ignore the subscript of the vector space and just write $\langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle\langle w,x\rangle$.

This introduces a new model in mathematics explaining quantum mechanics: the non-commutative probability theory.

\section{Non-commutative probability theory}

The non-commutative probability theory is a branch of generalized probability theory that studies the probability of events in non-commutative algebras.

There are several main components of the generalized probability theory; let's see how we can formulate them, comparing with the classical probability theory.

First, we define the Hilbert space in case one did not make the step from the linear algebra courses like me.

\begin{defn}
	\label{defn:Hilbert_space}
	Hilbert space:

	A Hilbert space is a complete inner product space. 
\end{defn}

That is, a vector space equipped with an inner product that is complete (every Cauchy sequence converges to a limit).

To introduce an example of Hilbert space we use when studying quantum mechanics, we need to introduce a common inner product used in $\mathbb{C}^n$.

\begin{defn}
	\label{defn:Hermitian_inner_product}
	Hermitian inner product:

	On $\mathbb{C}^n$, the Hermitian inner product is defined by
    $$
    \langle u,v\rangle=\sum_{i=1}^n \overline{u_i}v_i
    $$
\end{defn}

\begin{prop}
	\label{prop:Hermitian_inner_product_with_complex_vectorspace}
    The Hermitian inner product on the complex vector space $\C^n$ makes it a Hilbert space.
\end{prop}

\begin{proof}
    We first verify that the Hermitian inner product 
    $$
    \langle u,v\rangle = \sum_{i=1}^n \overline{u_i} v_i
    $$
    on $\C^n$ satisfies the axioms of an inner product:
    \begin{enumerate}
        \item \textbf{Conjugate symmetry:} For all $u,v\in\C^n$, 
        $$
        \langle u,v\rangle =\sum_{i=1}^n \overline{u_i} v_i=\overline{\sum_{i=1}^n \overline{v_i} u_i}=\overline{\langle v,u\rangle}.
        $$
        \item \textbf{Linearity:} For any $u,v,w\in\C^n$ and scalars $a,b\in\C$, we have 
        $$
        \langle u, av + bw\rangle = \sum_{i=1}^n \overline{u_i} (av_i + bw_i)=a\langle u,v\rangle + b\langle u,w\rangle.
        $$
        \item \textbf{Positive definiteness:} For every $u=(u_1,u_2,\cdots,u_n)\in\C^n$, let $u_j=a_j+b_ji$, where $a_j,b_j\in\mathbb{R}$.
        $$
        \langle u,u\rangle = \sum_{j=1}^n \overline{u_j} u_j=\sum_{i=1}^n (a_i^2+b_i^2)\geq 0,
        $$
        with equality if and only if $u=0$.

        Therefore, the Hermitian inner product is an inner product.
    \end{enumerate}
    
    Next, we show that $\C^n$ is complete with respect to the norm induced by this inner product:
    $$
    \|u\| = \sqrt{\langle u,u\rangle}.
    $$
    Since $\C^n$ is finite-dimensional, every Cauchy sequence (with respect to any norm) converges in $\C^n$. This is a standard result in finite-dimensional normed spaces, which implies that $\C^n$ is indeed complete.
    
    Therefore, since the Hermitian inner product fulfills the inner product axioms and $\C^n$ is complete, the complex vector space $\C^n$ with the Hermitian inner product is a Hilbert space.
\end{proof}

Another classical example of Hilbert space is $L^2(\Omega, \mathscr{F}, P)$, where $(\Omega, \mathscr{F}, P)$ is a measure space ($\Omega$ is a set, $\mathscr{F}$ is a $\sigma$-algebra on $\Omega$, and $P$ is a measure on $\mathscr{F}$). The $L^2$ space is the space of all function on $\Omega$ that is

\begin{enumerate}
    \item \textbf{square integrable}: square integrable functions are the functions $f:\Omega\to \mathbb{C}$ such that
        $$
        \int_\Omega |f(\omega)|^2 dP(\omega)<\infty
        $$
        with inner product defined by
        $$
        \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
        $$

\item \textbf{complex-valued}: functions are complex-valued measurable. $f=u+v i$ is complex-valued if $u$ and $v$ are real-valued measurable.
\end{enumerate}

\begin{prop}
	\label{prop:L2_space_is_a_Hilbert_space}
    $L^2(\Omega, \mathscr{F}, P)$ is a Hilbert space.
\end{prop}

\begin{proof}
    We check the two conditions of the Hilbert space:
    \begin{itemize}
        \item Completeness:
            Let $(f_n)$ be a Cauchy sequence in $L^2(\Omega, \mathscr{F}, P)$. Then for any $\epsilon>0$, there exists an $N$ such that for all $m,n\geq N$, we have
            $$
            \int_\Omega |f_m(\omega)-f_n(\omega)|^2 dP(\omega)<\epsilon^2
            $$
            This means that $(f_n)$ is a Cauchy sequence in the norm of $L^2(\Omega, \mathscr{F}, P)$.
        \item Inner product:
            The inner product is defined by
            $$
            \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
            $$
            This is a well-defined inner product on $L^2(\Omega, \mathscr{F}, P)$. We can check the properties of the inner product:
            \begin{itemize}
                \item Linearity:
                    $$
                    \langle af+bg,h\rangle=a\langle f,h\rangle+b\langle g,h\rangle
                    $$
                \item Conjugate symmetry:
                    $$
                    \langle f,g\rangle=\overline{\langle g,f\rangle}
                    $$
                \item Positive definiteness:
                    $$
                    \langle f,f\rangle\geq 0
                    $$
            \end{itemize}
    \end{itemize}
\end{proof}


Let $\mathscr{H}$ be a Hilbert space. $\mathscr{H}$ consists of complex-valued functions on a finite set $\Omega=\{1,2,\cdots,n\}$, and the functions $(e_1,e_2,\cdots,e_n)$ form an orthonormal basis of $\mathscr{H}$. (We use Dirac notation $|k\rangle$ to denote the basis vector $e_k$~\cite{parthasarathy1992quantum}.)

As an analog to the classical probability space $(\Omega,\mathscr{F},\mu)$, which consists of a sample space $\Omega$ and a probability measure $\mu$ on the state space $\mathscr{F}$, the non-commutative probability space $(\mathscr{H},\mathscr{P},\rho)$ consists of a Hilbert space $\mathscr{H}$ and a state $\rho$ on the space of all orthogonal projections $\mathscr{P}$.

The detailed definition of the non-commutative probability space is given below:

\begin{defn}
	\label{defn:non-commutative_probability_space}
	Non-commutative probability space:

	A non-commutative probability space is a pair $(\mathscr{B}(\mathscr{H}),\mathscr{P})$, where $\mathscr{B}(\mathscr{H})$ is the set of all \textbf{bounded} linear operators on $\mathscr{H}$.
    
    A linear operator on $\mathscr{H}$ is \textbf{bounded} if for all $u$ such that $\|u\|\leq 1$, we have $\|Au\|\leq M$ for some $M>0$.
    
    $\mathscr{P}$ is the set of all orthogonal projections on $\mathscr{B}(\mathscr{H})$.

    The set $\mathscr{P}=\{P\in\mathscr{B}(\mathscr{H}):P^*=P=P^2\}$ is the set of all orthogonal projections on $\mathscr{B}(\mathscr{H})$.
\end{defn}

Recall from classical probability theory, we call the initial probability distribution for possible outcomes in the classical probability theory as our \textit{state}, simillarly, we need to define the \textit{state} in the non-commutative probability theory.

\begin{defn}
	\label{defn:state}
	Non-commutative probability state:

	A state on $(\mathscr{B}(\mathscr{H}),\mathscr{P})$ is a map $\rho:\mathscr{P}\to[0,1]$, (commonly named as density operator) such that:
	\begin{itemize}
		\item $\rho(O)=0$, where $O$ is the zero projection, and $\rho(I)=1$, where $I$ is the identity projection.
		\item If $P_1,P_2,\ldots,P_n$ are pairwise disjoint orthogonal projections, then $\rho(P_1 + P_2 + \cdots + P_n) = \sum_{i=1}^n \rho(P_i)$.
	\end{itemize}
\end{defn}

An example of a density operator can be given as follows:

If $(|\psi_1\rangle,|\psi_2\rangle,\cdots,|\psi_n\rangle)$ is an orthonormal basis of $\mathscr{H}$ consisting of eigenvectors of $\rho$, for the eigenvalues $p_1,p_2,\cdots,p_n$, then $p_j\geq 0$ and $\sum_{j=1}^n p_j=1$.

We can write $\rho$ as
\[
\rho=\sum_{j=1}^n p_j|\psi_j\rangle\langle\psi_j|
\]
(Under basis $|\psi_j\rangle$, it is a diagonal matrix with $p_j$ on the diagonal.)

% Then we need to introduce a theorem that ensures that every state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.

% \begin{theorem}
% 	\label{theorem:Gleason's_theorem}
% 	Gleason's theorem (Theorem 1.1.15 in~\cite{parthasarathy2005mathematical})

%     Let $\mathscr{H}$ be a Hilbert space over $\mathbb{C}$ or $\mathbb{R}$ of dimension $n\geq 3$. Let $\mu$ be a state on the space $\mathscr{P}$ of projections on $\mathscr{H}$. Then there exists a unique density operator $\rho$ such that
%     \[
%     \mu(P)=\operatorname{Tr}(\rho P)
%     \]
%     for all $P\in\mathscr{P}$. $\mathscr{P}$ is the space of all orthogonal projections on $\mathscr{H}$.
% \end{theorem}

% This proof came from~\cite{parthasarathy2005mathematical}.

% \begin{proof}
% % TODO: FILL IN THE PROOF
% \end{proof}

% This theorem is a very important theorem in non-commutative probability theory; it states that any state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.

The counterpart of the random variable in the non-commutative probability theory is called an observable, which is a Hermitian operator on $\mathscr{H}$ (for all $\psi,\phi$ in the domain of $A$, we have $\langle A\psi,\phi\rangle=\langle\psi,A\phi\rangle$. This kind of operator ensures that our outcome interpreted as probability is a real number).

\begin{defn}
	\label{defn:observable}
	Observable:

	Let $\mathscr{B}(\mathbb{R})$ be the set of all Borel sets on $\mathbb{R}$.

	A random variable on the Hilbert space $\mathscr{H}$ is a projection-valued map (measure) $P:\mathscr{B}(\mathbb{R})\to\mathscr{P}$.

	With the following properties:
	\begin{itemize}
		\item $P(\emptyset)=O$ (the zero projection)
		\item $P(\mathbb{R})=I$ (the identity projection)
		\item For any sequence $A_1,A_2,\cdots,A_n\in \mathscr{B}(\mathbb{R})$, the following holds:  
		\begin{itemize}
			\item $P(\bigcup_{i=1}^n A_i)=\bigvee_{i=1}^n P(A_i)$  
			\item $P(\bigcap_{i=1}^n A_i)=\bigwedge_{i=1}^n P(A_i)$  
			\item $P(A^c)=I-P(A)$
			\item If $A_j$ are mutually disjoint (that is $P(A_i)P(A_j)=P(A_j)P(A_i)=O$ for $i\neq j$), then $P(\bigcup_{j=1}^n A_j)=\sum_{j=1}^n P(A_j)$
		\end{itemize}
	\end{itemize}
\end{defn}

\begin{defn}
	\label{defn:probability_of_random_variable}
	Probability of a random variable:

	For a system prepared in state $\rho$, the probability that the random variable given by the projection-valued measure $P$ is in the Borel set $A$ is $\operatorname{Tr}(\rho P(A))$.
\end{defn}

When operators commute, we recover classical probability measures.

\begin{defn}
	\label{defn:measurement}
	Definition of measurement:
        
    A measurement (observation) of a system prepared in a given state produces an outcome $x$, $x$ is a physical event that is a subset of the set of all possible outcomes. For each $x$, we associate a measurement operator $M_x$ on $\mathscr{H}$.

    Given the initial state (pure state, unit vector) $u$, the probability of measurement outcome $x$ is given by:
    \[
    p(x)=\|M_xu\|^2
    \]

    Note that to make sense of this definition, the collection of measurement operators $\{M_x\}$ must satisfy the completeness requirement:
    \[
    1=\sum_{x\in X} p(x)=\sum_{x\in X}\|M_xu\|^2=\sum_{x\in X}\langle M_xu,M_xu\rangle=\langle u,(\sum_{x\in X}M_x^*M_x)u\rangle
    \]
    So $\sum_{x\in X}M_x^*M_x=I$.

\end{defn}

\begin{prop}
	\label{prop:indistinguishability}
	Proposition of indistinguishability:

    Suppose that we have two systems $u_1,u_2\in \mathscr{H}_1$, the two states are distinguishable if and only if they are orthogonal.
\end{prop}

\begin{proof}

	Ways to distinguish the two states:
    \begin{enumerate}
        \item Set $X=\{0,1,2\}$ and $M_i=|u_i\rangle\langle u_i|$, $M_0=I-M_1-M_2$
        \item Then $\{M_0,M_1,M_2\}$ is a complete collection of measurement operators on $\mathscr{H}$.
        \item Suppose the prepared state is $u_1$, then $p(1)=\|M_1u_1\|^2=\|u_1\|^2=1$, $p(2)=\|M_2u_1\|^2=0$, $p(0)=\|M_0u_1\|^2=0$.
    \end{enumerate}

    If they are not orthogonal, then there is no choice of measurement operators to perfectly distinguish the two states.

\end{proof}

\textit{Intuitively, if the two states are not orthogonal, then for any measurement (projection) there exists non-zero probability of getting the same outcome for both states.}

Here is Table~\ref{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory} summarizing the analog of classical probability theory and non-commutative (\textit{quantum}) probability theory~\cite{Feres}:

\begin{table}
	\centering
	\renewcommand{\arraystretch}{1.5}
	\caption{Analog of classical probability theory and non-commutative (\textit{quantum}) probability theory}
	\label{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory}
{\tiny
	\begin{tabular}{|p{0.5\linewidth}|p{0.5\linewidth}|}
		\hline
		\textbf{Classical probability} & \textbf{Non-commutative probability} \\
		\hline
		Sample space $\Omega$, cardinality $\vert\Omega\vert=n$, example: $\Omega=\{0,1\}$ & Complex Hilbert space $\mathscr{H}$, dimension $\dim\mathscr{H}=n$, example: $\mathscr{H}=\mathbb{C}^2$ \\
		\hline
		Common algebra of $\mathbb{C}$ valued functions & Algebra of bounded operators $\mathscr{B}(\mathscr{H})$ \\
		\hline
		$f\mapsto \bar{f}$ complex conjugation & $P\mapsto P^*$ adjoint \\
		\hline
		Events: indicator functions of sets & Projections: space of orthogonal projections $\mathscr{P}\subseteq\mathscr{B}(\mathscr{H})$ \\
		\hline
		functions $f$ such that $f^2=f=\overline{f}$ & orthogonal projections $P$ such that $P^*=P=P^2$ \\
		\hline
		$\mathbb{R}$-valued functions $f=\overline{f}$ & self-adjoint operators $A=A^*$ \\
		\hline
		$\mathbb{I}_{f^{-1}(\{\lambda\})}$ is the indicator function of the set $f^{-1}(\{\lambda\})$ & $P(\lambda)$ is the orthogonal projection to eigenspace \\
		\hline
		$f=\sum_{\lambda\in \operatorname{Range}(f)}\lambda \mathbb{I}_{f^{-1}(\{\lambda\})}$ & $A=\sum_{\lambda\in \operatorname{sp}(A)}\lambda P(\lambda)$ \\
		\hline
		Probability measure $\mu$ on $\Omega$ & Density operator $\rho$ on $\mathscr{H}$ \\
		\hline
		Delta measure $\delta_\omega$ & Pure state $\rho=\vert\psi\rangle\langle\psi\vert$ \\
		\hline
		$\mu$ is non-negative measure and $\sum_{i=1}^n\mu(\{i\})=1$ & $\rho$ is positive semi-definite and $\operatorname{Tr}(\rho)=1$ \\
		\hline
		Expected value of random variable $f$ is $\mathbb{E}_{\mu}(f)=\sum_{i=1}^n f(i)\mu(\{i\})$ & Expected value of operator $A$ is $\mathbb{E}_\rho(A)=\operatorname{Tr}(\rho A)$ \\
		\hline
		Variance of random variable $f$ is $\operatorname{Var}_\mu(f)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))^2\mu(\{i\})$ & Variance of operator $A$ is $\operatorname{Var}_\rho(A)=\operatorname{Tr}(\rho A^2)-\operatorname{Tr}(\rho A)^2$ \\
		\hline
		Covariance of random variables $f$ and $g$ is $\operatorname{Cov}_\mu(f,g)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))(g(i)-\mathbb{E}_\mu(g))\mu(\{i\})$ & Covariance of operators $A$ and $B$ is $\operatorname{Cov}_\rho(A,B)=\operatorname{Tr}(\rho A\circ B)-\operatorname{Tr}(\rho A)\operatorname{Tr}(\rho B)$ \\
		\hline
		Composite system is given by Cartesian product of the sample spaces $\Omega_1\times\Omega_2$ & Composite system is given by tensor product of the Hilbert spaces $\mathscr{H}_1\otimes\mathscr{H}_2$ \\
		\hline
		Product measure $\mu_1\times\mu_2$ on $\Omega_1\times\Omega_2$ & Tensor product of space $\rho_1\otimes\rho_2$ on $\mathscr{H}_1\otimes\mathscr{H}_2$ \\
		\hline
		Marginal distribution $\pi_*v$ & Partial trace $\operatorname{Tr}_2(\rho)$ \\
		\hline
	\end{tabular}
	}
	\vspace{0.5cm}
\end{table}

\section{Concentration of measure phenomenon}

\begin{defn}
	$\eta$-Lipschitz function

    Let $(X,\operatorname{dist}_X)$ and $(Y,\operatorname{dist}_Y)$ be two metric spaces. A function $f:X\to Y$ is said to be $\eta$-Lipschitz if there exists a constant $L\in \mathbb{R}$ such that
    \[
    \operatorname{dist}_Y(f(x),f(y))\leq L\operatorname{dist}_X(x,y)
    \]
    for all $x,y\in X$. And $\eta=\|f\|_{\operatorname{Lip}}=\inf_{L\in \mathbb{R}}L$.
\end{defn}

That basically means that the function $f$ should not change the distance between any two pairs of points in $X$ by more than a factor of $L$.

\begin{lemma}
	\label{lemma:isoperimetric_inequality_on_sphere}
	Isoperimetric inequality on the sphere:

    Let $\sigma_n(A)$ denote the normalized area of $A$ on the $n$-dimensional sphere $S^n$. That is, $\sigma_n(A)\coloneqq\frac{\operatorname{Area}(A)}{\operatorname{Area}(S^n)}$.

    Let $\epsilon>0$. Then for any subset $A\subset S^n$, given the area $\sigma_n(A)$, the spherical caps minimize the volume of the $\epsilon$-neighborhood of $A$.

    Suppose $\sigma^n(\cdot)$ is the normalized volume measure on the sphere $S^n(1)$, then for any closed subset $\Omega\subset S^n(1)$, we take a metric ball $B_\Omega$ of $S^n(1)$ with $\sigma^n(B_\Omega)=\sigma^n(\Omega)$. Then we have

    \[
    \sigma^n(U_r(\Omega))\geq \sigma^n(U_r(B_\Omega))
    \]

    where $U_r(A)=\{x\in X:d(x,A)< r\}$
\end{lemma}

Intuitively, the lemma means that the spherical caps are the most efficient way to cover the sphere.

Here, the efficiency is measured by the epsilon-neighborhood of the boundary of the spherical cap.

To prove the lemma, we need to have a good understanding of the Riemannian geometry of the sphere. For now, let's just take the lemma for granted.

\subsection{Levy's concentration theorem}

\begin{theorem}
	\label{theorem:Levy's_concentration_theorem}
	Levy's concentration theorem:

    An arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does.

    That is,
    \[
    \mu\{x\in S^n: |f(x)-a_0|\geq\epsilon\} < \kappa_n(\epsilon)\leq 2\exp\left(-\frac{(n-1)\epsilon^2}{2}\right)
    \]
    where 
    \[
    \kappa_n(\epsilon)=\frac{\int_\epsilon^{\frac{\pi}{2}}\cos^{n-1}(t)dt}{\int_0^{\frac{\pi}{2}}\cos^{n-1}(t)dt}
    \]
    $a_0$ is the \textbf{Levy mean} of function $f$, that is, the level set $f^{-1}:\mathbb{R}\to S^n$ divides the sphere into equal halves, characterized by the following equality:
    \[
    \mu(f^{-1}(-\infty,a_0])\geq \frac{1}{2} \text{ and } \mu(f^{-1}[a_0,\infty))\geq \frac{1}{2}
    \]
\end{theorem}

We will prove the theorem via the Maxwell-Boltzmann distribution law.~\cite{shioya2014metricmeasuregeometry}

\begin{defn}
	\label{defn:Gaussian_measure}
	Gaussian measure:

    We denote the Gaussian measure on $\mathbb{R}^k$ as $\gamma^k$.

    $$
    d\gamma^k(x)\coloneqq\frac{1}{\sqrt{2\pi}^k}\exp(-\frac{1}{2}\|x\|^2)dx
    $$
    
    $x\in \mathbb{R}^k$, $\|x\|^2=\sum_{i=1}^k x_i^2$ is the Euclidean norm, and $dx$ is the Lebesgue measure on $\mathbb{R}^k$.
    
\end{defn}

Basically, you can consider the Gaussian measure as the normalized Lebesgue measure on $\mathbb{R}^k$ with standard deviation $1$.

It also has another name, the Projective limit theorem.~\cite{romanvershyni}

If $X\sim \operatorname{Unif}(S^n(\sqrt{n}))$, then for any fixed unit vector $x$ we have $\langle X,x\rangle\to N(0,1)$ in distribution as $n\to \infty$.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth]{./images/maxwell.png}
    \caption{Maxwell-Boltzmann distribution law, image from \cite{romanvershyni}}
    \label{fig:Maxwell-Boltzmann_distribution_law}
\end{figure}

\begin{lemma}
	\label{lemma:Maxwell-Boltzmann_distribution_law}
    Maxwell-Boltzmann distribution law:
    
    For any natural number $k$,
    \[
    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}\to \frac{d\gamma^k(x)}{dx}
    \]
    where $(\pi_{n,k})_*\sigma^n$ is the push-forward measure of $\sigma^n$ by $\pi_{n,k}$.
    
    In other words,
    \[
    (\pi_{n,k})_*\sigma^n\to \gamma^k\text{ weakly as }n\to \infty
    \]
\end{lemma}

\begin{proof}
    We denote the $n$-dimensional volume measure on $\mathbb{R}^k$ as $\operatorname{vol}_k$.
    
    Observe that $\pi_{n,k}^{-1}(x),x\in \mathbb{R}^k$ is isometric to $S^{n-k}(\sqrt{n-\|x\|^2})$, that is, for any $x\in \mathbb{R}^k$, $\pi_{n,k}^{-1}(x)$ is a sphere with radius $\sqrt{n-\|x\|^2}$ (by the definition of $\pi_{n,k}$).
    
    So,
    \[
    \begin{aligned}
    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}&=\frac{\operatorname{vol}_{n-k}(\pi_{n,k}^{-1}(x))}{\operatorname{vol}_k(S^n(\sqrt{n}))}\\
    &=\frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}\\
    \end{aligned}
    \]
    as $n\to \infty$.
    
    Note that $\lim_{n\to \infty}(1-\frac{a}{n})^n=e^{-a}$ for any $a>0$.
    
    $(n-\|x\|^2)^{\frac{n-k}{2}}=\left(n(1-\frac{\|x\|^2}{n})\right)^{\frac{n-k}{2}}\to n^{\frac{n-k}{2}}\exp(-\frac{\|x\|^2}{2})$
    
    So
    \[
    \begin{aligned}
    \frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}&=\frac{e^{-\frac{\|x\|^2}{2}}}{\int_{x\in \mathbb{R}^k}e^{-\frac{\|x\|^2}{2}}dx}\\
    &=\frac{1}{(2\pi)^{\frac{k}{2}}}e^{-\frac{\|x\|^2}{2}}\\
    &=\frac{d\gamma^k(x)}{dx}
    \end{aligned}
    \]
\end{proof}

Now we can prove Levy's concentration theorem, the proof is from~\cite{shioya2014metricmeasuregeometry}.

\begin{proof}
    Let $f_n:S^n(\sqrt{n})\to \mathbb{R}$, $n=1,2,\ldots$, be 1-Lipschitz functions.

    Let $x$ and $x'$ be two given real numbers and $\gamma^1(-\infty,x]=\overline{\sigma}_\infty[-\infty,x']$, suppose $\sigma_\infty\{x'\}=0$, where $\{\sigma_i\}$ is a sequence of Borel probability measures on $\mathbb{R}$.

    We want to show that, for all non-negative real numbers $\epsilon_1$ and $\epsilon_2$.

    $$
    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]\geq \gamma^1[x-\epsilon_1,x+\epsilon_2]
    $$
    
    Consider the two spherical cap $\Omega_+\coloneq \{f_{n_i}\geq x'\}$ and $\Omega_-\coloneq \{f_{n_i}\leq x\}$. Note that $\Omega_+\cup \Omega_-=S^{n_i}(\sqrt{n_i})$.

    It is sufficient to show that,

    $$
    U_{\epsilon_1}(\Omega_+)\cup U_{\epsilon_2}(\Omega_-)\subset \{x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2\}
    $$

    By 1-Lipschitz continuity of $f_{n_i}$, we have for all $\zeta\in U_{\epsilon_1}(\Omega_+)$, there is a point $\xi\in \Omega_+$ such that $d(\zeta,\xi)\leq \epsilon_1$. So $U_{\epsilon_1}(\Omega_+)\subset \{f_{n_i}\geq x'-\epsilon_1\}$. With the same argument, we have $U_{\epsilon_2}(\Omega_-)\subset \{f_{n_i}\leq x+\epsilon_2\}$.

    So the push-forward measure of $(f_{n_i})_*\sigma^{n_i}$ of $[x'-\epsilon_1,x'+\epsilon_2]$ is

    \[
    \begin{aligned}
    (f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]&=\sigma^{n_i}(x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2)\\
    &\geq \sigma^{n_i}(U_{\epsilon_1}(\Omega_+)\cap U_{\epsilon_2}(\Omega_-))\\
    &=\sigma^{n_i}(U_{\epsilon_1}(\Omega_+))+\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))-1\\
    \end{aligned}
    \]

    By the lemma~\ref{lemma:isoperimetric_inequality_on_sphere}, we have

    \[
    \sigma^{n_i}(U_{\epsilon_1}(\Omega_+))\geq \sigma^{n_i}(U_{\epsilon_1}(B_{\Omega_+}))\quad \text{and} \quad \sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\geq \sigma^{n_i}(U_{\epsilon_2}(B_{\Omega_-}))
    \]

    By the lemma~\ref{lemma:Maxwell-Boltzmann_distribution_law}, we have

    \[
    \sigma^{n_i}(U_{\epsilon_1}(\Omega_+))+\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\to \gamma^1[x'-\epsilon_1,x'+\epsilon_2]+\gamma^1[x-\epsilon_1,x+\epsilon_2]
    \]

    Therefore,

    $$
    \begin{aligned}
    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]&\geq \liminf_{i\to \infty}(f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]\\
    &\geq \gamma^1[x'-\epsilon_1,\infty)\cap \gamma^1(-\infty,x+\epsilon_2]-1\\
    &=\gamma^1[x-\epsilon_1,x+\epsilon_2]
    \end{aligned}
    $$

\end{proof}

The full proof of Levy's concentration theorem requires more digestion for cases where $\overline{\sigma}_\infty\neq \delta_{\pm\infty}$ but I don't have enough time to do so. This section may be filled in the next semester.

\section{The application of the concentration of measure phenomenon in non-commutative probability theory}

In quantum communication, we can pass classical bits by sending quantum states. However, by the indistinguishability (Proposition~\ref{prop:indistinguishability}) of quantum states, we cannot send an infinite number of classical bits over a single qubit. There exists a bound for zero-error classical communication rate over a quantum channel.

\begin{theorem}
	\label{theorem:Holevo_bound}
	Holevo bound:

	The maximal amount of classical information that can be transmitted by a quantum system is given by the Holevo bound. $\log_2(d)$ is the maximum amount of classical information that can be transmitted by a quantum system with $d$ levels (that is, basically, the number of qubits).
\end{theorem}

The proof of the Holevo bound can be found in~\cite{Nielsen_Chuang_2010}. In current state of the project, this theorem is not heavily used so we will not make annotated proof here.

\subsection{Quantum communication}

To surpass the Holevo bound, we need to use the entanglement of quantum states.

\begin{defn}
	\label{defn:Bell_state}
	Bell state:
        
        The Bell states are the following four states:
        
        \[
        |\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle),\quad |\Phi^-\rangle=\frac{1}{\sqrt{2}}(|00\rangle-|11\rangle)
        \]
        \[
        |\Psi^+\rangle=\frac{1}{\sqrt{2}}(|01\rangle+|10\rangle),\quad |\Psi^-\rangle=\frac{1}{\sqrt{2}}(|01\rangle-|10\rangle)
        \]
        These are a basis of the 2-qubit Hilbert space.
\end{defn}


\subsection{Superdense coding and entanglement}

The description of the superdense coding can be found in~\cite{gupta2015functionalanalysisquantuminformation} and~\cite{Hayden}.

Suppose $A$ and $B$ share a Bell state (or other maximally entangled state) $|\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle)$, where $A$ holds the first part and $B$ holds the second part.
        
$A$ wishes to send 2 \textbf{classical bits} to $B$.

$A$ performs one of four Pauli unitaries (some fancy quantum gates named X, Y, Z, I) on the combined state of entangled qubits $\otimes$ one qubit. Then $A$ sends the resulting one qubit to $B$.

This operation extends the initial one entangled qubit to a system of one of four orthogonal Bell states.

$B$ performs a measurement on the combined state of the one qubit and the entangled qubits he holds.

$B$ decodes the result and obtains the 2 classical bits sent by $A$.

\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\textwidth]{superdense_coding.png}
	\caption{Superdense coding, image from \cite{Hayden}}
	\label{fig:superdense_coding}
\end{figure}

Note that superdense coding is a way to send 2 classical bits of information by sending 1 qubit with 1 entangled qubit. \textbf{The role of the entangled qubit} is to help them to distinguish the 4 possible states of the total 3 qubits system where 2 of them (the pair of entangled qubits) are mathematically the same.

Additionally, no information can be gained by measuring a pair of entangled qubits. To send information from  $A$ to $B$, we need to physically send the qubits from $A$ to $B$. That means, we cannot send information faster than the speed of light.

% TODO: FILL the description of the superdense coding here.

\subsection{Hayden's concentration of measure phenomenon}

The application of the concentration of measure phenomenon in the superdense coding can be realized in random sampling the entangled qubits~\cite{Hayden}:

It is a theorem connecting the following mathematical structure:

\begin{figure}[h]
    \centering
    \begin{tikzpicture}[node distance=30mm, thick,
        main/.style={draw, draw=white},
        towards/.style={->},
        towards_imp/.style={->,red},
        mutual/.style={<->}
        ]
        % define nodes
        \node[main] (cp) {$\mathbb{C}P^{d_A d_B-1}$};
        \node[main] (pa) [left of=cp] {$\mathcal{P}(A\otimes B)$};
        \node[main] (sa) [below of=pa] {$S_A$};
        \node[main] (rng) [right of=sa] {$[0,\infty)\subset \mathbb{R}$};

        % draw edges
        \draw[mutual] (cp) -- (pa);
        \draw[towards] (pa) -- node[left] {$\operatorname{Tr}_B$} (sa);
        \draw[towards_imp] (pa) -- node[above right] {$f$} (rng);
        \draw[towards] (sa) -- node[above] {$H(\psi_A)$} (rng);
    \end{tikzpicture}
    \caption{Mathematical structure for Hayden's concentration of measure phenomenon}
    \label{fig:Hayden_concentration_of_measure_phenomenon}
\end{figure}

\begin{itemize}
    \item The red arrow is the concentration of measure effect. $f=H(\operatorname{Tr}_B(\psi))$.
    \item $S_A$ denotes the mixed states on $A$.
\end{itemize}

To prove the concentration of measure phenomenon, we need to analyze the following elements involved in figure~\ref{fig:Hayden_concentration_of_measure_phenomenon}:

First, we need to define what is a random state in a bipartite system. In fact, for pure states, there is a unique uniform distribution under Haar measure that is unitarily invariant.

$U(n)$ is the group of all $n\times n$ \textbf{unitary matrices} over $\mathbb{C}$, 

$$
U(n)=\{A\in \mathbb{C}^{n\times n}: A^*A=AA^*=I_n\}
$$

The uniqueness of such measurement came from the lemma below~\cite{Elizabeth_book}

\begin{lemma}

    Let $(U(n), \| \cdot \|, \mu)$ be a metric measure space where $\| \cdot \|$ is the Hilbert-Schmidt norm and $\mu$ is the measure function.

    The Haar measure on $U(n)$ is the unique probability measure that is invariant under the action of $U(n)$ on itself.
    
    That is, fixing $B\in U(n)$, $\forall A\in U(n)$, $\mu(A\cdot B)=\mu(B\cdot A)=\mu(B)$.
    
    The Haar measure is the unique probability measure that is invariant under the action of $U(n)$ on itself.

\end{lemma}

    
The existence and uniqueness of the Haar measure is a theorem in compact lie group theory. For this research topic, we will not prove it.

A random pure state $\varphi$ is any random variable distributed according to the unitarily invariant probability measure on the pure states $\mathcal{P}(A)$ of the system $A$, denoted by $\varphi\in_R\mathcal{P}(A)$.

It is trivial that for the space of pure state, we can easily apply the Haar measure as the unitarily invariant probability measure since the space of pure state is $S^n$ for some $n$. However, for the case of mixed states, that is a bit complicated and we need to use partial tracing to defined the rank-$s$ random states.

\begin{defn}
    Rank-$s$ random state.

    For a system $A$ and an integer $s\geq 1$, consider the distribution onn the mixed states $\mathcal{S}(A)$ of A induced by the partial trace over the second factor form the uniform distribution on pure states of $A\otimes\mathbb{C}^s$. Any random variable $\rho$ distributed as such will be called a rank-$s$ random states; denoted as $\rho\in_R \mathcal{S}_s(A)$. And $\mathcal{P}(A)=\mathcal{S}_1(A)$.
\end{defn}

Due to time constrains of the projects, the following lemma is demonstrated but not investigated thoroughly through the research:


\begin{lemma}
    \label{pages_lemma}

    Page's lemma for expected entropy of mixed states

    Choose a random pure state $\sigma=|\psi\rangle\langle\psi|$ from $A'\otimes A$.

    The expected value of the entropy of entanglement is known and satisfies a concentration inequality known as Page's formula~\cite{Pages_conjecture,Pages_conjecture_simple_proof,Bengtsson_Życzkowski_2017}[15.72]. The detailed proof is not fully explored in this project and is intended to be done in the next semester.

    \[
    \mathbb{E}[H(\psi_A)] \geq \log_2(d_A)-\frac{1}{2\ln(2)}\frac{d_A}{d_B}
    \]

\end{lemma}

It basically provides a lower bound for the expected entropy of entanglement. Experimentally, we can have the following result (see Figure~\ref{fig:entropy_vs_dim}):

\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\textwidth]{entropy_vs_dim.png}
	\caption{Entropy vs dimension}
	\label{fig:entropy_vs_dim}
\end{figure}

Then we have bound for Lipschitz constant $\eta$ of the map $H(\varphi_A)$

\begin{lemma}
    The Lipschitz constant $\eta$ of $S(\varphi_A)$ is upper bounded by $\sqrt{8}\log_2(d_A)$ for $d_A\geq 3$.
\end{lemma}

From Levy's lemma, we have

If we define $\beta=\frac{1}{\ln(2)}\frac{d_A}{d_B}$, then we have
\[
\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta] \leq \exp\left(-\frac{1}{8\pi^2\ln(2)}\frac{(d_Ad_B-1)\alpha^2}{(\log_2(d_A))^2}\right)
\]
where $d_B\geq d_A\geq 3$~\cite{Hayden_2006}.

Experimentally, we can have the following result:

As the dimension of the Hilbert space increases, the chance of getting an almost maximally entangled state increases (see Figure~\ref{fig:entropy_vs_dA}).

\begin{figure}
	\centering
	\includegraphics[width=0.8\textwidth]{entropy_vs_dA.png}
	\caption{Entropy vs $d_A$}
	\label{fig:entropy_vs_dA}
\end{figure}

In Hayden's work, the result is also extended to the multiparty case~\cite{Hayden}, and the result is still under research and I will show the result in the final report if I have enough time.


% When compiled standalone, print this chapter's references at the end.
\ifSubfilesClassLoaded{
  \printbibliography[title={References for Chapter 1}]
}

\end{document}