diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..4b5a294
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,4 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda"
+}
\ No newline at end of file
diff --git a/chapters/chap0.pdf b/chapters/chap0.pdf
deleted file mode 100644
index e43ac56..0000000
Binary files a/chapters/chap0.pdf and /dev/null differ
diff --git a/chapters/chap2.pdf b/chapters/chap2.pdf
deleted file mode 100644
index 071bb66..0000000
Binary files a/chapters/chap2.pdf and /dev/null differ
diff --git a/codes/README.md b/codes/README.md
new file mode 100644
index 0000000..1143356
--- /dev/null
+++ b/codes/README.md
@@ -0,0 +1,8 @@
+# Simulation
+
+## Define random sampling using standard uniform measure on the unit sphere
+
+## Define and visualize the concentration of measure phenomenon on complex projective space
+
+## Define random sampling using Majorana Stellar representation
+
diff --git a/codes/experiment_v0.1.py b/codes/experiment_v0.1.py
new file mode 100644
index 0000000..561ce39
--- /dev/null
+++ b/codes/experiment_v0.1.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Entropy-based observable-diameter estimator on complex projective space CP^n.
+
+Interpretation
+--------------
+We identify CP^n with the projective pure-state space of C^(n+1).  To define
+an entanglement entropy observable we choose a factorization
+
+    n + 1 = d_A * d_B,
+
+so the projective space is CP^(d_A d_B - 1).  For a projective point [psi],
+represented by a unit vector psi in C^(d_A d_B), define the observable
+
+    S_A([psi]) = -Tr(rho_A log_2 rho_A),
+    rho_A = Tr_B |psi><psi|.
+
+The true observable diameter ObsDiam(X; -kappa) is the supremum over all
+1-Lipschitz observables.  This script only uses the von Neumann entropy
+observable, so it reports:
+
+1) the partial diameter of the push-forward entropy distribution,
+2) an optional Lipschitz-normalized proxy obtained by dividing by an empirical
+   Lipschitz constant estimated with the Fubini-Study metric.
+
+Hence the output is best interpreted as an entropy-based observable-diameter
+proxy, not as the exact observable diameter of CP^n.
+
+Hayden-inspired comparison
+--------------------------
+Hayden/Leung/Winter show that the entanglement entropy of a Haar-random pure
+state is highly concentrated in high dimension.  The script overlays two
+useful theoretical guides:
+
+- a one-sided lower-tail cutoff derived from the standard Hayden bound,
+- a Levy/Hayden scaling width of order (log d_A)/sqrt(d_A d_B), centered at
+  the empirical median, to visualize concentration-of-measure decay.
+
+Sampling method
+---------------
+A Haar-random pure state on C^(d_A d_B) can be generated by normalizing a
+complex Gaussian vector.  Equivalently, we sample a complex Gaussian matrix
+G in C^(d_A x d_B); then vec(G)/||G|| is Haar-random and
+rho_A = G G^* / Tr(G G^*).
+
+Outputs
+-------
+The script writes:
+- a CSV summary table,
+- per-system entropy histograms,
+- a concentration summary plot across dimensions,
+- a normalized observable-proxy plot if Lipschitz estimation is enabled,
+- a tail plot for the largest system.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Sequence, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from tqdm import tqdm
+
+# A commonly used explicit constant in expositions of Hayden's concentration
+# bound in natural logs.  Entropies here are in bits; the constant is unchanged
+# because the exponent only involves the base-independent ratio t / log d_A.
+HAYDEN_C = 1.0 / (8.0 * math.pi ** 2)
+
+
+def parse_dims(spec: str) -> List[Tuple[int, int]]:
+    """Parse ``"4x8,8x16"``-style specs into ``(d_A, d_B)`` pairs with d_A <= d_B.
+
+    Raises ValueError for malformed tokens, dimensions below 2, or no pairs."""
+    pairs: List[Tuple[int, int]] = []
+    for raw in spec.split(","):
+        cleaned = raw.strip().lower()
+        if cleaned == "":
+            continue
+        left, sep, right = cleaned.partition("x")
+        if not sep:
+            raise ValueError(f"Bad dimension token '{raw}'. Use forms like 4x8,8x16.")
+        small, large = sorted((int(left), int(right)))
+        if small <= 1:
+            raise ValueError("Both subsystem dimensions must be >= 2.")
+        pairs.append((small, large))
+    if not pairs:
+        raise ValueError("No dimensions were parsed.")
+    return pairs
+
+
+def haar_matrix(d_a: int, d_b: int, rng: np.random.Generator) -> np.ndarray:
+    shape = (d_a, d_b)  # complex Gaussian matrix; the real part is drawn first
+    re_part, im_part = rng.normal(size=shape), rng.normal(size=shape)
+    return (re_part + 1j * im_part) / math.sqrt(2.0)
+
+
+def reduced_density_from_matrix(g: np.ndarray) -> np.ndarray:
+    """Return rho_A = G G^* / Tr(G G^*) for an unnormalized state matrix G."""
+    gram = g @ g.conj().T
+    gram /= float(np.trace(gram).real)
+    return gram
+
+
+def entropy_bits_from_rho(rho: np.ndarray, tol: float = 1e-14) -> float:
+    """Von Neumann entropy (bits) of Hermitian ``rho``; eigenvalues <= tol are dropped."""
+    spectrum = np.clip(np.linalg.eigvalsh(rho).real, 0.0, 1.0)
+    support = spectrum[spectrum > tol]
+    if support.size == 0:
+        return 0.0
+    return float(-np.sum(support * np.log2(support)))
+
+
+def random_state_and_entropy(
+    d_a: int, d_b: int, rng: np.random.Generator
+) -> Tuple[np.ndarray, float]:
+    """Draw one random state; return (unit vector psi, entropy of rho_A in bits)."""
+    gauss = haar_matrix(d_a, d_b, rng)
+    s_bits = entropy_bits_from_rho(reduced_density_from_matrix(gauss))
+    flat = gauss.reshape(-1)
+    flat /= np.linalg.norm(flat)  # in-place division on the flattened matrix
+    return flat, s_bits
+
+
+def partial_diameter(samples: np.ndarray, mass: float) -> Tuple[float, float, float]:
+    """Shortest interval holding at least ``mass`` of the samples, as (width, left, right).
+
+    Raises ValueError when ``mass`` is outside (0, 1] or ``samples`` is empty."""
+    if not (0.0 < mass and mass <= 1.0):
+        raise ValueError("mass must lie in (0, 1].")
+    ordered = np.sort(np.asarray(samples, dtype=float))
+    count = ordered.size
+    if count == 0:
+        raise ValueError("samples must be non-empty")
+    window = int(math.ceil(mass * count))  # number of points the interval must cover
+    if count == 1 or window <= 1:
+        return 0.0, float(ordered[0]), float(ordered[0])
+    spans = ordered[window - 1 :] - ordered[: count - window + 1]
+    start = int(np.argmin(spans))
+    lo, hi = float(ordered[start]), float(ordered[start + window - 1])
+    return float(hi - lo), lo, hi
+
+
+def fubini_study_distance(psi: np.ndarray, phi: np.ndarray) -> float:
+    magnitude = float(abs(np.vdot(psi, phi)))  # |<psi|phi>|
+    clamped = min(1.0, max(0.0, magnitude))  # keep acos in-domain despite round-off
+    return float(math.acos(clamped))
+
+
+def empirical_lipschitz_constant(
+    states: Sequence[np.ndarray],
+    values: np.ndarray,
+    rng: np.random.Generator,
+    num_pairs: int,
+) -> Tuple[float, float]:
+    """Return (max, 0.99-quantile) of |dv| / d_FS over random pairs of distinct states.
+
+    Yields (nan, nan) if fewer than two states, no pairs requested, or none usable."""
+    total = len(states)
+    if total < 2 or num_pairs <= 0:
+        return float("nan"), float("nan")
+    observed = np.asarray(values, dtype=float)
+    slopes = []
+    for _ in range(num_pairs):
+        a = int(rng.integers(0, total))
+        b = int(rng.integers(0, total - 1))
+        b = b + 1 if b >= a else b  # skip index a so the pair is always distinct
+        dist = fubini_study_distance(states[a], states[b])
+        if dist < 1e-12:
+            continue  # numerically coincident projective points
+        slopes.append(abs(observed[a] - observed[b]) / dist)
+    if not slopes:
+        return float("nan"), float("nan")
+    return float(np.max(slopes)), float(np.quantile(slopes, 0.99))
+
+
+def hayden_mean_lower_bound_bits(d_a: int, d_b: int) -> float:
+    return math.log2(d_a) - d_a / (2.0 * math.log(2.0) * d_b)  # log2(d_A) - d_A / (2 ln(2) d_B)
+
+
+def hayden_beta_bits(d_a: int, d_b: int) -> float:
+    return d_a / (math.log(2.0) * d_b)  # beta = d_A / (ln(2) d_B)
+
+
+def hayden_alpha_bits(d_a: int, d_b: int, kappa: float) -> float:
+    dim = d_a * d_b  # alpha = log2(d_A) * sqrt(ln(1/kappa) / (HAYDEN_C * (dim - 1)))
+    return (math.log2(d_a) / math.sqrt(HAYDEN_C * (dim - 1.0))) * math.sqrt(math.log(1.0 / kappa))
+
+
+def hayden_one_sided_width_bits(d_a: int, d_b: int, kappa: float) -> float:
+    return hayden_beta_bits(d_a, d_b) + hayden_alpha_bits(d_a, d_b, kappa)  # beta + alpha(kappa)
+
+
+def hayden_lower_cutoff_bits(d_a: int, d_b: int, kappa: float) -> float:
+    return math.log2(d_a) - hayden_one_sided_width_bits(d_a, d_b, kappa)  # log2(d_A) - (beta + alpha)
+
+
+def levy_hayden_scaling_width_bits(d_a: int, d_b: int, kappa: float) -> float:
+    dim = d_a * d_b  # dimension of the full space C^(d_A d_B)
+    half_width = (math.log2(d_a) / math.sqrt(HAYDEN_C * (dim - 1.0))) * math.sqrt(math.log(2.0 / kappa))
+    return 2.0 * half_width  # full width; uses ln(2/kappa) where hayden_alpha_bits uses ln(1/kappa)
+
+
+def hayden_deficit_tail_bound_bits(d_a: int, d_b: int, deficits_bits: np.ndarray) -> np.ndarray:
+    """Tail bound exp(-(dim-1) C (t-beta)^2 / log2(d_A)^2), equal to 1 where t <= beta.
+
+    Accepts any array-like (lists, scalars): the input is converted to an array once."""
+    deficits = np.asarray(deficits_bits, dtype=float)
+    # Clamping at zero makes the exponent 0 (bound exactly 1) whenever t <= beta.
+    shifted = np.maximum(deficits - hayden_beta_bits(d_a, d_b), 0.0)
+    exponent = -(d_a * d_b - 1.0) * HAYDEN_C * (shifted ** 2) / (math.log2(d_a) ** 2)
+    return np.clip(np.exp(exponent), 0.0, 1.0)
+
+
+def page_average_entropy_bits(d_a: int, d_b: int) -> float:
+    """Page's exact mean entanglement entropy in bits (formula assumes d_b >= d_a)."""
+    tail = sum(1.0 / k for k in range(d_b + 1, d_a * d_b + 1))
+    correction = (d_a - 1.0) / (2.0 * d_b)
+    return (tail - correction) / math.log(2.0)
+
+
+@dataclass
+class SystemResult:
+    d_a: int  # subsystem dimensions of the factorization d_A x d_B
+    d_b: int
+    projective_dim: int  # d_a * d_b - 1
+    num_samples: int
+    kappa: float
+    mass: float  # 1 - kappa
+    entropy_bits: np.ndarray  # one entropy sample per drawn state
+    partial_diameter_bits: float  # width of the shortest interval holding `mass` of the samples
+    interval_left_bits: float  # endpoints of that interval
+    interval_right_bits: float
+    mean_bits: float
+    median_bits: float
+    std_bits: float  # sample standard deviation (ddof=1); 0.0 for a single sample
+    page_average_bits: float
+    hayden_mean_lower_bits: float
+    hayden_cutoff_bits: float
+    hayden_one_sided_width_bits: float
+    levy_scaling_width_bits: float
+    empirical_lipschitz_max: float  # NaN when no state pair could be evaluated
+    empirical_lipschitz_q99: float
+    normalized_proxy_max: float  # partial diameter / Lipschitz estimate; NaN if unavailable
+    normalized_proxy_q99: float
+
+
+def simulate_system(
+    d_a: int,
+    d_b: int,
+    num_samples: int,
+    kappa: float,
+    rng: np.random.Generator,
+    lipschitz_pairs: int,
+) -> Tuple[SystemResult, List[np.ndarray]]:
+    """Sample random states on C^d_a (x) C^d_b and summarize the entropy observable.
+
+    NOTE(review): all ``num_samples`` state vectors (d_a * d_b complex entries each)
+    are kept and returned, so memory grows with their product; large runs may not fit."""
+    entropies = np.empty(num_samples, dtype=float)
+    states: List[np.ndarray] = []
+    label = f"Simulating system for {d_a}x{d_b} with kappa={kappa}"
+    for k in tqdm(range(num_samples), desc=label, unit="samples"):
+        state, entropies[k] = random_state_and_entropy(d_a, d_b, rng)
+        states.append(state)
+
+    mass = 1.0 - kappa
+    width, left, right = partial_diameter(entropies, mass)
+    lip_max, lip_q99 = empirical_lipschitz_constant(states, entropies, rng, lipschitz_pairs)
+
+    def _normalized(lipschitz: float) -> float:
+        return width / lipschitz if lipschitz > 0 else float("nan")  # NaN > 0 is False
+
+    summary = SystemResult(
+        d_a=d_a, d_b=d_b,
+        projective_dim=d_a * d_b - 1,
+        num_samples=num_samples, kappa=kappa, mass=mass,
+        entropy_bits=entropies,
+        partial_diameter_bits=width,
+        interval_left_bits=left, interval_right_bits=right,
+        mean_bits=float(np.mean(entropies)),
+        median_bits=float(np.median(entropies)),
+        std_bits=float(np.std(entropies, ddof=1)) if num_samples > 1 else 0.0,
+        page_average_bits=page_average_entropy_bits(d_a, d_b),
+        hayden_mean_lower_bits=hayden_mean_lower_bound_bits(d_a, d_b),
+        hayden_cutoff_bits=hayden_lower_cutoff_bits(d_a, d_b, kappa),
+        hayden_one_sided_width_bits=hayden_one_sided_width_bits(d_a, d_b, kappa),
+        levy_scaling_width_bits=levy_hayden_scaling_width_bits(d_a, d_b, kappa),
+        empirical_lipschitz_max=lip_max,
+        empirical_lipschitz_q99=lip_q99,
+        normalized_proxy_max=_normalized(lip_max),
+        normalized_proxy_q99=_normalized(lip_q99),
+    )
+    return summary, states
+
+
+def write_summary_csv(results: Sequence[SystemResult], out_path: Path) -> None:
+    """Write one summary row per system to ``out_path`` as CSV, replacing any existing file.
+
+    Columns mirror the scalar ``SystemResult`` attributes; the raw entropy
+    samples are not written.  The four Lipschitz-related columns carry a unit
+    suffix in the CSV header that the attribute names do not have.
+
+    Args:
+        results: Systems to serialize, written in the given order.
+        out_path: Destination file.
+    """
+    fieldnames = [
+        "d_a",
+        "d_b",
+        "projective_dim",
+        "num_samples",
+        "kappa",
+        "mass",
+        "partial_diameter_bits",
+        "interval_left_bits",
+        "interval_right_bits",
+        "mean_bits",
+        "median_bits",
+        "std_bits",
+        "page_average_bits",
+        "hayden_mean_lower_bits",
+        "hayden_cutoff_bits",
+        "hayden_one_sided_width_bits",
+        "levy_scaling_width_bits",
+        "empirical_lipschitz_max_bits_per_rad",
+        "empirical_lipschitz_q99_bits_per_rad",
+        "normalized_proxy_max_rad",
+        "normalized_proxy_q99_rad",
+    ]
+
+    # CSV columns whose header differs from the attribute they are read from.
+    renamed = {
+        "empirical_lipschitz_max_bits_per_rad": "empirical_lipschitz_max",
+        "empirical_lipschitz_q99_bits_per_rad": "empirical_lipschitz_q99",
+        "normalized_proxy_max_rad": "normalized_proxy_max",
+        "normalized_proxy_q99_rad": "normalized_proxy_q99",
+    }
+
+    def as_row(record):
+        return {
+            column: getattr(record, renamed.get(column, column)) for column in fieldnames
+        }
+
+    with out_path.open("w", newline="") as fh:
+        writer = csv.DictWriter(fh, fieldnames=fieldnames)
+        writer.writeheader()
+        for record in results:
+            writer.writerow(as_row(record))
+
+
+def plot_histogram(result: SystemResult, outdir: Path) -> Path:
+    """Save the entropy histogram with reference lines for one system; return the PNG path."""
+    plt.figure(figsize=(8.5, 5.5))
+    ent = result.entropy_bits
+    plt.hist(ent, bins=40, density=True, alpha=0.75)
+    plt.axvline(math.log2(result.d_a), linestyle="--", linewidth=2, label=r"$\log_2 d_A$")
+    plt.axvline(result.mean_bits, linestyle="-.", linewidth=2, label="empirical mean")
+    plt.axvline(result.page_average_bits, linestyle=":", linewidth=2, label="Page average")
+    local_min = float(np.min(ent))
+    local_max = float(np.max(ent))
+    local_range = max(local_max - local_min, 1e-9)  # floor keeps the limits usable if all samples coincide
+    if result.hayden_cutoff_bits >= local_min - 0.15 * local_range:  # only draw the cutoff near the data
+        plt.axvline(result.hayden_cutoff_bits, linestyle="-", linewidth=2, label="Hayden cutoff")
+    # Use enough digits for the mass: a whole-percent format would show 0.999 as "100%".
+    plt.axvspan(result.interval_left_bits, result.interval_right_bits, alpha=0.18, label=f"shortest {100.0 * result.mass:.6g}% interval")
+    plt.xlim(local_min - 0.12 * local_range, local_max + 0.35 * local_range)
+    plt.xlabel("Entropy of entanglement S_A (bits)")
+    plt.ylabel("Empirical density")
+    plt.title(f"Entropy distribution on CP^{result.projective_dim} via C^{result.d_a} ⊗ C^{result.d_b}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / f"entropy_histogram_{result.d_a}x{result.d_b}.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def plot_tail(result: SystemResult, outdir: Path) -> Path:
+    """Save a log-scale plot of the empirical entropy-deficit tail against the Hayden bound."""
+    # Sort the deficits themselves (ascending) so they pair with the decreasing tail fractions.
+    deficits = np.sort(math.log2(result.d_a) - result.entropy_bits)
+    n = deficits.size
+    ccdf = np.maximum(1.0 - (np.arange(1, n + 1) / n), 1.0 / n)  # floor at 1/n for the log axis
+    x_grid = np.linspace(0.0, max(float(np.max(deficits)), result.hayden_one_sided_width_bits) * 1.05, 250)
+    bound = hayden_deficit_tail_bound_bits(result.d_a, result.d_b, x_grid)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.semilogy(deficits, ccdf, marker="o", linestyle="none", markersize=3, alpha=0.5, label="empirical tail")
+    plt.semilogy(x_grid, bound, linewidth=2, label="Hayden lower-tail bound")
+    plt.axvline(hayden_beta_bits(result.d_a, result.d_b), linestyle="--", linewidth=1.8, label=r"$\beta$")
+    plt.xlabel(r"Entropy deficit $\log_2 d_A - S_A$ (bits)")
+    plt.ylabel(r"Tail probability $\Pr[\log_2 d_A - S_A > t]$")
+    plt.title(f"Entropy-deficit tail for C^{result.d_a} ⊗ C^{result.d_b}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / f"entropy_tail_{result.d_a}x{result.d_b}.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def plot_concentration_summary(results: Sequence[SystemResult], outdir: Path) -> Path:
+    """Plot interval width, standard deviation and mean deficit against projective dimension."""
+    x = np.array([r.projective_dim for r in results], dtype=float)
+    partial_width = np.array([r.partial_diameter_bits for r in results], dtype=float)
+    std = np.array([r.std_bits for r in results], dtype=float)
+    mean_deficit = np.array([math.log2(r.d_a) - r.mean_bits for r in results], dtype=float)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.plot(x, partial_width, marker="o", linewidth=2, label=r"shortest $(1-\kappa)$ entropy interval")
+    plt.plot(x, std, marker="s", linewidth=2, label="empirical standard deviation")
+    plt.plot(x, mean_deficit, marker="^", linewidth=2, label=r"mean deficit $\log_2 d_A - \mathbb{E}S_A$")
+    plt.xlabel(r"Projective dimension $n = d_A d_B - 1$")
+    plt.ylabel(r"Bits")
+    plt.title("Empirical concentration of the entropy observable on CP^n")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / "entropy_partial_diameter_vs_projective_dimension.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def plot_normalized_proxy(results: Sequence[SystemResult], outdir: Path) -> Path | None:
+    """Plot the Lipschitz-normalized widths against projective dimension; None if no q99 proxy is usable."""
+    good = [r for r in results if r.normalized_proxy_q99 == r.normalized_proxy_q99]  # x == x fails only for NaN
+    if not good:
+        return None
+    x = np.array([r.projective_dim for r in good], dtype=float)
+    y_max = np.array([r.normalized_proxy_max for r in good], dtype=float)
+    y_q99 = np.array([r.normalized_proxy_q99 for r in good], dtype=float)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.plot(x, y_max, marker="o", linewidth=2, label="width / sampled Lipschitz max")
+    plt.plot(x, y_q99, marker="s", linewidth=2, label="width / sampled Lipschitz q99")
+    plt.xlabel(r"Projective dimension $n = d_A d_B - 1$")
+    plt.ylabel("Empirical normalized proxy (radians)")
+    plt.title("Lipschitz-normalized entropy proxy for observable diameter")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / "normalized_entropy_proxy_vs_projective_dimension.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def print_console_summary(results: Sequence[SystemResult]) -> None:
+    """Print a fixed-width table with one line per system; a NaN Lipschitz q99 prints as nan."""
+    print("dA dB  CP^n     mean(bits)  part_diam(bits)  Page(bits)  Hayden_cutoff(bits)  L_emp_q99")
+    for r in results:
+        q99 = r.empirical_lipschitz_q99
+        q99_text = "nan" if q99 != q99 else f"{q99:.4f}"
+        left = f"{r.d_a:2d} {r.d_b:2d}  {r.projective_dim:5d}  {r.mean_bits:10.6f}  "
+        right = f"{r.partial_diameter_bits:15.6f}  {r.page_average_bits:10.6f}  {r.hayden_cutoff_bits:20.6f}  "
+        print(left + right + q99_text)
+
+
+def build_argument_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser; the module docstring is reused as the help description."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    add = cli.add_argument
+    # Which systems to simulate and how heavily to sample them.
+    add(
+        "--dims",
+        default="4x4,8x8,12x12,16x16,32x32,64x64,128x128",
+        help="Comma-separated subsystem sizes, e.g. 4x4,8x8,8x16",
+    )
+    add("--samples", type=int, default=10**6, help="Samples per system")
+    add("--kappa", type=float, default=1e-3, help="Observable-diameter loss parameter kappa")
+    add(
+        "--lipschitz-pairs",
+        type=int,
+        default=6000,
+        help="Number of random state pairs used for empirical Lipschitz estimation",
+    )
+
+    # Reproducibility and output location.
+    add("--seed", type=int, default=7, help="RNG seed")
+    add("--outdir", type=str, default="cpn_entropy_output", help="Output directory for CSV and plots")
+    return cli
+
+
+def main() -> None:
+    """Parse CLI options, simulate every requested system, and write the CSV and plots."""
+    args = build_argument_parser().parse_args()
+
+    if not 0.0 < args.kappa < 1.0:
+        raise ValueError("kappa must lie in (0, 1)")
+    if args.samples < 10:
+        raise ValueError("Use at least 10 samples per system")
+
+    dims = parse_dims(args.dims)
+    rng = np.random.default_rng(args.seed)  # one generator shared by all systems
+
+    outdir = Path(args.outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    results: List[SystemResult] = []
+    for d_a, d_b in dims:
+        # The returned state list is not used at this level.
+        result, _ = simulate_system(
+            d_a=d_a, d_b=d_b, num_samples=args.samples, kappa=args.kappa,
+            rng=rng, lipschitz_pairs=args.lipschitz_pairs,
+        )
+        results.append(result)
+        plot_histogram(result, outdir)
+
+    # Report in order of increasing projective dimension.
+    results.sort(key=lambda r: r.projective_dim)
+    summary_path = outdir / "entropy_observable_summary.csv"
+    write_summary_csv(results, summary_path)
+    plot_concentration_summary(results, outdir)
+    plot_normalized_proxy(results, outdir)
+    plot_tail(results[-1], outdir)  # tail plot only for the largest system
+
+    print_console_summary(results)
+    print(f"\nWrote results to: {outdir.resolve()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/codes/experiment_v0.2/__pycache__/config.cpython-312.pyc b/codes/experiment_v0.2/__pycache__/config.cpython-312.pyc
new file mode 100644
index 0000000..d4064c1
Binary files /dev/null and b/codes/experiment_v0.2/__pycache__/config.cpython-312.pyc differ
diff --git a/codes/experiment_v0.2/config.py b/codes/experiment_v0.2/config.py
new file mode 100644
index 0000000..d92d3cb
--- /dev/null
+++ b/codes/experiment_v0.2/config.py
@@ -0,0 +1,24 @@
+"""Edit globals here; no CLI parser is used."""
+from datetime import datetime
+from pathlib import Path
+
+SEED = 7                             # root of the SeedSequence that main.py spawns per-space seeds from
+KAPPA = 1e-3                         # passed to simulate_space as kappa
+NUM_SAMPLES = 10**4                  # samples drawn per space
+LIPSCHITZ_PAIRS = 12_000             # passed to simulate_space as lipschitz_pairs
+LIPSCHITZ_RESERVOIR = 4_096          # passed to simulate_space as lipschitz_reservoir
+MAJORANA_STAR_STATES = 16            # only for visualization
+MAX_STAR_DEGREE = 63                 # avoid unstable huge root-finding plots
+
+BACKEND = "auto"                     # auto | jax | numpy
+JAX_PLATFORM = ""                    # "", "cpu", "gpu"; set before importing JAX
+RESULTS_DIR = Path("./results") / f"exp-{datetime.now():%Y%m%d-%H%M%S}"
+
+# Chosen so the three families have comparable intrinsic dimensions:
+# sphere S^(m-1), CP^(d_A d_B - 1), and Sym^N(C^2) ~ CP^N.
+SPHERE_DIMS = [16, 64, 256, 1024]
+CP_DIMS = [(4, 4), (8, 8), (16, 16), (32, 32)]
+MAJORANA_N = [15, 63, 255, 1023]
+
+# Batch sizes are the main speed knob; reduce CP batches first if memory is tight.
+BATCH = {"sphere": 32_768, "cp": 256, "majorana": 65_536}
\ No newline at end of file
diff --git a/codes/experiment_v0.2/main.py b/codes/experiment_v0.2/main.py
new file mode 100644
index 0000000..8646cc7
--- /dev/null
+++ b/codes/experiment_v0.2/main.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Unified Monte Carlo for S^(m-1), CP^n, and symmetric-state CP^N via Majorana stars."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import numpy as np
+
+import config
+
+if config.JAX_PLATFORM:
+    os.environ["JAX_PLATFORM_NAME"] = config.JAX_PLATFORM
+# Set the platform before the late imports below; the module name matches sampling_pipline.py.
+from sampling_pipline import (  # noqa: E402
+    plot_cross_space_comparison,
+    plot_family_summary,
+    plot_histogram,
+    plot_majorana_stars,
+    plot_tail,
+    simulate_space,
+    write_summary_csv,
+)
+from spaces import ComplexProjectiveSpace, MajoranaSymmetricSpace, UnitSphereSpace  # noqa: E402
+
+
+def main() -> None:
+    """Simulate every configured space, then write plots, a summary CSV and the run configuration."""
+    outdir = Path(config.RESULTS_DIR)
+    outdir.mkdir(parents=True, exist_ok=True)
+    spaces = (
+        [UnitSphereSpace(m) for m in config.SPHERE_DIMS]
+        + [ComplexProjectiveSpace(a, b) for a, b in config.CP_DIMS]
+        + [MajoranaSymmetricSpace(n) for n in config.MAJORANA_N]
+    )
+    # Children 0..len-1 seed the simulations; len..2*len-1 seed the star plots, so seeds[len + i] always exists.
+    seeds = np.random.SeedSequence(config.SEED).spawn(2 * len(spaces))
+    results = []
+
+    for i, space in enumerate(spaces):
+        result = simulate_space(
+            space,
+            num_samples=config.NUM_SAMPLES,
+            batch=config.BATCH[space.family],
+            kappa=config.KAPPA,
+            seed=int(seeds[i].generate_state(1, dtype=np.uint32)[0]),
+            backend=config.BACKEND,
+            lipschitz_pairs=config.LIPSCHITZ_PAIRS,
+            lipschitz_reservoir=config.LIPSCHITZ_RESERVOIR,
+        )
+        results.append(result)
+        plot_histogram(result, outdir)
+        plot_tail(result, space, outdir)
+        # NOTE(review): the branch below imports _sample_stream from a module named "pipeline"; confirm it exists.
+        if space.family == "majorana" and space.N <= config.MAX_STAR_DEGREE:
+            star_seed = int(seeds[len(spaces) + i].generate_state(1, dtype=np.uint32)[0])
+            from pipeline import _sample_stream  # local import to avoid exporting internals
+            states, _ = _sample_stream(space, config.MAJORANA_STAR_STATES, min(config.MAJORANA_STAR_STATES, config.BATCH["majorana"]), star_seed, config.BACKEND, keep_states=True)
+            plot_majorana_stars(space, states, outdir)
+
+    results.sort(key=lambda r: (r.family, r.intrinsic_dim))
+    write_summary_csv(results, outdir / "observable_diameter_summary.csv")
+    for fam in ("sphere", "cp", "majorana"):
+        plot_family_summary(results, fam, outdir)
+    plot_cross_space_comparison(results, outdir)
+
+    with (outdir / "run_config.txt").open("w") as fh:
+        fh.write(
+            f"SEED={config.SEED}\nKAPPA={config.KAPPA}\nNUM_SAMPLES={config.NUM_SAMPLES}\n"
+            f"LIPSCHITZ_PAIRS={config.LIPSCHITZ_PAIRS}\nLIPSCHITZ_RESERVOIR={config.LIPSCHITZ_RESERVOIR}\n"
+            f"BACKEND={config.BACKEND}\nJAX_PLATFORM={config.JAX_PLATFORM}\n"
+            f"SPHERE_DIMS={config.SPHERE_DIMS}\nCP_DIMS={config.CP_DIMS}\nMAJORANA_N={config.MAJORANA_N}\n"
+            f"BATCH={config.BATCH}\n"
+        )
+
+    print("family     dim    mean(bits)   part_diam(bits)   norm_proxy_q99")
+    for r in results:
+        q = f"{r.normalized_proxy_q99:.6g}" if r.normalized_proxy_q99 == r.normalized_proxy_q99 else "nan"
+        print(f"{r.family:8s} {r.intrinsic_dim:5d}  {r.mean:11.6f}  {r.partial_diameter:16.6f}  {q:>14s}")
+    print(f"\nWrote results to: {outdir.resolve()}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/codes/experiment_v0.2/requirements.txt b/codes/experiment_v0.2/requirements.txt
new file mode 100644
index 0000000..33e8458
--- /dev/null
+++ b/codes/experiment_v0.2/requirements.txt
@@ -0,0 +1,11 @@
+numpy>=1.26
+matplotlib>=3.8
+tqdm>=4.66
+# CPU-only JAX
+# jax
+# Apple Metal JAX (experimental; complex64/complex128 currently unsupported)
+# jax-metal
+# NVIDIA Linux JAX
+jax[cuda13]
+# or, if needed:
+# jax[cuda12]
\ No newline at end of file
diff --git a/codes/experiment_v0.2/sampling_pipline.py b/codes/experiment_v0.2/sampling_pipline.py
new file mode 100644
index 0000000..3c13bb7
--- /dev/null
+++ b/codes/experiment_v0.2/sampling_pipline.py
@@ -0,0 +1,324 @@
+from __future__ import annotations
+
+import csv
+import math
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Sequence
+
+import matplotlib.pyplot as plt
+import numpy as np
+from tqdm.auto import tqdm
+
+from spaces import HAS_JAX, MetricMeasureSpace, jax, random
+
+
+@dataclass
+class SystemResult:
+    """Compact record of one simulated metric-measure system."""
+    family: str  # family tag of the space, e.g. "sphere", "cp" or "majorana"
+    label: str  # display name, used as plot title
+    slug: str  # identifier embedded in output file names
+    intrinsic_dim: int  # x-coordinate of the per-family summary plots
+    num_samples: int  # sample count requested for the main pass
+    kappa: float
+    mass: float  # simulate_space sets this to 1 - kappa
+    observable_max: float  # observable ceiling reported by the space
+    values: np.ndarray  # sampled observable values from the main pass
+    partial_diameter: float  # width of the shortest interval carrying `mass`
+    interval_left: float  # left endpoint of that interval
+    interval_right: float  # right endpoint of that interval
+    mean: float
+    median: float
+    std: float  # sample standard deviation (ddof=1); 0.0 for a single sample
+    empirical_lipschitz_max: float  # largest sampled slope, NaN when unavailable
+    empirical_lipschitz_q99: float  # 0.99 quantile of sampled slopes, NaN when unavailable
+    normalized_proxy_max: float  # partial_diameter / empirical_lipschitz_max, or NaN
+    normalized_proxy_q99: float  # partial_diameter / empirical_lipschitz_q99, or NaN
+    theory: dict[str, float] = field(default_factory=dict)  # optional theory values keyed by name
+
+
+def partial_diameter(samples: np.ndarray, mass: float) -> tuple[float, float, float]:
+    """Return (width, left, right) of the narrowest run of sorted samples holding ceil(mass * n) points."""
+    ordered = np.sort(np.asarray(samples, float))
+    count = len(ordered)
+    if count == 0 or not (0.0 < mass <= 1.0):
+        raise ValueError("Need nonempty samples and mass in (0,1].")
+    window = max(1, int(math.ceil(mass * count)))
+    if count == 1 or window <= 1:
+        lo = float(ordered[0])
+        return 0.0, lo, lo
+    # widths[k] is the spread of the k-th run of `window` consecutive order statistics.
+    widths = ordered[window - 1 :] - ordered[: count - window + 1]
+    best = int(np.argmin(widths))
+    return float(widths[best]), float(ordered[best]), float(ordered[best + window - 1])
+
+
+def empirical_lipschitz(
+    space: MetricMeasureSpace,
+    states: np.ndarray,
+    values: np.ndarray,
+    rng: np.random.Generator,
+    num_pairs: int,
+) -> tuple[float, float]:
+    """Return (max, 0.99-quantile) of |f(a) - f(b)| / d(a, b) over random pairs of distinct indices."""
+    total = len(states)
+    if total < 2 or num_pairs <= 0:
+        return float("nan"), float("nan")
+    first = rng.integers(0, total, size=num_pairs)
+    second = rng.integers(0, total - 1, size=num_pairs)
+    second += (second >= first)  # step past `first` so the two indices never coincide
+    dist = space.metric_pairs(states[first], states[second])
+    usable = dist > 1e-12  # drop (near-)coincident states to avoid dividing by ~0
+    if not np.any(usable):
+        return float("nan"), float("nan")
+    slopes = np.abs(values[first] - values[second])[usable] / dist[usable]
+    return float(np.max(slopes)), float(np.quantile(slopes, 0.99))
+
+
+def _sample_stream(
+    space: MetricMeasureSpace,
+    n: int,
+    batch: int,
+    seed: int,
+    backend: str,
+    keep_states: bool,
+) -> tuple[np.ndarray | None, np.ndarray]:
+    """Draw n observable values in chunks of at most `batch`; also collect the states when keep_states is set."""
+    observed = np.empty(n, dtype=np.float32)
+    kept = np.empty((n, space.state_dim), dtype=np.float32 if space.family == "sphere" else np.complex64) if keep_states else None
+    starts = tqdm(range(0, n, batch), desc=f"{space.slug}: {n:,} samples", unit="batch")
+    if backend != "numpy" and HAS_JAX:
+        # JAX path: split off a fresh subkey for every chunk and fetch results with jax.device_get.
+        key = random.PRNGKey(seed)
+        for lo in starts:
+            hi = min(lo + batch, n)
+            key, subkey = random.split(key)
+            pts, obs = space.sample_jax(subkey, hi - lo)
+            observed[lo:hi] = np.asarray(jax.device_get(obs), dtype=np.float32)
+            if keep_states:
+                kept[lo:hi] = np.asarray(jax.device_get(pts), dtype=kept.dtype)
+    else:
+        rng = np.random.default_rng(seed)
+        for lo in starts:
+            hi = min(lo + batch, n)
+            pts, obs = space.sample_np(rng, hi - lo)
+            observed[lo:hi] = obs
+            if keep_states:
+                kept[lo:hi] = pts.astype(kept.dtype)
+    return kept, observed
+
+
+def simulate_space(
+    space: MetricMeasureSpace,
+    *,
+    num_samples: int,
+    batch: int,
+    kappa: float,
+    seed: int,
+    backend: str,
+    lipschitz_pairs: int,
+    lipschitz_reservoir: int,
+) -> SystemResult:
+    """Main Monte Carlo pass (seed) plus a smaller Lipschitz pass (seed + 1 for states, seed + 2 for pairs)."""
+    vals = _sample_stream(space, num_samples, batch, seed, backend, keep_states=False)[1]
+    mass = 1.0 - kappa
+    width, left, right = partial_diameter(vals, mass)
+    lip_max = lip_q99 = float("nan")
+    reservoir = min(lipschitz_reservoir, num_samples)
+    if reservoir >= 2 and lipschitz_pairs > 0:  # otherwise no slope exists; a chunk size <= 0 would also break sampling
+        r_states, r_vals = _sample_stream(space, reservoir, min(batch, lipschitz_reservoir), seed + 1, backend, keep_states=True)
+        lip_max, lip_q99 = empirical_lipschitz(space, r_states, r_vals, np.random.default_rng(seed + 2), lipschitz_pairs)
+    nmax = width / lip_max if lip_max > 0 else float("nan")  # NaN > 0 is False, so NaN slopes stay NaN
+    nq99 = width / lip_q99 if lip_q99 > 0 else float("nan")
+    return SystemResult(
+        family=space.family,
+        label=space.label,
+        slug=space.slug,
+        intrinsic_dim=space.intrinsic_dim,
+        num_samples=num_samples,
+        kappa=kappa,
+        mass=mass,
+        observable_max=space.observable_max,
+        values=vals,
+        partial_diameter=width,
+        interval_left=left,
+        interval_right=right,
+        mean=float(np.mean(vals)),
+        median=float(np.median(vals)),
+        std=float(np.std(vals, ddof=1)) if len(vals) > 1 else 0.0,
+        empirical_lipschitz_max=lip_max,
+        empirical_lipschitz_q99=lip_q99,
+        normalized_proxy_max=nmax,
+        normalized_proxy_q99=nq99,
+        theory=space.theory(kappa),
+    )
+
+
+def write_summary_csv(results: Sequence[SystemResult], out_path: Path) -> None:
+    """Write one flat CSV (UTF-8) with optional theory fields.
+
+    Theory keys seen on any result become extra sorted columns (blank where absent).
+    """
+    columns = {  # CSV column name -> SystemResult attribute
+        "family": "family",
+        "label": "label",
+        "intrinsic_dim": "intrinsic_dim",
+        "num_samples": "num_samples",
+        "kappa": "kappa",
+        "mass": "mass",
+        "observable_max_bits": "observable_max",
+        "partial_diameter_bits": "partial_diameter",
+        "interval_left_bits": "interval_left",
+        "interval_right_bits": "interval_right",
+        "mean_bits": "mean",
+        "median_bits": "median",
+        "std_bits": "std",
+        "empirical_lipschitz_max": "empirical_lipschitz_max",
+        "empirical_lipschitz_q99": "empirical_lipschitz_q99",
+        "normalized_proxy_max": "normalized_proxy_max",
+        "normalized_proxy_q99": "normalized_proxy_q99",
+    }
+    extras = sorted({k for r in results for k in r.theory})
+    # Labels may contain non-ASCII symbols, so do not rely on the locale's default encoding.
+    with out_path.open("w", newline="", encoding="utf-8") as fh:
+        w = csv.DictWriter(fh, fieldnames=[*columns, *extras])
+        w.writeheader()
+        for r in results:
+            row = {column: getattr(r, attr) for column, attr in columns.items()}
+            # Theory values fill the extra columns.
+            row.update(r.theory)
+            w.writerow(row)
+
+
+def plot_histogram(r: SystemResult, outdir: Path) -> None:
+    """Save hist_<slug>.png: sample density with the shortest-mass interval, ceiling, mean and theory markers."""
+    samples = r.values
+    lo, hi = float(np.min(samples)), float(np.max(samples))
+    span = max(hi - lo, 1e-9)  # keeps the x-range non-degenerate when all samples coincide
+    plt.figure(figsize=(8.5, 5.5))
+    plt.hist(samples, bins=48, density=True, alpha=0.75)
+    plt.axvspan(r.interval_left, r.interval_right, alpha=0.18, label=f"shortest {(r.mass):.0%} interval")
+    plt.axvline(r.observable_max, linestyle="--", linewidth=2, label="observable upper bound")
+    plt.axvline(r.mean, linestyle="-.", linewidth=2, label="empirical mean")
+    # Optional theory overlays, drawn only when the result carries them.
+    for key, style in (("page_average_bits", {"linestyle": ":", "label": "Page average"}), ("hayden_cutoff_bits", {"label": "Hayden cutoff"})):
+        if key in r.theory:
+            plt.axvline(r.theory[key], linewidth=2, **style)
+    plt.xlim(lo - 0.1 * span, hi + 0.25 * span)
+    plt.xlabel("Entropy observable (bits)")
+    plt.ylabel("Empirical density")
+    plt.title(r.label)
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    plt.savefig(outdir / f"hist_{r.slug}.png", dpi=180)
+    plt.close()
+
+
+def plot_tail(r: SystemResult, space: MetricMeasureSpace, outdir: Path) -> None:
+    """Save tail_<slug>.png: empirical P(deficit >= t) on a log axis, plus the space's tail bound if it has one."""
+    deficits = np.sort(r.observable_max - r.values)  # ascending deficits below the observable ceiling
+    # Survival function: n - k of the n samples are at least as large as the k-th smallest deficit (k from 0).
+    ccdf = 1.0 - np.arange(deficits.size) / deficits.size
+    x = np.linspace(0.0, max(float(np.max(deficits)), 1e-6), 256)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.semilogy(deficits, ccdf, marker="o", linestyle="none", markersize=3, alpha=0.45, label="empirical tail")
+    bound = space.tail_bound(x)
+    if bound is not None:
+        plt.semilogy(x, bound, linewidth=2, label="theory bound")
+    plt.xlabel("Entropy deficit (bits)")
+    plt.ylabel("Tail probability")
+    plt.title(f"Tail plot: {r.label}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    plt.savefig(outdir / f"tail_{r.slug}.png", dpi=180)
+    plt.close()
+
+
+def plot_family_summary(results: Sequence[SystemResult], family: str, outdir: Path) -> None:
+    """Save summary_<family>.png and, when some q99 proxy is not NaN, normalized_<family>.png."""
+    members = sorted((r for r in results if r.family == family), key=lambda z: z.intrinsic_dim)
+    if not members:
+        return
+    dims = np.array([r.intrinsic_dim for r in members], float)
+    series = (
+        (np.array([r.partial_diameter for r in members], float), "o", r"shortest $(1-\kappa)$ interval"),
+        (np.array([r.std for r in members], float), "s", "empirical std"),
+        (np.array([r.observable_max - r.mean for r in members], float), "^", "mean deficit"),
+    )
+
+    plt.figure(figsize=(8.5, 5.5))
+    for ys, marker, text in series:
+        plt.plot(dims, ys, marker=marker, linewidth=2, label=text)
+    plt.xlabel("Intrinsic dimension")
+    plt.ylabel("Bits")
+    plt.title(f"Concentration summary: {family}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    plt.savefig(outdir / f"summary_{family}.png", dpi=180)
+    plt.close()
+
+    usable = [r for r in members if r.normalized_proxy_q99 == r.normalized_proxy_q99]  # x == x fails only for NaN
+    if not usable:
+        return
+    dims = np.array([r.intrinsic_dim for r in usable], float)
+    plt.figure(figsize=(8.5, 5.5))
+    for attr, marker, text in (("normalized_proxy_max", "o", "width / Lipschitz max"), ("normalized_proxy_q99", "s", "width / Lipschitz q99")):
+        plt.plot(dims, np.array([getattr(r, attr) for r in usable], float), marker=marker, linewidth=2, label=text)
+    plt.xlabel("Intrinsic dimension")
+    plt.ylabel("Normalized proxy")
+    plt.title(f"Lipschitz-normalized proxy: {family}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    plt.savefig(outdir / f"normalized_{family}.png", dpi=180)
+    plt.close()
+
+
+def plot_cross_space_comparison(results: Sequence[SystemResult], outdir: Path) -> None:
+    """Save compare_partial_diameter.png and compare_normalized_proxy.png, one curve per family in each."""
+    marks = {"sphere": "o", "cp": "s", "majorana": "^"}
+    # Each panel: (plotted attribute, skip NaN entries?, y-axis label, title, output file name).
+    panels = (
+        (
+            "partial_diameter", False, "Partial diameter in bits",
+            "Entropy-based observable-diameter proxy: raw width comparison", "compare_partial_diameter.png",
+        ),
+        (
+            "normalized_proxy_q99", True, "Normalized proxy",
+            "Entropy-based observable-diameter proxy: normalized comparison", "compare_normalized_proxy.png",
+        ),
+    )
+    for attr, skip_nan, y_label, title, filename in panels:
+        plt.figure(figsize=(8.8, 5.6))
+        for fam, marker in marks.items():
+            # x != x holds only for NaN.
+            rows = [r for r in results if r.family == fam and not (skip_nan and getattr(r, attr) != getattr(r, attr))]
+            rows.sort(key=lambda z: z.intrinsic_dim)
+            if rows:
+                plt.plot([r.intrinsic_dim for r in rows], [getattr(r, attr) for r in rows], marker=marker, linewidth=2, label=fam)
+        plt.xlabel("Intrinsic dimension")
+        plt.ylabel(y_label)
+        plt.title(title)
+        plt.legend(frameon=False)
+        plt.tight_layout()
+        plt.savefig(outdir / filename, dpi=180)
+        plt.close()
+
+
+def plot_majorana_stars(space: MetricMeasureSpace, states: np.ndarray, outdir: Path) -> None:
+    """Save majorana_stars_<slug>.png: every star of every state as a (longitude, latitude) point."""
+    if not hasattr(space, "majorana_stars") or len(states) == 0:
+        return
+    stars = np.vstack([space.majorana_stars(state) for state in states])
+    lon = np.arctan2(stars[:, 1], stars[:, 0])
+    lat = np.arcsin(np.clip(stars[:, 2], -1.0, 1.0))  # clip keeps arcsin's argument inside [-1, 1]
+    plt.figure(figsize=(8.8, 4.6))
+    plt.scatter(lon, lat, s=10, alpha=0.35)
+    plt.xlim(-math.pi, math.pi)
+    plt.ylim(-math.pi / 2, math.pi / 2)
+    plt.xlabel("longitude")
+    plt.ylabel("latitude")
+    plt.title(f"Majorana stars: {space.label}")
+    plt.tight_layout()
+    plt.savefig(outdir / f"majorana_stars_{space.slug}.png", dpi=180)
+    plt.close()
\ No newline at end of file
diff --git a/codes/experiment_v0.2/spaces.py b/codes/experiment_v0.2/spaces.py
new file mode 100644
index 0000000..1056bee
--- /dev/null
+++ b/codes/experiment_v0.2/spaces.py
@@ -0,0 +1,284 @@
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+
+try:
+    import jax
+    import jax.numpy as jnp
+    from jax import random
+
+    jax.config.update("jax_enable_x64", False)  # explicitly leave 64-bit mode off
+    HAS_JAX = True
+except Exception:  # pragma: no cover
+    jax = jnp = random = None  # placeholders so importing these names from this module still succeeds
+    HAS_JAX = False
+
+HAYDEN_C = 1.0 / (8.0 * math.pi**2)  # constant in the exponent of the entropy tail bound below
+
+
+def entropy_bits_from_probs(p: Any, xp: Any) -> Any:
+    """Entropy in bits along the last axis; entries are clipped to [1e-30, 1] so log2 never sees zero."""
+    clipped = xp.clip(xp.real(p), 1e-30, 1.0)
+    return -xp.sum(xp.log2(clipped) * clipped, axis=-1)
+
+
+def fs_metric_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    """Fubini-Study angle arccos|<x, y>| between paired rows of normalized complex vectors."""
+    overlap = np.abs(np.sum(y * np.conj(x), axis=-1))
+    return np.arccos(np.clip(overlap, 0.0, 1.0))  # clip guards against rounding above 1
+
+
+def sphere_metric_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    """Great-circle angle arccos(<x, y>) between paired rows of real unit vectors."""
+    cosine = np.sum(y * x, axis=-1)
+    return np.arccos(np.clip(cosine, -1.0, 1.0))  # clip guards against rounding outside [-1, 1]
+
+
+class MetricMeasureSpace:
+    """Minimal interface: direct sampler + metric + scalar observable ceiling."""
+
+    family: str = "base"  # family tag; the concrete spaces override it ("sphere", "cp", "majorana")
+
+    @property
+    def label(self) -> str:  # display name
+        raise NotImplementedError
+
+    @property
+    def slug(self) -> str:  # short identifier
+        raise NotImplementedError
+
+    @property
+    def intrinsic_dim(self) -> int:  # dimension of the space itself
+        raise NotImplementedError
+
+    @property
+    def state_dim(self) -> int:  # number of coordinates in one sampled state vector
+        raise NotImplementedError
+
+    @property
+    def observable_max(self) -> float:  # ceiling of the scalar observable
+        raise NotImplementedError
+
+    def sample_np(self, rng: np.random.Generator, batch: int) -> tuple[np.ndarray, np.ndarray]:  # (states, observable values)
+        raise NotImplementedError
+
+    def sample_jax(self, key: Any, batch: int) -> tuple[Any, Any]:  # JAX counterpart of sample_np, driven by a PRNG key
+        raise NotImplementedError
+
+    def metric_pairs(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:  # distance between paired rows of x and y
+        raise NotImplementedError
+
+    def theory(self, kappa: float) -> dict[str, float]:  # optional named theory values; none by default
+        return {}
+
+    def tail_bound(self, deficits: np.ndarray) -> np.ndarray | None:  # optional bound per deficit; None when unavailable
+        return None
+
+
+@dataclass
+class UnitSphereSpace(MetricMeasureSpace):
+    """Uniform points on the real unit sphere S^(dim-1); the observable is the entropy (bits) of the squared coordinates."""
+
+    dim: int
+    family: str = "sphere"
+
+    @property
+    def label(self) -> str:
+        return f"S^{self.dim - 1}"
+
+    @property
+    def slug(self) -> str:
+        return f"sphere_{self.dim}"
+
+    @property
+    def intrinsic_dim(self) -> int:
+        return self.dim - 1
+
+    @property
+    def state_dim(self) -> int:
+        return self.dim
+
+    @property
+    def observable_max(self) -> float:
+        return math.log2(self.dim)  # entropy of the flat distribution over dim outcomes
+
+    def sample_np(self, rng: np.random.Generator, batch: int) -> tuple[np.ndarray, np.ndarray]:
+        pts = rng.normal(size=(batch, self.dim)).astype(np.float32)
+        pts /= np.linalg.norm(pts, axis=1, keepdims=True)  # project Gaussian draws onto the sphere
+        return pts, entropy_bits_from_probs(pts * pts, np).astype(np.float32)
+
+    def sample_jax(self, key: Any, batch: int) -> tuple[Any, Any]:
+        gauss = random.normal(key, (batch, self.dim), dtype=jnp.float32)
+        pts = gauss / jnp.linalg.norm(gauss, axis=1, keepdims=True)
+        return pts, entropy_bits_from_probs(pts * pts, jnp)
+
+    def metric_pairs(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+        return sphere_metric_np(x, y)
+
+
+@dataclass
+class ComplexProjectiveSpace(MetricMeasureSpace):
+    """Haar-random pure states on C^(d_A d_B), observable = entanglement entropy."""
+
+    d_a: int
+    d_b: int
+    family: str = "cp"
+
+    def __post_init__(self) -> None:
+        if self.d_a <= 1 or self.d_b <= 1:
+            raise ValueError("Need d_A,d_B >= 2.")
+        if self.d_a > self.d_b:  # normalize so that d_a is the smaller factor
+            self.d_a, self.d_b = self.d_b, self.d_a
+
+    @property
+    def label(self) -> str:
+        return f"CP^{self.d_a * self.d_b - 1} via C^{self.d_a}⊗C^{self.d_b}"
+
+    @property
+    def slug(self) -> str:
+        return f"cp_{self.d_a}x{self.d_b}"
+
+    @property
+    def intrinsic_dim(self) -> int:
+        return self.d_a * self.d_b - 1
+
+    @property
+    def state_dim(self) -> int:
+        return self.d_a * self.d_b
+
+    @property
+    def observable_max(self) -> float:
+        return math.log2(self.d_a)  # entropy ceiling set by the smaller factor
+
+    def sample_np(self, rng: np.random.Generator, batch: int) -> tuple[np.ndarray, np.ndarray]:
+        g = (rng.normal(size=(batch, self.d_a, self.d_b)) + 1j * rng.normal(size=(batch, self.d_a, self.d_b)))
+        g = (g / math.sqrt(2.0)).astype(np.complex64)
+        g /= np.sqrt(np.sum(np.abs(g) ** 2, axis=(1, 2), keepdims=True))  # unit Frobenius norm per sample
+        rho = g @ np.swapaxes(np.conj(g), 1, 2)  # d_a x d_b times d_b x d_a: one d_a x d_a matrix per sample
+        lam = np.clip(np.linalg.eigvalsh(rho).real, 1e-30, 1.0)
+        return g.reshape(batch, -1), entropy_bits_from_probs(lam, np).astype(np.float32)
+
+    def sample_jax(self, key: Any, batch: int) -> tuple[Any, Any]:
+        k1, k2 = random.split(key)  # independent keys for the real and imaginary parts
+        g = (random.normal(k1, (batch, self.d_a, self.d_b), dtype=jnp.float32)
+             + 1j * random.normal(k2, (batch, self.d_a, self.d_b), dtype=jnp.float32)) / math.sqrt(2.0)
+        g = g / jnp.sqrt(jnp.sum(jnp.abs(g) ** 2, axis=(1, 2), keepdims=True))
+        rho = g @ jnp.swapaxes(jnp.conj(g), -1, -2)
+        lam = jnp.clip(jnp.linalg.eigvalsh(rho).real, 1e-30, 1.0)
+        return g.reshape(batch, -1), entropy_bits_from_probs(lam, jnp)
+
+    def metric_pairs(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+        return fs_metric_np(x, y)
+
+    def theory(self, kappa: float) -> dict[str, float]:
+        """Reference values in bits: Page average, mean lower bound log2(d_a) - beta/2, cutoff and widths."""
+        d = self.d_a * self.d_b
+        beta = self.d_a / (math.log(2.0) * self.d_b)
+        scale = math.log2(self.d_a) / math.sqrt(HAYDEN_C * (d - 1.0))  # shared by the alpha and Levy widths
+        alpha = scale * math.sqrt(math.log(1.0 / kappa))
+        tail = sum(1.0 / k for k in range(self.d_b + 1, d + 1))
+        page = (tail - (self.d_a - 1.0) / (2.0 * self.d_b)) / math.log(2.0)
+        return {
+            "page_average_bits": page,
+            "hayden_mean_lower_bits": math.log2(self.d_a) - 0.5 * beta,
+            "hayden_cutoff_bits": math.log2(self.d_a) - (beta + alpha),
+            "hayden_one_sided_width_bits": beta + alpha,
+            "levy_scaling_width_bits": 2.0 * scale * math.sqrt(math.log(2.0 / kappa)),
+        }
+
+    def tail_bound(self, deficits: np.ndarray) -> np.ndarray:
+        """Bound exp(-(d - 1) C (deficit - beta)^2 / log2(d_a)^2) per deficit; equals 1 for deficits <= beta."""
+        beta = self.d_a / (math.log(2.0) * self.d_b)
+        # Clamping at 0 already yields exactly 1 up to beta, and asarray admits lists and scalars too.
+        shifted = np.maximum(np.asarray(deficits, float) - beta, 0.0)
+        expo = -(self.d_a * self.d_b - 1.0) * HAYDEN_C * shifted**2 / (math.log2(self.d_a) ** 2)
+        return np.clip(np.exp(expo), 0.0, 1.0)
+
+
+@dataclass
+class MajoranaSymmetricSpace(MetricMeasureSpace):
+    """Haar-random symmetric N-qubit states; stars are for visualization only."""
+
+    N: int
+    family: str = "majorana"
+
+    def __post_init__(self) -> None:
+        if self.N < 1:  # the one-qubit reduction below divides by N
+            raise ValueError("Need N >= 1.")
+
+    @property
+    def label(self) -> str:
+        return f"Sym^{self.N}(C^2) ≅ CP^{self.N}"
+
+    @property
+    def slug(self) -> str:
+        return f"majorana_{self.N}"
+
+    @property
+    def intrinsic_dim(self) -> int:
+        return self.N
+
+    @property
+    def state_dim(self) -> int:
+        return self.N + 1
+
+    @property
+    def observable_max(self) -> float:
+        return 1.0  # one-qubit entropy upper bound
+
+    def _rho1_np(self, c: np.ndarray) -> np.ndarray:  # one 2x2 matrix per row of c (presumably Dicke-basis amplitudes; confirm)
+        k = np.arange(self.N + 1, dtype=np.float32)
+        p = np.abs(c) ** 2
+        rho11 = (p * k).sum(axis=1) / self.N
+        coef = np.sqrt((np.arange(self.N, dtype=np.float32) + 1.0) * (self.N - np.arange(self.N, dtype=np.float32))) / self.N
+        off = (np.conj(c[:, :-1]) * c[:, 1:] * coef).sum(axis=1)  # couples neighbouring coefficients k and k + 1
+        rho = np.zeros((len(c), 2, 2), dtype=np.complex64)
+        rho[:, 0, 0], rho[:, 1, 1] = 1.0 - rho11, rho11  # diagonal sums to 1
+        rho[:, 0, 1], rho[:, 1, 0] = off, np.conj(off)  # Hermitian off-diagonal pair
+        return rho
+
+    def _rho1_jax(self, c: Any) -> Any:  # JAX mirror of _rho1_np
+        k = jnp.arange(self.N + 1, dtype=jnp.float32)
+        p = jnp.abs(c) ** 2
+        rho11 = jnp.sum(p * k, axis=1) / self.N
+        kk = jnp.arange(self.N, dtype=jnp.float32)
+        coef = jnp.sqrt((kk + 1.0) * (self.N - kk)) / self.N
+        off = jnp.sum(jnp.conj(c[:, :-1]) * c[:, 1:] * coef, axis=1)
+        rho = jnp.zeros((c.shape[0], 2, 2), dtype=jnp.complex64)
+        rho = rho.at[:, 0, 0].set(1.0 - rho11)
+        rho = rho.at[:, 1, 1].set(rho11)
+        rho = rho.at[:, 0, 1].set(off)
+        rho = rho.at[:, 1, 0].set(jnp.conj(off))
+        return rho
+
+    def sample_np(self, rng: np.random.Generator, batch: int) -> tuple[np.ndarray, np.ndarray]:
+        c = (rng.normal(size=(batch, self.N + 1)) + 1j * rng.normal(size=(batch, self.N + 1)))
+        c = (c / math.sqrt(2.0)).astype(np.complex64)
+        c /= np.linalg.norm(c, axis=1, keepdims=True)  # normalize each coefficient vector
+        lam = np.clip(np.linalg.eigvalsh(self._rho1_np(c)).real, 1e-30, 1.0)
+        return c, entropy_bits_from_probs(lam, np).astype(np.float32)
+
+    def sample_jax(self, key: Any, batch: int) -> tuple[Any, Any]:
+        k1, k2 = random.split(key)  # independent keys for the real and imaginary parts
+        c = (random.normal(k1, (batch, self.N + 1), dtype=jnp.float32)
+             + 1j * random.normal(k2, (batch, self.N + 1), dtype=jnp.float32)) / math.sqrt(2.0)
+        c = c / jnp.linalg.norm(c, axis=1, keepdims=True)
+        lam = jnp.clip(jnp.linalg.eigvalsh(self._rho1_jax(c)).real, 1e-30, 1.0)
+        return c, entropy_bits_from_probs(lam, jnp)
+
+    def metric_pairs(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+        return fs_metric_np(x, y)
+
+    def majorana_stars(self, coeffs: np.ndarray) -> np.ndarray:
+        """Map one symmetric state to its Majorana stars on S^2."""
+        a = np.array([((-1) ** k) * math.sqrt(math.comb(self.N, k)) * coeffs[k] for k in range(self.N + 1)], np.complex128)
+        poly = np.trim_zeros(a[::-1], trim="f")  # highest power first, vanishing leading terms removed
+        roots = np.roots(poly) if len(poly) > 1 else np.empty(0, dtype=np.complex128)
+        r2 = np.abs(roots) ** 2
+        pts = np.c_[2 * roots.real / (1 + r2), 2 * roots.imag / (1 + r2), (r2 - 1) / (1 + r2)]
+        pts = np.vstack([pts, np.tile(np.array([[0.0, 0.0, 1.0]]), (max(self.N - len(pts), 0), 1))])  # pad missing roots with (0, 0, 1)
+        return pts.astype(np.float32)
\ No newline at end of file
diff --git a/codes/reference/cpn_entropy_observable_diameter.py b/codes/reference/cpn_entropy_observable_diameter.py
new file mode 100644
index 0000000..561ce39
--- /dev/null
+++ b/codes/reference/cpn_entropy_observable_diameter.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Entropy-based observable-diameter estimator on complex projective space CP^n.
+
+Interpretation
+--------------
+We identify CP^n with the projective pure-state space of C^(n+1).  To define
+an entanglement entropy observable we choose a factorization
+
+    n + 1 = d_A * d_B,
+
+so the projective space is CP^(d_A d_B - 1).  For a projective point [psi],
+represented by a unit vector psi in C^(d_A d_B), define the observable
+
+    S_A([psi]) = -Tr(rho_A log_2 rho_A),
+    rho_A = Tr_B |psi><psi|.
+
+The true observable diameter ObsDiam(X; -kappa) is the supremum over all
+1-Lipschitz observables.  This script only uses the von Neumann entropy
+observable, so it reports:
+
+1) the partial diameter of the push-forward entropy distribution,
+2) an optional Lipschitz-normalized proxy obtained by dividing by an empirical
+   Lipschitz constant estimated with the Fubini-Study metric.
+
+Hence the output is best interpreted as an entropy-based observable-diameter
+proxy, not as the exact observable diameter of CP^n.
+
+Hayden-inspired comparison
+--------------------------
+Hayden/Leung/Winter show that the entanglement entropy of a Haar-random pure
+state is highly concentrated in high dimension.  The script overlays two
+useful theoretical guides:
+
+- a one-sided lower-tail cutoff derived from the standard Hayden bound,
+- a Levy/Hayden scaling width of order (log d_A)/sqrt(d_A d_B), centered at
+  the empirical median, to visualize concentration-of-measure decay.
+
+Sampling method
+---------------
+A Haar-random pure state on C^(d_A d_B) can be generated by normalizing a
+complex Gaussian vector.  Equivalently, we sample a complex Gaussian matrix
+G in C^(d_A x d_B); then vec(G)/||G|| is Haar-random and
+rho_A = G G^* / Tr(G G^*).
+
+Outputs
+-------
+The script writes:
+- a CSV summary table,
+- per-system entropy histograms,
+- a concentration summary plot across dimensions,
+- a normalized observable-proxy plot if Lipschitz estimation is enabled,
+- a tail plot for the largest system.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Sequence, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from tqdm import tqdm
+
+# A commonly used explicit constant in expositions of Hayden's concentration
+# bound in natural logs.  We keep the entropy in bits, in which the same
+# constant remains after the base conversion in the exponent.
+HAYDEN_C = 1.0 / (8.0 * math.pi ** 2)
+
+
+def parse_dims(spec: str) -> List[Tuple[int, int]]:
+    """Parse a spec such as '4x8,8x16' into (d_a, d_b) pairs ordered so that d_a <= d_b."""
+    parsed: List[Tuple[int, int]] = []
+    for item in spec.split(","):
+        token = item.strip().lower()
+        if not token:
+            # Tolerate empty entries such as a trailing comma.
+            continue
+        if "x" not in token:
+            raise ValueError(f"Bad dimension token '{item}'. Use forms like 4x8,8x16.")
+        left, _, right = token.partition("x")
+        # Sorting puts the smaller factor first, so only it needs the >= 2 check.
+        small, large = sorted((int(left), int(right)))
+        if small <= 1:
+            raise ValueError("Both subsystem dimensions must be >= 2.")
+        parsed.append((small, large))
+    if not parsed:
+        raise ValueError("No dimensions were parsed.")
+    return parsed
+
+
+def haar_matrix(d_a: int, d_b: int, rng: np.random.Generator) -> np.ndarray:
+    """Return a d_a x d_b complex Gaussian matrix: real part drawn first, then imaginary, scaled by 1/sqrt(2)."""
+    re_part, im_part = rng.normal(size=(d_a, d_b)), rng.normal(size=(d_a, d_b))
+    return (re_part + 1j * im_part) / math.sqrt(2.0)
+
+
+def reduced_density_from_matrix(g: np.ndarray) -> np.ndarray:
+    """Return g g^* divided by its trace, i.e. a unit-trace matrix built from g."""
+    gram = g @ g.conj().T
+    total = float(np.trace(gram).real)
+    return gram / total
+
+
+def entropy_bits_from_rho(rho: np.ndarray, tol: float = 1e-14) -> float:
+    """Von Neumann entropy in bits from rho's eigenvalues, ignoring those not above tol."""
+    spectrum = np.clip(np.linalg.eigvalsh(rho).real, 0.0, 1.0)
+    support = spectrum[spectrum > tol]
+    if not support.size:
+        return 0.0
+    return float(-np.sum(support * np.log2(support)))
+
+
+def random_state_and_entropy(
+    d_a: int, d_b: int, rng: np.random.Generator
+) -> Tuple[np.ndarray, float]:
+    """Draw one random pure state; return (flattened unit vector, entanglement entropy in bits)."""
+    gaussian = haar_matrix(d_a, d_b, rng)
+    entropy = entropy_bits_from_rho(reduced_density_from_matrix(gaussian))
+    flat = gaussian.reshape(-1)
+    flat /= np.linalg.norm(flat)  # done after the entropy, which was computed from the unnormalized matrix
+    return flat, entropy
+
+
+def partial_diameter(samples: np.ndarray, mass: float) -> Tuple[float, float, float]:
+    """Return (width, left, right) of the shortest run of sorted samples holding ceil(mass * n) points."""
+    if not 0.0 < mass <= 1.0:
+        raise ValueError("mass must lie in (0, 1].")
+    ordered = np.sort(np.asarray(samples, dtype=float))
+    count = ordered.size
+    if count == 0:
+        raise ValueError("samples must be non-empty")
+    window = int(math.ceil(mass * count))
+    if count == 1 or window <= 1:
+        first = float(ordered[0])
+        return 0.0, first, first
+    # spans[k] is the width of the k-th run of `window` consecutive order statistics.
+    spans = ordered[window - 1 :] - ordered[: count - window + 1]
+    start = int(np.argmin(spans))
+    lo, hi = float(ordered[start]), float(ordered[start + window - 1])
+    return hi - lo, lo, hi
+
+
+def fubini_study_distance(psi: np.ndarray, phi: np.ndarray) -> float:
+    """Return arccos of |<psi, phi>|, with the overlap clamped to [0, 1] before the arccos."""
+    overlap = float(abs(np.vdot(psi, phi)))
+    return float(math.acos(min(1.0, max(0.0, overlap))))
+
+
+def empirical_lipschitz_constant(
+    states: Sequence[np.ndarray],
+    values: np.ndarray,
+    rng: np.random.Generator,
+    num_pairs: int,
+) -> Tuple[float, float]:
+    """Return (max, 0.99-quantile) of |f(a) - f(b)| / d_FS(a, b) over randomly drawn pairs of distinct states."""
+    nan = float("nan")
+    count = len(states)
+    if count < 2 or num_pairs <= 0:
+        return nan, nan
+    observed = np.asarray(values, dtype=float)
+    slopes = []
+    for _ in range(num_pairs):
+        a = int(rng.integers(0, count))
+        b = int(rng.integers(0, count - 1))
+        b += b >= a  # step past `a` so the pair is always two different states
+        gap = fubini_study_distance(states[a], states[b])
+        if gap < 1e-12:
+            continue  # (near-)coincident states give no usable slope
+        slopes.append(abs(observed[a] - observed[b]) / gap)
+    if not slopes:
+        return nan, nan
+    arr = np.asarray(slopes, dtype=float)
+    return float(np.max(arr)), float(np.quantile(arr, 0.99))
+
+
+def hayden_mean_lower_bound_bits(d_a: int, d_b: int) -> float:  # log2(d_a) - d_a / (2 ln2 d_b), i.e. log2(d_a) - beta / 2
+    return math.log2(d_a) - d_a / (2.0 * math.log(2.0) * d_b)
+
+
+def hayden_beta_bits(d_a: int, d_b: int) -> float:  # beta = d_a / (ln2 * d_b); offset shared by the width and tail-bound helpers
+    return d_a / (math.log(2.0) * d_b)
+
+
+def hayden_alpha_bits(d_a: int, d_b: int, kappa: float) -> float:
+    """Return (log2(d_a) / sqrt(C (d_a d_b - 1))) * sqrt(ln(1 / kappa)), with C = HAYDEN_C."""
+    return (math.log2(d_a) / math.sqrt(HAYDEN_C * (d_a * d_b - 1.0))) * math.sqrt(math.log(1.0 / kappa))
+
+
+def hayden_one_sided_width_bits(d_a: int, d_b: int, kappa: float) -> float:  # beta + alpha(kappa)
+    return hayden_beta_bits(d_a, d_b) + hayden_alpha_bits(d_a, d_b, kappa)
+
+
+def hayden_lower_cutoff_bits(d_a: int, d_b: int, kappa: float) -> float:  # log2(d_a) minus the one-sided width
+    return math.log2(d_a) - hayden_one_sided_width_bits(d_a, d_b, kappa)
+
+
+def levy_hayden_scaling_width_bits(d_a: int, d_b: int, kappa: float) -> float:
+    """Return twice (log2(d_a) / sqrt(C (d_a d_b - 1))) * sqrt(ln(2 / kappa)), with C = HAYDEN_C."""
+    half = (math.log2(d_a) / math.sqrt(HAYDEN_C * (d_a * d_b - 1.0))) * math.sqrt(math.log(2.0 / kappa))
+    return 2.0 * half
+
+
+def hayden_deficit_tail_bound_bits(d_a: int, d_b: int, deficits_bits: np.ndarray) -> np.ndarray:
+    """Bound exp(-(d_a d_b - 1) C (deficit - beta)^2 / log2(d_a)^2) per deficit; equals 1 for deficits <= beta."""
+    beta = hayden_beta_bits(d_a, d_b)
+    dim = d_a * d_b
+    log_term = math.log2(d_a)
+    # Clamping at 0 already yields exactly 1 up to beta, and asarray admits lists and scalars too.
+    shifted = np.maximum(np.asarray(deficits_bits, dtype=float) - beta, 0.0)
+    exponent = -(dim - 1.0) * HAYDEN_C * (shifted ** 2) / (log_term ** 2)
+    return np.clip(np.exp(exponent), 0.0, 1.0)
+
+
+def page_average_entropy_bits(d_a: int, d_b: int) -> float:
+    """Page's average entropy in bits; the formula is the exact one for d_b >= d_a."""
+    partial_harmonic = sum(1.0 / k for k in range(d_b + 1, d_a * d_b + 1))
+    correction = (d_a - 1.0) / (2.0 * d_b)
+    return (partial_harmonic - correction) / math.log(2.0)
+
+
+@dataclass
+class SystemResult:  # per-system record filled in by simulate_system
+    d_a: int
+    d_b: int
+    projective_dim: int  # d_a * d_b - 1
+    num_samples: int
+    kappa: float
+    mass: float  # 1 - kappa
+    entropy_bits: np.ndarray  # one sampled entropy per drawn state
+    partial_diameter_bits: float  # width of the shortest interval carrying `mass`
+    interval_left_bits: float  # left endpoint of that interval
+    interval_right_bits: float  # right endpoint of that interval
+    mean_bits: float
+    median_bits: float
+    std_bits: float  # sample standard deviation (ddof=1); 0.0 for a single sample
+    page_average_bits: float  # from page_average_entropy_bits
+    hayden_mean_lower_bits: float  # from hayden_mean_lower_bound_bits
+    hayden_cutoff_bits: float  # from hayden_lower_cutoff_bits
+    hayden_one_sided_width_bits: float  # from hayden_one_sided_width_bits
+    levy_scaling_width_bits: float
+    empirical_lipschitz_max: float  # largest sampled slope, NaN when unavailable
+    empirical_lipschitz_q99: float  # 0.99 quantile of sampled slopes, NaN when unavailable
+    normalized_proxy_max: float  # partial diameter / empirical_lipschitz_max, or NaN
+    normalized_proxy_q99: float  # partial diameter / empirical_lipschitz_q99, or NaN
+
+
+def simulate_system(
+    d_a: int,
+    d_b: int,
+    num_samples: int,
+    kappa: float,
+    rng: np.random.Generator,
+    lipschitz_pairs: int,
+) -> Tuple[SystemResult, List[np.ndarray]]:
+    """Sample num_samples states for a d_a x d_b system and summarize their entropies.
+
+    Returns the SystemResult summary together with the list of sampled states.
+    NOTE(review): every sampled state stays in memory until the function returns.
+    """
+    progress = tqdm(range(num_samples), desc=f"Simulating system for {d_a}x{d_b} with kappa={kappa}", unit="samples")
+    states: List[np.ndarray] = []
+    entropies = np.empty(num_samples, dtype=float)
+    for sample_index in progress:
+        state, entropies[sample_index] = random_state_and_entropy(d_a, d_b, rng)
+        states.append(state)
+
+    # Shortest-interval statistics at mass 1 - kappa, then the sampled Lipschitz constants.
+    target_mass = 1.0 - kappa
+    width, left, right = partial_diameter(entropies, target_mass)
+    lip_max, lip_q99 = empirical_lipschitz_constant(states, entropies, rng, lipschitz_pairs)
+
+    def width_over(lipschitz: float) -> float:
+        # NaN fails the "> 0" test, so NaN and non-positive constants both give NaN.
+        return width / lipschitz if lipschitz > 0 else float("nan")
+
+    summary = SystemResult(
+        d_a=d_a, d_b=d_b,
+        projective_dim=d_a * d_b - 1,
+        num_samples=num_samples, kappa=kappa, mass=target_mass,
+        entropy_bits=entropies,
+        partial_diameter_bits=width, interval_left_bits=left, interval_right_bits=right,
+        mean_bits=float(np.mean(entropies)),
+        median_bits=float(np.median(entropies)),
+        std_bits=float(np.std(entropies, ddof=1)) if num_samples > 1 else 0.0,
+        page_average_bits=page_average_entropy_bits(d_a, d_b),
+        hayden_mean_lower_bits=hayden_mean_lower_bound_bits(d_a, d_b),
+        hayden_cutoff_bits=hayden_lower_cutoff_bits(d_a, d_b, kappa),
+        hayden_one_sided_width_bits=hayden_one_sided_width_bits(d_a, d_b, kappa),
+        levy_scaling_width_bits=levy_hayden_scaling_width_bits(d_a, d_b, kappa),
+        empirical_lipschitz_max=lip_max, empirical_lipschitz_q99=lip_q99,
+        normalized_proxy_max=width_over(lip_max), normalized_proxy_q99=width_over(lip_q99),
+    )
+    return summary, states
+
+
+def write_summary_csv(results: Sequence[SystemResult], out_path: Path) -> None:
+    """Write one CSV row of scalar summary fields per result to out_path.
+
+    The file is opened in "w" mode, so existing content is replaced.
+    The entropy_bits array is not exported.
+
+    Args:
+        results: summaries to export, written in the given order.
+        out_path: destination file, opened with newline="".
+    """
+    # Columns whose header equals the attribute name, in output order.
+    column_to_attribute = {
+        name: name
+        for name in (
+            "d_a",
+            "d_b",
+            "projective_dim",
+            "num_samples",
+            "kappa",
+            "mass",
+            "partial_diameter_bits",
+            "interval_left_bits",
+            "interval_right_bits",
+            "mean_bits",
+            "median_bits",
+            "std_bits",
+            "page_average_bits",
+            "hayden_mean_lower_bits",
+            "hayden_cutoff_bits",
+            "hayden_one_sided_width_bits",
+            "levy_scaling_width_bits",
+        )
+    }
+    # Columns whose header appends a unit suffix to the attribute name.
+    column_to_attribute.update(
+        {
+            "empirical_lipschitz_max_bits_per_rad": "empirical_lipschitz_max",
+            "empirical_lipschitz_q99_bits_per_rad": "empirical_lipschitz_q99",
+            "normalized_proxy_max_rad": "normalized_proxy_max",
+            "normalized_proxy_q99_rad": "normalized_proxy_q99",
+        }
+    )
+
+    def as_row(result):
+        # Read each attribute once, keyed by its CSV header.
+        return {column: getattr(result, attribute) for column, attribute in column_to_attribute.items()}
+
+    # Header first, then one row per result.
+    with out_path.open("w", newline="") as fh:
+        writer = csv.DictWriter(fh, fieldnames=list(column_to_attribute))
+        writer.writeheader()
+        for result in results:
+            writer.writerow(as_row(result))
+
+
+def plot_histogram(result: SystemResult, outdir: Path) -> Path:
+    """Save the entropy histogram with its reference lines for one system and return the PNG path."""
+    ent = result.entropy_bits
+    local_min, local_max = float(np.min(ent)), float(np.max(ent))
+    local_range = max(local_max - local_min, 1e-9)  # avoid a zero-width x-range when all samples coincide
+    plt.figure(figsize=(8.5, 5.5))
+    plt.hist(ent, bins=40, density=True, alpha=0.75)
+    plt.axvline(math.log2(result.d_a), linestyle="--", linewidth=2, label=r"$\log_2 d_A$")
+    plt.axvline(result.mean_bits, linestyle="-.", linewidth=2, label="empirical mean")
+    plt.axvline(result.page_average_bits, linestyle=":", linewidth=2, label="Page average")
+    if result.hayden_cutoff_bits >= local_min - 0.15 * local_range:  # skip a cutoff far left of the samples
+        plt.axvline(result.hayden_cutoff_bits, linestyle="-", linewidth=2, label="Hayden cutoff")
+    # ".0%" rendered mass = 0.999 as "100%"; "g" keeps the significant digits (e.g. "99.9%").
+    mass_label = f"shortest {100.0 * result.mass:g}% interval"
+    plt.axvspan(result.interval_left_bits, result.interval_right_bits, alpha=0.18, label=mass_label)
+    plt.xlim(local_min - 0.12 * local_range, local_max + 0.35 * local_range)
+    plt.xlabel("Entropy of entanglement S_A (bits)")
+    plt.ylabel("Empirical density")
+    plt.title(f"Entropy distribution on CP^{result.projective_dim} via C^{result.d_a} ⊗ C^{result.d_b}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / f"entropy_histogram_{result.d_a}x{result.d_b}.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def plot_tail(result: SystemResult, outdir: Path) -> Path:
+    """Save a semilog plot of the empirical entropy-deficit tail against the Hayden bound; return the PNG path."""
+    # Sort deficits ascending so the i-th smallest pairs with tail mass 1 - i/n (the pairing was reversed before).
+    deficits = np.sort(math.log2(result.d_a) - np.asarray(result.entropy_bits))
+    n = deficits.size
+    ccdf = np.maximum(1.0 - np.arange(1, n + 1) / n, 1.0 / n)  # floor at 1/n: a log axis cannot show 0
+    x_grid = np.linspace(0.0, max(float(np.max(deficits)), result.hayden_one_sided_width_bits) * 1.05, 250)
+    bound = hayden_deficit_tail_bound_bits(result.d_a, result.d_b, x_grid)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.semilogy(deficits, ccdf, marker="o", linestyle="none", markersize=3, alpha=0.5, label="empirical tail")
+    plt.semilogy(x_grid, bound, linewidth=2, label="Hayden lower-tail bound")
+    plt.axvline(hayden_beta_bits(result.d_a, result.d_b), linestyle="--", linewidth=1.8, label=r"$\beta$")
+    plt.xlabel(r"Entropy deficit $\log_2 d_A - S_A$ (bits)")
+    plt.ylabel(r"Tail probability $\Pr[\log_2 d_A - S_A > t]$")
+    plt.title(f"Entropy-deficit tail for C^{result.d_a} ⊗ C^{result.d_b}")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    out_path = outdir / f"entropy_tail_{result.d_a}x{result.d_b}.png"
+    plt.savefig(out_path, dpi=180)
+    plt.close()
+    return out_path
+
+
+def plot_concentration_summary(results: Sequence[SystemResult], outdir: Path) -> Path:
+    """Plot interval width, standard deviation and mean deficit against projective dimension; return the PNG path."""
+    dims = np.array([r.projective_dim for r in results], dtype=float)
+    plt.figure(figsize=(8.5, 5.5))
+    for marker, label, values in (
+        ("o", r"shortest $(1-\kappa)$ entropy interval", [r.partial_diameter_bits for r in results]),
+        ("s", "empirical standard deviation", [r.std_bits for r in results]),
+        ("^", r"mean deficit $\log_2 d_A - \mathbb{E}S_A$", [math.log2(r.d_a) - r.mean_bits for r in results]),
+    ):
+        plt.plot(dims, np.array(values, dtype=float), marker=marker, linewidth=2, label=label)
+    plt.xlabel(r"Projective dimension $n = d_A d_B - 1$")
+    plt.ylabel(r"Bits")
+    plt.title("Empirical concentration of the entropy observable on CP^n")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    target = outdir / "entropy_partial_diameter_vs_projective_dimension.png"
+    plt.savefig(target, dpi=180)
+    plt.close()
+    return target
+
+
+def plot_normalized_proxy(results: Sequence[SystemResult], outdir: Path) -> Path | None:
+    """Plot width / Lipschitz proxies against projective dimension; return None when every q99 proxy is NaN."""
+    usable = [r for r in results if not math.isnan(r.normalized_proxy_q99)]
+    if not usable:
+        return None
+    dims = np.array([r.projective_dim for r in usable], dtype=float)
+    max_curve = np.array([r.normalized_proxy_max for r in usable], dtype=float)
+    q99_curve = np.array([r.normalized_proxy_q99 for r in usable], dtype=float)
+    plt.figure(figsize=(8.5, 5.5))
+    plt.plot(dims, max_curve, marker="o", linewidth=2, label="width / sampled Lipschitz max")
+    plt.plot(dims, q99_curve, marker="s", linewidth=2, label="width / sampled Lipschitz q99")
+    plt.xlabel(r"Projective dimension $n = d_A d_B - 1$")
+    plt.ylabel("Empirical normalized proxy (radians)")
+    plt.title("Lipschitz-normalized entropy proxy for observable diameter")
+    plt.legend(frameon=False)
+    plt.tight_layout()
+    target = outdir / "normalized_entropy_proxy_vs_projective_dimension.png"
+    plt.savefig(target, dpi=180)
+    plt.close()
+    return target
+
+
+def print_console_summary(results: Sequence[SystemResult]) -> None:
+    """Print a column header followed by one fixed-width row per result."""
+    print("dA dB  CP^n     mean(bits)  part_diam(bits)  Page(bits)  Hayden_cutoff(bits)  L_emp_q99")
+    row_format = (
+        "{r.d_a:2d} {r.d_b:2d}  {r.projective_dim:5d}  {r.mean_bits:10.6f}  {r.partial_diameter_bits:15.6f}  "
+        "{r.page_average_bits:10.6f}  {r.hayden_cutoff_bits:20.6f}  {r.empirical_lipschitz_q99:.4f}"
+    )
+    for r in results:
+        print(row_format.format(r=r))  # a NaN q99 formats as "nan" under ".4f"
+
+
+def build_argument_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser; the module docstring is used as its description."""
+    # Each entry is (flag, add_argument keyword arguments).
+    option_specs = (
+        ("--dims", dict(
+            default="4x4,8x8,12x12,16x16,32x32,64x64,128x128",
+            help="Comma-separated subsystem sizes, e.g. 4x4,8x8,8x16",
+        )),
+        ("--samples", dict(type=int, default=10**6, help="Samples per system")),
+        ("--kappa", dict(type=float, default=1e-3, help="Observable-diameter loss parameter kappa")),
+        ("--lipschitz-pairs", dict(
+            type=int,
+            default=6000,
+            help="Number of random state pairs used for empirical Lipschitz estimation",
+        )),
+        ("--seed", dict(type=int, default=7, help="RNG seed")),
+        ("--outdir", dict(type=str, default="cpn_entropy_output", help="Output directory for CSV and plots")),
+    )
+    parser = argparse.ArgumentParser(description=__doc__)
+    # Flags are registered in table order.
+    for flag, keyword_arguments in option_specs:
+        parser.add_argument(flag, **keyword_arguments)
+    return parser
+
+
+def main() -> None:
+    """Parse the CLI, simulate every requested system, and write the CSV, plots and console table."""
+    args = build_argument_parser().parse_args()
+
+    # Reject out-of-range settings before any sampling starts.
+    kappa_in_range = 0.0 < args.kappa < 1.0
+    if not kappa_in_range:
+        raise ValueError("kappa must lie in (0, 1)")
+    if args.samples < 10:
+        raise ValueError("Use at least 10 samples per system")
+
+    dims = parse_dims(args.dims)
+    # A single generator is shared by all systems, in the order parse_dims returns them.
+    rng = np.random.default_rng(args.seed)
+
+    # Create the output directory (and any missing parents) up front.
+    outdir = Path(args.outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    results: List[SystemResult] = []
+    for d_a, d_b in dims:
+        # The sampled states come back as the second item and are not used here.
+        result, _ = simulate_system(d_a, d_b, args.samples, args.kappa, rng, args.lipschitz_pairs)
+        results.append(result)
+        plot_histogram(result, outdir)
+
+    # Cross-system outputs are ordered by projective dimension.
+    results.sort(key=lambda r: r.projective_dim)
+    write_summary_csv(results, outdir / "entropy_observable_summary.csv")
+    plot_concentration_summary(results, outdir)
+    plot_normalized_proxy(results, outdir)
+    # After sorting, the last entry has the largest projective dimension.
+    plot_tail(results[-1], outdir)
+    print_console_summary(results)
+    print(f"\nWrote results to: {outdir.resolve()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/codes/plot_entropy_and_alpha.py b/codes/reference/plot_entropy_and_alpha.py
similarity index 97%
rename from codes/plot_entropy_and_alpha.py
rename to codes/reference/plot_entropy_and_alpha.py
index 920d0f1..5762dda 100644
--- a/codes/plot_entropy_and_alpha.py
+++ b/codes/reference/plot_entropy_and_alpha.py
@@ -1,48 +1,48 @@
-"""
-plot the probability of the entropy of the reduced density matrix of the pure state being greater than log2(d_A) - alpha - beta
-for different alpha values
-
-IGNORE THE CONSTANT C
-
-NOTE there is bug in the program, You should fix it if you want to use the visualization, it relates to the alpha range and you should not plot the prob of 0
-"""
-
-import numpy as np
-import matplotlib.pyplot as plt
-from quantum_states import sample_and_calculate
-from tqdm import tqdm
-
-# Set dimensions
-db = 16
-da_values = [8, 16, 32]
-alpha_range = np.linspace(0, 2, 100)  # Range of alpha values to plot
-n_samples = 100000
-
-plt.figure(figsize=(10, 6))
-
-for da in tqdm(da_values, desc="Processing d_A values"):
-    # Calculate beta according to the formula
-    beta = da / (np.log(2) * db)
-    
-    # Calculate probability for each alpha
-    predicted_probabilities = []
-    actual_probabilities = []
-    for alpha in tqdm(alpha_range, desc=f"Calculating probabilities for d_A={da}", leave=False):
-        # Calculate probability according to the formula
-        # Ignoring constant C as requested
-        prob = np.exp(-(da * db - 1) * alpha**2 / (np.log2(da))**2)
-        predicted_probabilities.append(prob)
-        # Calculate actual probability
-        entropies = sample_and_calculate(da, db, n_samples=n_samples)
-        actual_probabilities.append(np.sum(entropies > np.log2(da) - alpha - beta) / n_samples)
-    
-    # plt.plot(alpha_range, predicted_probabilities, label=f'$d_A={da}$', linestyle='--')
-    plt.plot(alpha_range, actual_probabilities, label=f'$d_A={da}$', linestyle='-')
-
-plt.xlabel(r'$\alpha$')
-plt.ylabel('Probability')
-plt.title(r'$\operatorname{Pr}[H(\psi_A) <\log_2(d_A)-\alpha-\beta]$ vs $\alpha$ for different $d_A$')
-plt.legend()
-plt.grid(True)
-plt.yscale('log')  # Use log scale for better visualization
-plt.show()
+"""
+plot the probability of the entropy of the reduced density matrix of the pure state being greater than log2(d_A) - alpha - beta
+for different alpha values
+
+IGNORE THE CONSTANT C
+
+NOTE there is bug in the program, You should fix it if you want to use the visualization, it relates to the alpha range and you should not plot the prob of 0
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from quantum_states import sample_and_calculate
+from tqdm import tqdm
+
+# Set dimensions
+db = 16
+da_values = [8, 16, 32]
+alpha_range = np.linspace(0, 2, 100)  # Range of alpha values to plot
+n_samples = 100000
+
+plt.figure(figsize=(10, 6))
+
+for da in tqdm(da_values, desc="Processing d_A values"):
+    # Calculate beta according to the formula
+    beta = da / (np.log(2) * db)
+    
+    # Calculate probability for each alpha
+    predicted_probabilities = []
+    actual_probabilities = []
+    for alpha in tqdm(alpha_range, desc=f"Calculating probabilities for d_A={da}", leave=False):
+        # Calculate probability according to the formula
+        # Ignoring constant C as requested
+        prob = np.exp(-(da * db - 1) * alpha**2 / (np.log2(da))**2)
+        predicted_probabilities.append(prob)
+        # Calculate actual probability
+        entropies = sample_and_calculate(da, db, n_samples=n_samples)
+        actual_probabilities.append(np.sum(entropies > np.log2(da) - alpha - beta) / n_samples)
+    
+    # plt.plot(alpha_range, predicted_probabilities, label=f'$d_A={da}$', linestyle='--')
+    plt.plot(alpha_range, actual_probabilities, label=f'$d_A={da}$', linestyle='-')
+
+plt.xlabel(r'$\alpha$')
+plt.ylabel('Probability')
+plt.title(r'$\operatorname{Pr}[H(\psi_A) <\log_2(d_A)-\alpha-\beta]$ vs $\alpha$ for different $d_A$')
+plt.legend()
+plt.grid(True)
+plt.yscale('log')  # Use log scale for better visualization
+plt.show()
diff --git a/codes/plot_entropy_and_da.py b/codes/reference/plot_entropy_and_da.py
similarity index 97%
rename from codes/plot_entropy_and_da.py
rename to codes/reference/plot_entropy_and_da.py
index 445ba14..27d4484 100644
--- a/codes/plot_entropy_and_da.py
+++ b/codes/reference/plot_entropy_and_da.py
@@ -1,52 +1,52 @@
-"""
-plot the probability of the entropy of the reduced density matrix of the pure state being greater than log2(d_A) - alpha - beta
-
-for different d_A values, with fixed alpha and d_B Note, d_B>d_A
-"""
-
-import numpy as np
-import matplotlib.pyplot as plt
-from quantum_states import sample_and_calculate
-from tqdm import tqdm
-
-# Set dimensions
-db = 32
-alpha = 0
-da_range = np.arange(2, 10, 1)  # Range of d_A values to plot
-n_samples = 1000000
-
-plt.figure(figsize=(10, 6))
-
-predicted_probabilities = []
-actual_probabilities = []
-
-for da in tqdm(da_range, desc="Processing d_A values"):
-    # Calculate beta according to the formula
-    beta = da / (np.log(2) * db)
-    
-    # Calculate probability according to the formula
-    # Ignoring constant C as requested
-    prob = np.exp(-((da * db - 1) * alpha**2 / (np.log2(da)**2)))
-    predicted_probabilities.append(prob)
-    # Calculate actual probability
-    entropies = sample_and_calculate(da, db, n_samples=n_samples)
-    count = np.sum(entropies < np.log2(da) - alpha - beta)
-    # early stop if count is 0
-    if count != 0:
-        actual_probabilities.append(count / n_samples)
-    else:
-        actual_probabilities.extend([np.nan] * (len(da_range) - len(actual_probabilities)))
-        break
-    # debug
-    print(f'da={da}, theoretical_prob={prob}, threshold={np.log2(da) - alpha - beta}, actual_prob={actual_probabilities[-1]}, entropy_heads={entropies[:10]}')
-    
-# plt.plot(da_range, predicted_probabilities, label=f'$d_A={da}$', linestyle='--')
-plt.plot(da_range, actual_probabilities, label=f'$d_A={da}$', linestyle='-')
-
-plt.xlabel(r'$d_A$')
-plt.ylabel('Probability')
-plt.title(r'$\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta]$ vs $d_A$ for fixed $\alpha=$'+str(alpha)+r' and $d_B=$' +str(db)+ r' with $n=$' +str(n_samples))
-# plt.legend()
-plt.grid(True)
-plt.yscale('log')  # Use log scale for better visualization
-plt.show()
+"""
+plot the probability of the entropy of the reduced density matrix of the pure state being greater than log2(d_A) - alpha - beta
+
+for different d_A values, with fixed alpha and d_B Note, d_B>d_A
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from quantum_states import sample_and_calculate
+from tqdm import tqdm
+
+# Set dimensions
+db = 32
+alpha = 0
+da_range = np.arange(2, 10, 1)  # Range of d_A values to plot
+n_samples = 1000000
+
+plt.figure(figsize=(10, 6))
+
+predicted_probabilities = []
+actual_probabilities = []
+
+for da in tqdm(da_range, desc="Processing d_A values"):
+    # Calculate beta according to the formula
+    beta = da / (np.log(2) * db)
+    
+    # Calculate probability according to the formula
+    # Ignoring constant C as requested
+    prob = np.exp(-((da * db - 1) * alpha**2 / (np.log2(da)**2)))
+    predicted_probabilities.append(prob)
+    # Calculate actual probability
+    entropies = sample_and_calculate(da, db, n_samples=n_samples)
+    count = np.sum(entropies < np.log2(da) - alpha - beta)
+    # early stop if count is 0
+    if count != 0:
+        actual_probabilities.append(count / n_samples)
+    else:
+        actual_probabilities.extend([np.nan] * (len(da_range) - len(actual_probabilities)))
+        break
+    # debug
+    print(f'da={da}, theoretical_prob={prob}, threshold={np.log2(da) - alpha - beta}, actual_prob={actual_probabilities[-1]}, entropy_heads={entropies[:10]}')
+    
+# plt.plot(da_range, predicted_probabilities, label=f'$d_A={da}$', linestyle='--')
+plt.plot(da_range, actual_probabilities, label=f'$d_A={da}$', linestyle='-')
+
+plt.xlabel(r'$d_A$')
+plt.ylabel('Probability')
+plt.title(r'$\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta]$ vs $d_A$ for fixed $\alpha=$'+str(alpha)+r' and $d_B=$' +str(db)+ r' with $n=$' +str(n_samples))
+# plt.legend()
+plt.grid(True)
+plt.yscale('log')  # Use log scale for better visualization
+plt.show()
diff --git a/codes/plot_entropy_and_deviate.py b/codes/reference/plot_entropy_and_deviate.py
similarity index 97%
rename from codes/plot_entropy_and_deviate.py
rename to codes/reference/plot_entropy_and_deviate.py
index fdf3e36..1b2fbc0 100644
--- a/codes/plot_entropy_and_deviate.py
+++ b/codes/reference/plot_entropy_and_deviate.py
@@ -1,55 +1,55 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from quantum_states import sample_and_calculate
-from tqdm import tqdm
-
-# Set dimensions, keep db\geq da\geq 3
-db = 64
-da_values = [4, 8, 16, 32]
-da_colors = ['b', 'g', 'r', 'c']
-n_samples = 100000
-
-plt.figure(figsize=(10, 6))
-
-# Define range of deviations to test (in bits)
-deviations = np.linspace(0, 1, 50)  # Test deviations from 0 to 1 bits
-
-for i, da in enumerate(tqdm(da_values, desc="Processing d_A values")):
-    # Calculate maximal entropy
-    max_entropy = np.log2(min(da, db))
-    
-    # Sample random states and calculate their entropies
-    entropies = sample_and_calculate(da, db, n_samples=n_samples)
-    
-    # Calculate probabilities for each deviation
-    probabilities = []
-    theoretical_probs = []
-    for dev in deviations:
-        # Count states that deviate by more than dev bits from max entropy
-        count = np.sum(max_entropy - entropies > dev)
-        # Omit the case where count is 0
-        if count != 0:
-            prob = count / len(entropies)
-            probabilities.append(prob)
-        else:
-            probabilities.append(np.nan)
-        
-        # Calculate theoretical probability using concentration inequality
-        # note max_entropy - dev = max_entropy - beta - alpha, so alpha = dev - beta
-        beta = da / (np.log(2)*db)
-        alpha = dev - beta
-        theoretical_prob = np.exp(-(da * db - 1) * alpha**2 / (np.log2(da))**2)
-        # # debug
-        # print(f"dev: {dev}, beta: {beta}, alpha: {alpha}, theoretical_prob: {theoretical_prob}")
-        theoretical_probs.append(theoretical_prob)
-    
-    plt.plot(deviations, probabilities, '-', label=f'$d_A={da}$ (simulated)', color=da_colors[i])
-    plt.plot(deviations, theoretical_probs, '--', label=f'$d_A={da}$ (theoretical)', color=da_colors[i])
-
-plt.xlabel('Deviation from maximal entropy (bits)')
-plt.ylabel('Probability')
-plt.title(f'Probability of deviation from maximal entropy simulation with sample size {n_samples} for $d_B={db}$ ignoring the constant $C$')
-plt.legend()
-plt.grid(True)
-plt.yscale('log')  # Use log scale for better visualization
-plt.show()
+import numpy as np
+import matplotlib.pyplot as plt
+from quantum_states import sample_and_calculate
+from tqdm import tqdm
+
+# Set dimensions, keep db\geq da\geq 3
+db = 64
+da_values = [4, 8, 16, 32]
+da_colors = ['b', 'g', 'r', 'c']
+n_samples = 100000
+
+plt.figure(figsize=(10, 6))
+
+# Define range of deviations to test (in bits)
+deviations = np.linspace(0, 1, 50)  # Test deviations from 0 to 1 bits
+
+for i, da in enumerate(tqdm(da_values, desc="Processing d_A values")):
+    # Calculate maximal entropy
+    max_entropy = np.log2(min(da, db))
+    
+    # Sample random states and calculate their entropies
+    entropies = sample_and_calculate(da, db, n_samples=n_samples)
+    
+    # Calculate probabilities for each deviation
+    probabilities = []
+    theoretical_probs = []
+    for dev in deviations:
+        # Count states that deviate by more than dev bits from max entropy
+        count = np.sum(max_entropy - entropies > dev)
+        # Omit the case where count is 0
+        if count != 0:
+            prob = count / len(entropies)
+            probabilities.append(prob)
+        else:
+            probabilities.append(np.nan)
+        
+        # Calculate theoretical probability using concentration inequality
+        # note max_entropy - dev = max_entropy - beta - alpha, so alpha = dev - beta
+        beta = da / (np.log(2)*db)
+        alpha = dev - beta
+        theoretical_prob = np.exp(-(da * db - 1) * alpha**2 / (np.log2(da))**2)
+        # # debug
+        # print(f"dev: {dev}, beta: {beta}, alpha: {alpha}, theoretical_prob: {theoretical_prob}")
+        theoretical_probs.append(theoretical_prob)
+    
+    plt.plot(deviations, probabilities, '-', label=f'$d_A={da}$ (simulated)', color=da_colors[i])
+    plt.plot(deviations, theoretical_probs, '--', label=f'$d_A={da}$ (theoretical)', color=da_colors[i])
+
+plt.xlabel('Deviation from maximal entropy (bits)')
+plt.ylabel('Probability')
+plt.title(f'Probability of deviation from maximal entropy simulation with sample size {n_samples} for $d_B={db}$ ignoring the constant $C$')
+plt.legend()
+plt.grid(True)
+plt.yscale('log')  # Use log scale for better visualization
+plt.show()
diff --git a/codes/plot_entropy_and_dim.py b/codes/reference/plot_entropy_and_dim.py
similarity index 97%
rename from codes/plot_entropy_and_dim.py
rename to codes/reference/plot_entropy_and_dim.py
index 43ce800..186fdc1 100644
--- a/codes/plot_entropy_and_dim.py
+++ b/codes/reference/plot_entropy_and_dim.py
@@ -1,33 +1,33 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from quantum_states import sample_and_calculate
-from tqdm import tqdm
-
-# Define range of dimensions to test
-fixed_dim = 64
-dimensions = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
-expected_entropies = []
-theoretical_entropies = []
-predicted_entropies = []
-
-# Calculate entropies for each dimension
-for dim in tqdm(dimensions, desc="Calculating entropies"):
-    # For each dimension, we'll keep one subsystem fixed at dim=2 
-    # and vary the other dimension
-    entropies = sample_and_calculate(dim, fixed_dim, n_samples=1000)
-    expected_entropies.append(np.mean(entropies))
-    theoretical_entropies.append(np.log2(min(dim, fixed_dim)))
-    beta = min(dim, fixed_dim)/(2*np.log(2)*max(dim, fixed_dim))
-    predicted_entropies.append(np.log2(min(dim, fixed_dim)) - beta)
-
-# Create the plot
-plt.figure(figsize=(10, 6))
-plt.plot(dimensions, expected_entropies, 'b-', label='Expected Entropy')
-plt.plot(dimensions, theoretical_entropies, 'r--', label='Theoretical Entropy')
-plt.plot(dimensions, predicted_entropies, 'g--', label='Predicted Entropy')
-plt.xlabel('Dimension of Subsystem B')
-plt.ylabel('von Neumann Entropy (bits)')
-plt.title(f'von Neumann Entropy vs. System Dimension, with Dimension of Subsystem A = {fixed_dim}')
-plt.legend()
-plt.grid(True)
-plt.show()
+import numpy as np
+import matplotlib.pyplot as plt
+from quantum_states import sample_and_calculate
+from tqdm import tqdm
+
+# Define range of dimensions to test
+fixed_dim = 64
+dimensions = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
+expected_entropies = []
+theoretical_entropies = []
+predicted_entropies = []
+
+# Calculate entropies for each dimension
+for dim in tqdm(dimensions, desc="Calculating entropies"):
+    # For each dimension, we'll keep one subsystem fixed at dim=2 
+    # and vary the other dimension
+    entropies = sample_and_calculate(dim, fixed_dim, n_samples=1000)
+    expected_entropies.append(np.mean(entropies))
+    theoretical_entropies.append(np.log2(min(dim, fixed_dim)))
+    beta = min(dim, fixed_dim)/(2*np.log(2)*max(dim, fixed_dim))
+    predicted_entropies.append(np.log2(min(dim, fixed_dim)) - beta)
+
+# Create the plot
+plt.figure(figsize=(10, 6))
+plt.plot(dimensions, expected_entropies, 'b-', label='Expected Entropy')
+plt.plot(dimensions, theoretical_entropies, 'r--', label='Theoretical Entropy')
+plt.plot(dimensions, predicted_entropies, 'g--', label='Predicted Entropy')
+plt.xlabel('Dimension of Subsystem B')
+plt.ylabel('von Neumann Entropy (bits)')
+plt.title(f'von Neumann Entropy vs. System Dimension, with Dimension of Subsystem A = {fixed_dim}')
+plt.legend()
+plt.grid(True)
+plt.show()
diff --git a/codes/plot_entropy_and_dim_3d.py b/codes/reference/plot_entropy_and_dim_3d.py
similarity index 97%
rename from codes/plot_entropy_and_dim_3d.py
rename to codes/reference/plot_entropy_and_dim_3d.py
index 2fcc712..2477c51 100644
--- a/codes/plot_entropy_and_dim_3d.py
+++ b/codes/reference/plot_entropy_and_dim_3d.py
@@ -1,51 +1,51 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from quantum_states import sample_and_calculate
-from tqdm import tqdm
-from mpl_toolkits.mplot3d import Axes3D
-
-# Define range of dimensions to test
-dimensionsA = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
-dimensionsB = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
-
-# Create meshgrid for 3D plot
-X, Y = np.meshgrid(dimensionsA, dimensionsB)
-Z = np.zeros_like(X, dtype=float)
-
-# Calculate entropies for each dimension combination
-total_iterations = len(dimensionsA) * len(dimensionsB)
-pbar = tqdm(total=total_iterations, desc="Calculating entropies")
-
-for i, dim_a in enumerate(dimensionsA):
-    for j, dim_b in enumerate(dimensionsB):
-        entropies = sample_and_calculate(dim_a, dim_b, n_samples=100)
-        Z[j,i] = np.mean(entropies)
-        pbar.update(1)
-pbar.close()
-
-# Create the 3D plot
-fig = plt.figure(figsize=(12, 8))
-ax = fig.add_subplot(111, projection='3d')
-
-# Plot the surface
-surf = ax.plot_surface(X, Y, Z, cmap='viridis')
-
-# Add labels and title with larger font sizes
-ax.set_xlabel('Dimension of Subsystem A', fontsize=12, labelpad=10)
-ax.set_ylabel('Dimension of Subsystem B', fontsize=12, labelpad=10)
-ax.set_zlabel('von Neumann Entropy (bits)', fontsize=12, labelpad=10)
-ax.set_title('von Neumann Entropy vs. System Dimensions', fontsize=14, pad=20)
-
-# Add colorbar
-cbar = fig.colorbar(surf, ax=ax, label='Entropy')
-cbar.ax.set_ylabel('Entropy', fontsize=12)
-
-# Add tick labels with larger font size
-ax.tick_params(axis='x', labelsize=10)
-ax.tick_params(axis='y', labelsize=10)
-ax.tick_params(axis='z', labelsize=10)
-
-# Rotate the plot for better visibility
-ax.view_init(elev=30, azim=45)
-
+import numpy as np
+import matplotlib.pyplot as plt
+from quantum_states import sample_and_calculate
+from tqdm import tqdm
+from mpl_toolkits.mplot3d import Axes3D
+
+# Define range of dimensions to test
+dimensionsA = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
+dimensionsB = np.arange(2, 64, 2)  # Test dimensions from 2 to 50 in steps of 2
+
+# Create meshgrid for 3D plot
+X, Y = np.meshgrid(dimensionsA, dimensionsB)
+Z = np.zeros_like(X, dtype=float)
+
+# Calculate entropies for each dimension combination
+total_iterations = len(dimensionsA) * len(dimensionsB)
+pbar = tqdm(total=total_iterations, desc="Calculating entropies")
+
+for i, dim_a in enumerate(dimensionsA):
+    for j, dim_b in enumerate(dimensionsB):
+        entropies = sample_and_calculate(dim_a, dim_b, n_samples=100)
+        Z[j,i] = np.mean(entropies)
+        pbar.update(1)
+pbar.close()
+
+# Create the 3D plot
+fig = plt.figure(figsize=(12, 8))
+ax = fig.add_subplot(111, projection='3d')
+
+# Plot the surface
+surf = ax.plot_surface(X, Y, Z, cmap='viridis')
+
+# Add labels and title with larger font sizes
+ax.set_xlabel('Dimension of Subsystem A', fontsize=12, labelpad=10)
+ax.set_ylabel('Dimension of Subsystem B', fontsize=12, labelpad=10)
+ax.set_zlabel('von Neumann Entropy (bits)', fontsize=12, labelpad=10)
+ax.set_title('von Neumann Entropy vs. System Dimensions', fontsize=14, pad=20)
+
+# Add colorbar
+cbar = fig.colorbar(surf, ax=ax, label='Entropy')
+cbar.ax.set_ylabel('Entropy', fontsize=12)
+
+# Add tick labels with larger font size
+ax.tick_params(axis='x', labelsize=10)
+ax.tick_params(axis='y', labelsize=10)
+ax.tick_params(axis='z', labelsize=10)
+
+# Rotate the plot for better visibility
+ax.view_init(elev=30, azim=45)
+
 plt.show()
\ No newline at end of file
diff --git a/codes/quantum_states.py b/codes/reference/quantum_states.py
similarity index 97%
rename from codes/quantum_states.py
rename to codes/reference/quantum_states.py
index 7c6da27..89e9ea9 100644
--- a/codes/quantum_states.py
+++ b/codes/reference/quantum_states.py
@@ -1,96 +1,96 @@
-import numpy as np
-from scipy.linalg import sqrtm
-from scipy.stats import unitary_group
-from tqdm import tqdm
-
-def random_pure_state(dim_a, dim_b):
-    """
-    Generate a random pure state for a bipartite system.
-
-    The random pure state is uniformly distributed by the Haar (Fubini-Study) measure on the unit sphere $S^{dim_a * dim_b - 1}$. (Invariant under the unitary group $U(dim_a) \times U(dim_b)$)
-    
-    Args:
-        dim_a (int): Dimension of subsystem A
-        dim_b (int): Dimension of subsystem B
-        
-    Returns:
-        numpy.ndarray: Random pure state vector of shape (dim_a * dim_b,)
-    """
-    # Total dimension of the composite system
-    dim_total = dim_a * dim_b
-    
-    # Generate non-zero random complex vector
-    while True:
-        state = np.random.normal(size=(dim_total,)) + 1j * np.random.normal(size=(dim_total,))
-        if np.linalg.norm(state) > 0:
-            break
-    
-    # Normalize the state
-    state = state / np.linalg.norm(state)
-    
-    return state
-
-def von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b):
-    """
-    Calculate the von Neumann entropy of the reduced density matrix.
-    
-    Args:
-        state (numpy.ndarray): Pure state vector
-        dim_a (int): Dimension of subsystem A
-        dim_b (int): Dimension of subsystem B
-        
-    Returns:
-        float: Von Neumann entropy
-    """
-    # Reshape state vector to matrix form
-    state_matrix = state.reshape(dim_a, dim_b)
-    
-    # Calculate reduced density matrix of subsystem A
-    rho_a = np.dot(state_matrix, state_matrix.conj().T)
-    
-    # Calculate eigenvalues
-    eigenvals = np.linalg.eigvalsh(rho_a)
-    
-    # Remove very small eigenvalues (numerical errors)
-    eigenvals = eigenvals[eigenvals > 1e-15]
-    
-    # Calculate von Neumann entropy
-    entropy = -np.sum(eigenvals * np.log2(eigenvals))
-    
-    return np.real(entropy)
-
-def sample_and_calculate(dim_a, dim_b, n_samples=1000):
-    """
-    Sample random pure states (generate random co) and calculate their von Neumann entropy.
-    
-    Args:
-        dim_a (int): Dimension of subsystem A
-        dim_b (int): Dimension of subsystem B
-        n_samples (int): Number of samples to generate
-        
-    Returns:
-        numpy.ndarray: Array of entropy values
-    """
-    entropies = np.zeros(n_samples)
-    
-    for i in tqdm(range(n_samples), desc=f"Sampling states (d_A={dim_a}, d_B={dim_b})", leave=False):
-        state = random_pure_state(dim_a, dim_b)
-        entropies[i] = von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b)
-    
-    return entropies
-
-# Example usage:
-if __name__ == "__main__":
-    # Example: 2-qubit system
-    dim_a, dim_b = 50,100
-    
-    # Generate single random state and calculate entropy
-    state = random_pure_state(dim_a, dim_b)
-    entropy = von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b)
-    print(f"Single state entropy: {entropy}")
-    
-    # Sample multiple states
-    entropies = sample_and_calculate(dim_a, dim_b, n_samples=1000)
-    print(f"Expected entropy: {np.mean(entropies)}")
-    print(f"Theoretical entropy: {np.log2(max(dim_a, dim_b))}")
-    print(f"Standard deviation: {np.std(entropies)}")
+import numpy as np
+from scipy.linalg import sqrtm
+from scipy.stats import unitary_group
+from tqdm import tqdm
+
+def random_pure_state(dim_a, dim_b):
+    """
+    Generate a random pure state for a bipartite system.
+
+    The state is built by normalizing a vector whose real and imaginary parts are independent standard normals, the usual construction of the uniform (Haar-induced) distribution on the unit sphere of C^(dim_a * dim_b). NOTE(review): as a real manifold that sphere is S^(2 * dim_a * dim_b - 1), and the law is invariant under the full U(dim_a * dim_b), not only U(dim_a) x U(dim_b).
+    
+    Args:
+        dim_a (int): Dimension of subsystem A
+        dim_b (int): Dimension of subsystem B
+        
+    Returns:
+        numpy.ndarray: Complex unit-norm state vector of shape (dim_a * dim_b,)
+    """
+    # Total dimension of the composite system
+    dim_total = dim_a * dim_b
+    
+    # Redraw until the vector is non-zero so the normalization below cannot divide by zero
+    while True:
+        state = np.random.normal(size=(dim_total,)) + 1j * np.random.normal(size=(dim_total,))
+        if np.linalg.norm(state) > 0:
+            break
+    
+    # Scale the vector to unit Euclidean norm
+    state = state / np.linalg.norm(state)
+    
+    return state
+
+def von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b):
+    """
+    Calculate the von Neumann entropy, in bits, of the reduced density matrix of subsystem A.
+    
+    Args:
+        state (numpy.ndarray): Pure state vector with dim_a * dim_b entries; presumably unit-norm (not checked here)
+        dim_a (int): Dimension of subsystem A
+        dim_b (int): Dimension of subsystem B
+        
+    Returns:
+        float: -sum(p * log2(p)) over the eigenvalues p of rho_A that exceed 1e-15
+    """
+    # View the coefficients as a dim_a x dim_b matrix (rows index subsystem A, columns index subsystem B)
+    state_matrix = state.reshape(dim_a, dim_b)
+    
+    # rho_A = M M^dagger: the matrix product sums over the B index, i.e. the partial trace over B
+    rho_a = np.dot(state_matrix, state_matrix.conj().T)
+    
+    # rho_A is Hermitian, so eigvalsh applies and returns real eigenvalues
+    eigenvals = np.linalg.eigvalsh(rho_a)
+    
+    # Keep only eigenvalues above 1e-15 so log2 never receives a zero or negative round-off value
+    eigenvals = eigenvals[eigenvals > 1e-15]
+    
+    # Entropy of the remaining spectrum, base-2 logarithm (bits)
+    entropy = -np.sum(eigenvals * np.log2(eigenvals))
+    
+    return np.real(entropy)
+
+def sample_and_calculate(dim_a, dim_b, n_samples=1000):
+    """
+    Draw n_samples random pure states with random_pure_state and calculate the von Neumann entropy of each.
+    
+    Args:
+        dim_a (int): Dimension of subsystem A
+        dim_b (int): Dimension of subsystem B
+        n_samples (int): Number of samples to generate
+        
+    Returns:
+        numpy.ndarray: Array of shape (n_samples,) holding one entropy value per sampled state
+    """
+    entropies = np.zeros(n_samples)
+    
+    for i in tqdm(range(n_samples), desc=f"Sampling states (d_A={dim_a}, d_B={dim_b})", leave=False):
+        state = random_pure_state(dim_a, dim_b)
+        entropies[i] = von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b)
+    
+    return entropies
+
+# Example usage: compare sampled entanglement entropies with the theoretical upper bound.
+if __name__ == "__main__":
+    # Example: bipartite system with d_A = 50 and d_B = 100
+    dim_a, dim_b = 50, 100
+    
+    # Generate single random state and calculate entropy
+    state = random_pure_state(dim_a, dim_b)
+    entropy = von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b)
+    print(f"Single state entropy: {entropy}")
+    
+    # Sample multiple states and report the sample mean and spread of their entropies
+    entropies = sample_and_calculate(dim_a, dim_b, n_samples=1000)
+    print(f"Expected entropy: {np.mean(entropies)}")
+    print(f"Theoretical maximum entropy: {np.log2(min(dim_a, dim_b))}")  # rho_A has at most min(d_A, d_B) non-zero eigenvalues, so S_A <= log2(min(d_A, d_B))
+    print(f"Standard deviation: {np.std(entropies)}")
diff --git a/codes/test.py b/codes/reference/test.py
similarity index 97%
rename from codes/test.py
rename to codes/reference/test.py
index ea4659e..b91d374 100644
--- a/codes/test.py
+++ b/codes/reference/test.py
@@ -1,32 +1,32 @@
-# unit test for the functions in quantum_states.py
-
-import unittest
-import numpy as np
-from quantum_states import random_pure_state, von_neumann_entropy_bipartite_pure_state
-
-class LearningCase(unittest.TestCase):
-    def test_random_pure_state_shape_and_norm(self):
-        dim_a = 2
-        dim_b = 2
-        state = random_pure_state(dim_a, dim_b)
-        self.assertEqual(state.shape, (dim_a * dim_b,))
-        self.assertAlmostEqual(np.linalg.norm(state), 1)
-
-    def test_partial_trace_entropy(self):
-        dim_a = 2
-        dim_b = 2
-        state = random_pure_state(dim_a, dim_b)
-        self.assertAlmostEqual(von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b), von_neumann_entropy_bipartite_pure_state(state, dim_b, dim_a))
-
-    def test_sample_uniformly(self):
-        # calculate the distribution of the random pure state
-        dim_a = 2
-        dim_b = 2
-        state = random_pure_state(dim_a, dim_b)
-        
-
-def main():
-    unittest.main()
-
-if __name__ == "__main__":
+# unit test for the functions in quantum_states.py
+
+import unittest
+import numpy as np
+from quantum_states import random_pure_state, von_neumann_entropy_bipartite_pure_state
+
+class LearningCase(unittest.TestCase):  # Unit tests for random_pure_state and von_neumann_entropy_bipartite_pure_state
+    def test_random_pure_state_shape_and_norm(self):
+        dim_a = 2
+        dim_b = 2
+        state = random_pure_state(dim_a, dim_b)
+        self.assertEqual(state.shape, (dim_a * dim_b,))  # flat vector over the composite system
+        self.assertAlmostEqual(np.linalg.norm(state), 1)  # unit norm after normalization
+
+    def test_partial_trace_entropy(self):
+        dim_a = 2  # NOTE(review): with dim_a == dim_b the two calls compared below receive identical arguments, so this check is vacuous
+        dim_b = 2
+        state = random_pure_state(dim_a, dim_b)
+        self.assertAlmostEqual(von_neumann_entropy_bipartite_pure_state(state, dim_a, dim_b), von_neumann_entropy_bipartite_pure_state(state, dim_b, dim_a))
+
+    def test_sample_uniformly(self):
+        # TODO: placeholder - only draws one state; no property of the sampling distribution is asserted yet
+        dim_a = 2
+        dim_b = 2
+        state = random_pure_state(dim_a, dim_b)
+        
+
+def main():  # Entry point: hand control to unittest's command-line test runner
+    unittest.main()
+
+if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/.gitignore b/latex/.gitignore
similarity index 99%
rename from .gitignore
rename to latex/.gitignore
index e7d22c4..6e8d7d6 100644
--- a/.gitignore
+++ b/latex/.gitignore
@@ -308,4 +308,7 @@ TSWLatexianTemp*
 #*Notes.bib
 
 # additional trash files
-*.bcf-*
\ No newline at end of file
+*.bcf-*
+
+# python
+__pycache__
\ No newline at end of file
diff --git a/LICENSE b/latex/LICENSE
similarity index 100%
rename from LICENSE
rename to latex/LICENSE
diff --git a/latex/chapters/chap0.pdf b/latex/chapters/chap0.pdf
new file mode 100644
index 0000000..d980ecc
Binary files /dev/null and b/latex/chapters/chap0.pdf differ
diff --git a/chapters/chap0.tex b/latex/chapters/chap0.tex
similarity index 88%
rename from chapters/chap0.tex
rename to latex/chapters/chap0.tex
index 9ac7a88..97d0710 100644
--- a/chapters/chap0.tex
+++ b/latex/chapters/chap0.tex
@@ -1,1039 +1,1189 @@
-% chapters/chap0.tex
-\documentclass[../main.tex]{subfiles}
-
-% If this chapter is compiled *by itself*, we must load only its own .bib
-% and print its bibliography at the end of the chapter.
-\ifSubfilesClassLoaded{
-  \addbibresource{\subfix{../main.bib}}
-}
-
-\begin{document}
-
-\chapter*{Chapter 0: Brief definitions and basic concepts}
-\addcontentsline{toc}{chapter}{Chapter 0: Brief definitions and basic concepts}
-\markboth{Chapter 0: Brief definitions and basic concepts}{}
-
-As the future version of me might forgot everything we have over the summer, as I did for now, I will make a review again from the simple definition to recall the necessary information to tell you why we are here and how we are going to proceed.
-
-This section serve as reference for definitions, notations, and theorems that we will use later. This section can be safely ignored if you are already familiar with the definitions and theorems.
-
-But for the future self who might have no idea what I'm talking about, we will provided detailed definitions to you to understand the concepts.
-
-\section{Complex vector spaces}
-
-The main vector space we are interested in is $\mathbb{C}^n$; therefore, all the linear operators we defined are from $\mathbb{C}^n$ to $\mathbb{C}^n$.
-
-\begin{defn}
-    \label{defn:braket}
-
-    We denote a vector in vector space as $\ket{\psi}=(z_1,\ldots,z_n)$ (might also be infinite dimensional, and $z_i\in\mathbb{C}$).
-
-\end{defn}
-
-
-Here $\psi$ is just a label for the vector, and you don't need to worry about it too much. This is also called the ket, where the counterpart $\bra{\psi}$ is called the bra, used to denote the vector dual to $\psi$; such an element is a linear functional if you really want to know what that is.
-
-Few additional notation will be introduced, in this document, we will follows the notation used in mathematics literature \cite{axler2023linear}
-
-\begin{itemize}
-    \item $\langle\psi|\varphi\rangle$ is the inner product between two vectors, and $\bra{\psi} A\ket{\varphi}$ is the inner product between $A\ket{\varphi}$ and $\bra{\psi}$, or equivalently $A^\dagger \bra{\psi}$ and $\ket{\varphi}$.
-    \item Given a complex matrix $A=\mathbb{C}^{n\times n}$,
-          \begin{enumerate}
-              \item  $\overline{A}$ is the complex conjugate of $A$.
-                    \begin{examples}
-                        $$
-                            A=\begin{bmatrix}
-                                1+i & 2+i & 3+i \\
-                                4+i & 5+i & 6+i \\
-                                7+i & 8+i & 9+i\end{bmatrix},
-                            \overline{A}=\begin{bmatrix}
-                                1-i & 2-i & 3-i \\
-                                4-i & 5-i & 6-i \\
-                                7-i & 8-i & 9-i
-                            \end{bmatrix}
-                        $$
-                    \end{examples}
-              \item  $A^\top$ denotes the transpose of $A$.
-                    \begin{examples}
-                        $$
-                            A=\begin{bmatrix}
-                                1+i & 2+i & 3+i \\
-                                4+i & 5+i & 6+i \\
-                                7+i & 8+i & 9+i
-                            \end{bmatrix},
-                            A^\top=\begin{bmatrix}
-                                1+i & 4+i & 7+i \\
-                                2+i & 5+i & 8+i \\
-                                3+i & 6+i & 9+i
-                            \end{bmatrix}
-                        $$
-                    \end{examples}
-              \item $A^*=\overline{(A^\top)}$ denotes the complex conjugate transpose, referred to as the adjoint, or Hermitian conjugate of $A$.
-                    \begin{examples}
-                        $$
-                            A=\begin{bmatrix}
-                                1+i & 2+i & 3+i \\
-                                4+i & 5+i & 6+i \\
-                                7+i & 8+i & 9+i
-                            \end{bmatrix},
-                            A^*=\begin{bmatrix}
-                                1-i & 4-i & 7-i \\
-                                2-i & 5-i & 8-i \\
-                                3-i & 6-i & 9-i
-                            \end{bmatrix}
-                        $$
-                    \end{examples}
-              \item  $A$ is unitary if $A^* A=AA^*=I$.
-              \item  $A$ is self-adjoint (hermitian in physics literature) if $A^*=A$.
-          \end{enumerate}
-\end{itemize}
-
-\subsubsection{Motivation of Tensor product}
-
-Recall from the traditional notation of product space of two vector spaces $V$ and $W$, that is, $V\times W$, is the set of all ordered pairs $(\ket{v},\ket{w})$ where $\ket{v}\in V$ and $\ket{w}\in W$.
-
-The space has dimension $\dim V+\dim W$.
-
-We want to define a vector space with the notation of multiplication of two vectors from different vector spaces.
-
-That is
-
-$$
-    (\ket{v_1}+\ket{v_2})\otimes \ket{w}=(\ket{v_1}\otimes \ket{w})+(\ket{v_2}\otimes \ket{w})
-$$
-$$
-    \ket{v}\otimes (\ket{w_1}+\ket{w_2})=(\ket{v}\otimes \ket{w_1})+(\ket{v}\otimes \ket{w_2})
-$$
-
-and enables scalar multiplication by
-
-$$
-    \lambda (\ket{v}\otimes \ket{w})=(\lambda \ket{v})\otimes \ket{w}=\ket{v}\otimes (\lambda \ket{w})
-$$
-
-And we wish to build a way to associate the basis of $V$ and $W$ with the basis of $V\otimes W$. That makes the tensor product a vector space with dimension $\dim V\times \dim W$.
-
-\begin{defn}
-    \label{defn:linear_functional}
-    Definition of linear functional
-
-    A linear functional is a linear map from $V$ to $\mathbb{F}$.
-
-\end{defn}
-
-Note the difference between a linear functional and a linear map.
-
-A generalized linear map is a function $f: V\to W$ satisfying the condition.
-
-\begin{itemize}
-    \item $f(\ket{u}+\ket{v})=f(\ket{u})+f(\ket{v})$
-    \item $f(\lambda \ket{v})=\lambda f(\ket{v})$
-\end{itemize}
-
-
-\begin{defn}
-    \label{defn:bilinear_functional}
-    A bilinear functional is a bilinear function $\beta:V\times W\to \mathbb{F}$ satisfying the condition that $\ket{v}\to \beta(\ket{v},\ket{w})$ is a linear functional for all $\ket{w}\in W$ and $\ket{w}\to \beta(\ket{v},\ket{w})$ is a linear functional for all $\ket{v}\in V$.
-
-\end{defn}
-
-The vector space of all bilinear functionals is denoted by $\mathcal{B}(V, W)$.
-
-
-\begin{defn}
-    \label{defn:tensor_product}
-    Let $V, W$ be two vector spaces.
-
-    Let $V'$ and $W'$ be the dual spaces of $V$ and $W$, respectively, that is $V'=\{\psi:V\to \mathbb{F}\}$ and $W'=\{\phi:W\to \mathbb{F}\}$, $\psi, \phi$ are linear functionals.
-
-    The tensor product of vectors $v\in V$ and $w\in W$ is the bilinear functional defined by $\forall (\psi,\phi)\in V'\times W'$ given by the notation
-
-    $$
-        (v\otimes w)(\psi,\phi)=\psi(v)\phi(w)
-    $$
-
-    The tensor product of two vector spaces $V$ and $W$ is the vector space $\mathcal{B}(V',W')$
-
-    Notice that the basis of such vector space is the linear combination of the basis of $V'$ and $W'$, that is, if $\{e_i\}$ is the basis of $V'$ and $\{f_j\}$ is the basis of $W'$, then $\{e_i\otimes f_j\}$ is the basis of $\mathcal{B}(V', W')$.
-
-    That is, every element of $\mathcal{B}(V', W')$ can be written as a linear combination of the basis.
-
-    Since $\{e_i\}$ and $\{f_j\}$ are bases of $V'$ and $W'$, respectively, then we can always find a set of linear functionals $\{\phi_i\}$ and $\{\psi_j\}$ such that $\phi_i(e_j)=\delta_{ij}$ and $\psi_j(f_i)=\delta_{ij}$.
-
-    Here $\delta_{ij}=\begin{cases}
-            1 & \text{if } i=j   \\
-            0 & \text{otherwise}
-        \end{cases}$ is the Kronecker delta.
-
-    $$
-        V\otimes W=\left\{\sum_{i=1}^n \sum_{j=1}^m a_{ij} \phi_i(v)\psi_j(w): \phi_i\in V', \psi_j\in W'\right\}
-    $$
-
-\end{defn}
-
-Note that $\sum_{i=1}^n \sum_{j=1}^m a_{ij} \phi_i(v)\psi_j(w)$ is a bilinear functional that maps $V'\times W'$ to $\mathbb{F}$.
-
-This enables basis-free construction of vector spaces with proper multiplication and scalar multiplication.
-
-\begin{examples}[Examples of tensor product for vectors]
-
-    Let $V = \mathbb{C}^2, W = \mathbb{C}^3$, choose bases $\{\ket{0}, \ket{1}\} \subset V, \{\ket{0}, \ket{1}, \ket{2}\} \subset W$.
-
-    $$
-    v=\begin{pmatrix}
-            v_1 \\
-            v_2
-        \end{pmatrix}=v_1\ket{0}+v_2\ket{1}\in V,w=\begin{pmatrix}
-            w_1 \\
-            w_2 \\
-            w_3
-        \end{pmatrix}=w_1\ket{0}+w_2\ket{1}+w_3\ket{2}\in W
-    $$.
-
-    Then the tensor product $v\otimes w$ is given by
-
-    $$
-        v\otimes w=\begin{pmatrix}
-            v_1 w_1 &v_1 w_2 &v_1 w_3 \\
-            v_2 w_1 &v_2 w_2 &v_2 w_3
-        \end{pmatrix}\in \mathbb{C}^6
-    $$
-\end{examples}
-
-\begin{examples}[Examples of tensor product for vector spaces]
-
-Let $V = \mathbb{C}^2, W = \mathbb{C}^3$, choose bases $\{\ket{0}, \ket{1}\} \subset V, \{\ket{0}, \ket{1}, \ket{2}\} \subset W.$
-
-Then a basis of the tensor product is
-$$
-\{
-\ket{00}, \ket{01}, \ket{02},
-\ket{10}, \ket{11}, \ket{12}
-\},
-$$
-where $\ket{ij} := \ket{i}\otimes\ket{j}$.
-
-An example element of $V \otimes W$ is
-$$
-\ket{\psi}
-=
-2\,\ket{0}\otimes\ket{1}
-+
-(1+i)\,\ket{1}\otimes\ket{0}
--
-i\,\ket{1}\otimes\ket{2}.
-$$
-
-With respect to the ordered basis
-$$
-(\ket{00}, \ket{01}, \ket{02}, \ket{10}, \ket{11}, \ket{12}),
-$$
-this tensor corresponds to the coordinate vector
-$$
-\ket{\psi}
-\;\longleftrightarrow\;
-\begin{pmatrix}
-0\\
-2\\
-0\\
-1+i\\
-0\\
--i
-\end{pmatrix}
-\in \mathbb{C}^6.
-$$
-
-Using the canonical identification
-$$
-\mathbb{C}^2 \otimes \mathbb{C}^3 \cong \mathbb{C}^{2\times 3},
-$$
-where
-$$
-\ket{i}\otimes\ket{j} \longmapsto E_{ij},
-$$
-the same tensor is represented by the matrix
-$$
-\ket{\psi}
-\;\longleftrightarrow\;
-\begin{pmatrix}
-0 & 2 & 0\\
-1+i & 0 & -i
-\end{pmatrix}.
-$$
-
-\end{examples}
-
-\begin{defn}
-    \label{defn:inner_product_on_tensor_product}
-
-    The vector space defined by the tensor product is equipped with the unique inner product $\langle v\otimes w, u\otimes x\rangle_{V\otimes W}: V\otimes W\times V\otimes W\to \mathbb{F}$ defined by
-
-    $$
-        \langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle_V\langle w,x\rangle_W
-    $$
-\end{defn}
-
-In practice, we ignore the subscript of the vector space and just write $\langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle\langle w,x\rangle$.
-Partial trace
-
-\begin{defn}
-
-\label{defn:trace}
-
-Let $T$ be a linear operator on $\mathscr{H}$, $(e_1,e_2,\cdots,e_n)$ be a basis of $\mathscr{H}$ and $(\epsilon_1,\epsilon_2,\cdots,\epsilon_n)$ be a basis of dual space $\mathscr{H}^*$. Then the trace of $T$ is defined by
-
-$$
-\operatorname{Tr}(T)=\sum_{i=1}^n \epsilon_i(T(e_i))=\sum_{i=1}^n \langle e_i,T(e_i)\rangle
-$$
-
-\end{defn}
-
-This is equivalent to the sum of the diagonal elements of $T$.
-
-\begin{defn}
-    \label{defn:partial_trace}
-
-Let $T$ be a linear operator on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$, where $\mathscr{A}$ and $\mathscr{B}$ are finite-dimensional Hilbert spaces.
-
-An operator $T$ on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$ can be written as 
-
-$$
-T=\sum_{i=1}^n a_i A_i\otimes B_i
-$$
-
-where $A_i$ is a linear operator on $\mathscr{A}$ and $B_i$ is a linear operator on $\mathscr{B}$.
-
-The $\mathscr{B}$-partial trace of $T$ ($\operatorname{Tr}_{\mathscr{B}}(T):\mathcal{L}(\mathscr{A}\otimes \mathscr{B})\to \mathcal{L}(\mathscr{A})$) is the linear operator on $\mathscr{A}$ defined by
-
-$$
-\operatorname{Tr}_{\mathscr{B}}(T)=\sum_{i=1}^n a_i \operatorname{Tr}(B_i) A_i
-$$
-
-
-\end{defn}
-Or we can define the map $L_v: \mathscr{A}\to \mathscr{A}\otimes \mathscr{B}$ by
-
-$$
-L_v(u)=u\otimes v
-$$
-
-Note that $\langle u,L_v^*(u')\otimes v'\rangle=\langle u,u'\rangle \langle v,v'\rangle=\langle u\otimes v,u'\otimes v'\rangle=\langle L_v(u),u'\otimes v'\rangle$.
-
-Therefore, $L_v^*\sum_{j} u_j\otimes v_j=\sum_{j} \langle v,v_j\rangle u_j$.
-
-Then the partial trace of $T$ can also be defined by
-
-Let $\{v_j\}$ be a set of orthonormal basis of $\mathscr{B}$.
-
-$$
-\operatorname{Tr}_{\mathscr{B}}(T)=\sum_{j} L^*_{v_j}(T)L_{v_j}
-$$
-
-
-\begin{defn}
-    \label{defn:partial_trace_with_respect_to_state}
-Let $T$ be a linear operator on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$, where $\mathscr{A}$ and $\mathscr{B}$ are finite-dimensional Hilbert spaces.
-
-Let $\rho$ be a state on $\mathscr{B}$ consisting of orthonormal basis $\{v_j\}$ and eigenvalue $\{\lambda_j\}$.
-
-The partial trace of $T$ with respect to $\rho$ is the linear operator on $\mathscr{A}$ defined by
-
-$$
-\operatorname{Tr}_{\mathscr{A}}(T)=\sum_{j} \lambda_j L^*_{v_j}(T)L_{v_j}
-$$
-\end{defn}
-
-
-This introduces a new model in mathematics explaining quantum mechanics: the non-commutative probability theory.
-
-\section{Non-commutative probability theory}
-
-The non-commutative probability theory is a branch of generalized probability theory that studies the probability of events in non-commutative algebras.
-
-There are several main components of the generalized probability theory; let's see how we can formulate them, comparing with the classical probability theory.
-
-First, we define the Hilbert space in case one did not make the step from the linear algebra courses like me.
-
-\begin{defn}
-    \label{defn:Hilbert_space}
-    Hilbert space:
-
-    A Hilbert space is a complete inner product space.
-\end{defn}
-
-That is, a vector space equipped with an inner product, with the induced metric defined by the norm of the inner product, we have a metric space, which is complete. Reminds that complete mean that every Cauchy sequence, the sequence such that for any $\epsilon>0$, there exists an $N$ such that for all $m,n\geq N$, we have $|x_m-x_n|<\epsilon$, converges to a limit.
-
-As a side note we will use later, we also defined the Borel measure on a space, here we use the following definition specialized for the space (manifolds) we are interested in. 
-
-\begin{defn}
-    \label{defn:Borel_measure}
-    Borel measure:
-    
-    Let $X$ be a topological space, then a Borel measure $\mu:\mathscr{B}(X)\to [0,\infty]$ on $X$ is a measure on the Borel $\sigma$-algebra of $X$ $\mathscr{B}(X)$ satisfying the following properties:
-
-    \begin{enumerate}
-        \item $X \in \mathscr{B}$.
-        \item Close under complement: If $A\subseteq X$, then $\mu(A^c)=\mu(X)-\mu(A)$
-        \item Close under countable unions; If $E_1,E_2,\cdots$ are disjoint sets, then $\mu(\bigcup_{i=1}^\infty E_i)=\sum_{i=1}^\infty \mu(E_i)$
-    \end{enumerate}
-\end{defn}
-
-In later sections, we will use Lebesgue measure, and Haar measure for various circumstances, their detailed definition may be introduced in later sections.
-
-\begin{examples}
-
-To introduce an example of Hilbert space we use when studying quantum mechanics, we need to introduce a common inner product used in $\mathbb{C}^n$.
-
-
-\begin{prop}
-    \label{prop:Hermitian_inner_product_with_complex_vectorspace}
-    The Hermitian inner product on the complex vector space $\C^n$ makes it a Hilbert space.
-\end{prop}
-
-\begin{proof}
-    We first verify that the Hermitian inner product
-    $$
-        \langle u,v\rangle = \sum_{i=1}^n \overline{u_i} v_i
-    $$
-    on $\C^n$ satisfies the axioms of an inner product:
-    \begin{enumerate}
-        \item \textbf{Conjugate symmetry:} For all $u,v\in\C^n$,
-              $$
-                  \langle u,v\rangle =\sum_{i=1}^n \overline{u_i} v_i=\overline{\sum_{i=1}^n \overline{v_i} u_i}=\overline{\langle v,u\rangle}.
-              $$
-        \item \textbf{Linearity:} For any $u,v,w\in\C^n$ and scalars $a,b\in\C$, we have
-              $$
-                  \langle u, av + bw\rangle = \sum_{i=1}^n \overline{u_i} (av_i + bw_i)=a\langle u,v\rangle + b\langle u,w\rangle.
-              $$
-        \item \textbf{Positive definiteness:} For every $u=(u_1,u_2,\cdots,u_n)\in\C^n$, let $u_j=a_j+b_ji$, where $a_j,b_j\in\mathbb{R}$.
-              $$
-                  \langle u,u\rangle = \sum_{j=1}^n \overline{u_j} u_j=\sum_{i=1}^n (a_i^2+b_i^2)\geq 0,
-              $$
-              with equality if and only if $u=0$.
-
-              Therefore, the Hermitian inner product is an inner product.
-    \end{enumerate}
-
-    Next, we show that $\C^n$ is complete with respect to the norm induced by this inner product:
-    $$
-        \|u\| = \sqrt{\langle u,u\rangle}.
-    $$
-    Since $\C^n$ is finite-dimensional, every Cauchy sequence (with respect to any norm) converges in $\C^n$. This is a standard result in finite-dimensional normed spaces, which implies that $\C^n$ is indeed complete.
-
-    Therefore, since the Hermitian inner product fulfills the inner product axioms and $\C^n$ is complete, the complex vector space $\C^n$ with the Hermitian inner product is a Hilbert space.
-\end{proof}
-
-\end{examples}
-
-Another classical example of Hilbert space is $L^2(\Omega, \mathscr{F}, P)$, where $(\Omega, \mathscr{F}, P)$ is a measure space ($\Omega$ is a set, $\mathscr{F}$ is a $\sigma$-algebra on $\Omega$, and $P$ is a measure on $\mathscr{F}$). The $L^2$ space is the space of all function on $\Omega$ that is
-
-\begin{enumerate}
-    \item \textbf{square integrable}: square integrable functions are the functions $f:\Omega\to \mathbb{C}$ such that
-          $$
-              \int_\Omega |f(\omega)|^2 dP(\omega)<\infty
-          $$
-          with inner product defined by
-          $$
-              \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
-          $$
-
-    \item \textbf{complex-valued}: functions are complex-valued measurable. $f=u+v i$ is complex-valued if $u$ and $v$ are real-valued measurable.
-\end{enumerate}
-
-\begin{examples}
-    
-
-\begin{prop}
-    \label{prop:L2_space_is_a_Hilbert_space}
-    $L^2(\Omega, \mathscr{F}, P)$ is a Hilbert space.
-\end{prop}
-
-\begin{proof}
-    We check the two conditions of the Hilbert space:
-    \begin{itemize}
-        \item Completeness:
-              Let $(f_n)$ be a Cauchy sequence in $L^2(\Omega, \mathscr{F}, P)$. Then for any $\epsilon>0$, there exists an $N$ such that for all $m,n\geq N$, we have
-              $$
-                  \int_\Omega |f_m(\omega)-f_n(\omega)|^2 dP(\omega)<\epsilon^2
-              $$
-              This means that $(f_n)$ is a Cauchy sequence in the norm of $L^2(\Omega, \mathscr{F}, P)$.
-        \item Inner product:
-              The inner product is defined by
-              $$
-                  \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
-              $$
-              This is a well-defined inner product on $L^2(\Omega, \mathscr{F}, P)$. We can check the properties of the inner product:
-              \begin{itemize}
-                  \item Linearity:
-                        $$
-                            \langle af+bg,h\rangle=a\langle f,h\rangle+b\langle g,h\rangle
-                        $$
-                  \item Conjugate symmetry:
-                        $$
-                            \langle f,g\rangle=\overline{\langle g,f\rangle}
-                        $$
-                  \item Positive definiteness:
-                        $$
-                            \langle f,f\rangle\geq 0
-                        $$
-              \end{itemize}
-    \end{itemize}
-\end{proof}
-
-\end{examples}
-
-Let $\mathscr{H}$ be a Hilbert space. $\mathscr{H}$ consists of complex-valued functions on a finite set $\Omega=\{1,2,\ldots,n\}$, and the functions $(e_1,e_2,\ldots,e_n)$ form an orthonormal basis of $\mathscr{H}$. (We use Dirac notation $|k\rangle$ to denote the basis vector $e_k$~\cite{parthasarathy1992quantum}.)
-
-As an analog to the classical probability space $(\Omega,\mathscr{F},\mu)$, which consists of a sample space $\Omega$ and a probability measure $\mu$ on the state space $\mathscr{F}$, the non-commutative probability space $(\mathscr{H},\mathscr{P},\rho)$ consists of a Hilbert space $\mathscr{H}$ and a state $\rho$ on the space of all orthogonal projections $\mathscr{P}$.
-
-The detailed definition of the non-commutative probability space is given below:
-
-\begin{defn}
-    \label{defn:non-commutative_probability_space}
-    Non-commutative probability space:
-
-    A non-commutative probability space is a pair $(\mathscr{B}(\mathscr{H}),\mathscr{P})$, where $\mathscr{B}(\mathscr{H})$ is the set of all \textbf{bounded} linear operators on $\mathscr{H}$.
-
-    A linear operator on $\mathscr{H}$ is \textbf{bounded} if for all $u$ such that $\|u\|\leq 1$, we have $\|Au\|\leq M$ for some $M>0$.
-
-    $\mathscr{P}$ is the set of all orthogonal projections on $\mathscr{B}(\mathscr{H})$.
-
-    The set $\mathscr{P}=\{P\in\mathscr{B}(\mathscr{H}):P^*=P=P^2\}$ is the set of all orthogonal projections on $\mathscr{B}(\mathscr{H})$.
-\end{defn}
-
-Recall from classical probability theory, we call the initial probability distribution for possible outcomes in the classical probability theory as our \textit{state}, simillarly, we need to define the \textit{state} in the non-commutative probability theory.
-
-\begin{defn}
-    \label{defn:state}
-    Non-commutative probability state:
-
-    Given a non-commutative probability space $(\mathscr{B}(\mathscr{H}),\mathscr{P})$,
-
-    A state is a unit vector $\bra{\psi}$ in the Hilbert space $\mathscr{H}$, such that $\bra{\psi}\ket{\psi}=1$. 
-    
-    Every state uniquely defines a map $\rho:\mathscr{P}\to[0,1]$, $\rho(P)=\bra{\psi}P\ket{\psi}$ (commonly named as density operator) such that:
-    \begin{itemize}
-        \item $\rho(O)=0$, where $O$ is the zero projection, and $\rho(I)=1$, where $I$ is the identity projection.
-        \item If $P_1,P_2,\ldots,P_n$ are pairwise disjoint orthogonal projections, then $\rho(P_1 + P_2 + \cdots + P_n) = \sum_{i=1}^n \rho(P_i)$.
-    \end{itemize}
-\end{defn}
-
-Note that the pure states are the density operators that can be represented by a unit vector $\bra{\psi}$ in the Hilbert space $\mathscr{H}$, whereas mixed states are the density operators that cannot be represented by a unit vector in the Hilbert space $\mathscr{H}$.
-
-If $(|\psi_1\rangle,|\psi_2\rangle,\cdots,|\psi_n\rangle)$ is an orthonormal basis of $\mathscr{H}$ consisting of eigenvectors of $\rho$, for the eigenvalues $p_1,p_2,\cdots,p_n$, then $p_j\geq 0$ and $\sum_{j=1}^n p_j=1$.
-
-We can write $\rho$ as
-$$
-    \rho=\sum_{j=1}^n p_j|\psi_j\rangle\langle\psi_j|
-$$
-(Under basis $|\psi_j\rangle$, it is a diagonal matrix with $p_j$ on the diagonal.)
-
-% Then we need to introduce a theorem that ensures that every state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.
-
-% \begin{theorem}
-% 	\label{theorem:Gleason's_theorem}
-% 	Gleason's theorem (Theorem 1.1.15 in~\cite{parthasarathy2005mathematical})
-
-%     Let $\mathscr{H}$ be a Hilbert space over $\mathbb{C}$ or $\mathbb{R}$ of dimension $n\geq 3$. Let $\mu$ be a state on the space $\mathscr{P}$ of projections on $\mathscr{H}$. Then there exists a unique density operator $\rho$ such that
-%     $$
-%     \mu(P)=\operatorname{Tr}(\rho P)
-%     $$
-%     for all $P\in\mathscr{P}$. $\mathscr{P}$ is the space of all orthogonal projections on $\mathscr{H}$.
-% \end{theorem}
-
-% This proof came from~\cite{parthasarathy2005mathematical}.
-
-% \begin{proof}
-% % TODO: FILL IN THE PROOF
-% \end{proof}
-
-% This theorem is a very important theorem in non-commutative probability theory; it states that any state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.
-
-The counterpart of the random variable in the non-commutative probability theory is called an observable, which is a Hermitian operator on $\mathscr{H}$ (for all $\psi,\phi$ in the domain of $A$, we have $\langle A\psi,\phi\rangle=\langle\psi,A\phi\rangle$. This kind of operator ensures that our outcome interpreted as probability is a real number). 
-
-\begin{defn}
-    \label{defn:observable}
-    Observable:
-
-    Let $\mathcal{B}(\mathbb{R})$ be the set of all Borel sets on $\mathbb{R}$.
-
-    An (real-valued) observable (random variable) on the Hilbert space $\mathscr{H}$, denoted by $A$, is a projection-valued map (measure) $P_A:\mathscr{B}(\mathbb{R})\to\mathscr{P}(\mathscr{H})$.
-
-    Satisfies the following properties:
-    \begin{itemize}
-        \item $P_A(\emptyset)=O$ (the zero projection)
-        \item $P_A(\mathbb{R})=I$ (the identity projection)
-        \item For any sequence $A_1,A_2,\cdots,A_n\in \mathscr{B}(\mathbb{R})$, the following holds:
-              \begin{itemize}
-                  \item $P_A(\bigcup_{i=1}^n A_i)=\bigvee_{i=1}^n P_A(A_i)$
-                  \item $P_A(\bigcap_{i=1}^n A_i)=\bigwedge_{i=1}^n P_A(A_i)$
-                  \item $P_A(A^c)=I-P_A(A),\forall A\in\mathscr{B}(\mathbb{R})$
-              \end{itemize}
-    \end{itemize}
-\end{defn}
-
-If $A$ is an observable determined by the map $P_A:\mathcal{B}(\mathbb{R})\to\mathcal{P}(\mathscr{H})$, $P_A$ is a spectral measure (a complete additive orthogonal projection valued measure on $\mathcal{B}(\mathbb{R})$). And every spectral measure can be represented by an observable. \cite{parthasarathy2005mathematical}
-
-\begin{prop}
-    If $A_j$ are mutually disjoint (that is $P_A(A_i)P_A(A_j)=P_A(A_j)P_A(A_i)=O$ for $i\neq j$), then $P_A(\bigcup_{j=1}^n A_j)=\sum_{j=1}^n P_A(A_j)$
-\end{prop}
-
-\begin{defn}
-    \label{defn:probability_of_random_variable}
-    Probability of a random variable:
-
-    Let $A$ be a real-valued observable on a Hilbert space $\mathscr{H}$. $\rho$ be a state. The probability of observing the outcome $E\in \mathcal{B}(\mathbb{R})$ is given by:
-
-    $$
-    \mu(E)=\operatorname{Tr}(\rho P_A(E))
-    $$
-\end{defn}
-
-Restriction of a quantum state to a commutative subalgebra defines an ordinary probability measure.
-
-\begin{examples}
-Let
-$$
-Z=\begin{pmatrix}
-1 & 0\\
-0 & -1
-\end{pmatrix}.
-$$
-
-The eigenvalues of $Z$ are $+1$ and $-1$, with corresponding normalized eigenvectors
-
-$$
-\ket{0}=\begin{pmatrix}1\\0\end{pmatrix},
-\qquad
-\ket{1}=\begin{pmatrix}0\\1\end{pmatrix}.
-$$
-
-The spectral projections are
-$$
-P_Z(\{1\}) = \ket{0}\bra{0}
-=
-\begin{pmatrix}
-1 & 0\\
-0 & 0
-\end{pmatrix},
-\qquad
-P_Z(\{-1\}) =  \ket{1}\bra{1}
-=
-\begin{pmatrix}
-0 & 0\\
-0 & 1
-\end{pmatrix}.
-$$
-
-The associated projection-valued measure $P_Z$ satisfies
-$$
-P_Z(\{1,-1\}) = I,
-\qquad
-P_Z(\emptyset)=0.
-$$
-
-%==============================
-% 4. Example: X measurement and its PVM
-%==============================
-
-Let
-$$
-X=\begin{pmatrix}
-0 & 1\\
-1 & 0
-\end{pmatrix}.
-$$
-
-The normalized eigenvectors of $X$ are
-$$
-\ket{+}=\frac{1}{\sqrt{2}}\left(\ket{0}+\ket{1}\right),
-\qquad
-\ket{-}=\frac{1}{\sqrt{2}}\left(\ket{0}-\ket{1}\right),
-$$
-with eigenvalues $+1$ and $-1$, respectively.
-
-The corresponding spectral projections are
-$$
-P_X(\{1\}) = \ket{+}\bra{+}
-=
-\frac{1}{2}
-\begin{pmatrix}
-1 & 1\\
-1 & 1
-\end{pmatrix},
-$$
-$$
-P_X(\{-1\}) = \ket{-}\bra{-}
-=
-\frac{1}{2}
-\begin{pmatrix}
-1 & -1\\
--1 & 1
-\end{pmatrix}.
-$$
-
-%==============================
-% 5. Noncommutativity of the projections
-%==============================
-
-Compute
-$$
-P_Z(\{1\})P_X(\{1\})
-=
-\begin{pmatrix}
-1 & 0\\
-0 & 0
-\end{pmatrix}
-\cdot
-\frac{1}{2}
-\begin{pmatrix}
-1 & 1\\
-1 & 1
-\end{pmatrix}
-=
-\frac{1}{2}
-\begin{pmatrix}
-1 & 1\\
-0 & 0
-\end{pmatrix}.
-$$
-
-On the other hand,
-$$
-P_X(\{1\})P_Z(\{1\})
-=
-\frac{1}{2}
-\begin{pmatrix}
-1 & 1\\
-1 & 1
-\end{pmatrix}
-\cdot
-\begin{pmatrix}
-1 & 0\\
-0 & 0
-\end{pmatrix}
-=
-\frac{1}{2}
-\begin{pmatrix}
-1 & 0\\
-1 & 0
-\end{pmatrix}.
-$$
-
-Since
-$$
-P_Z(\{1\})P_X(\{1\}) \neq P_X(\{1\})P_Z(\{1\}),
-$$
-the projections do not commute.
-
-Let $\rho$ be a density operator on $\mathbb C^2$, i.e.
-$$
-\rho \ge 0,
-\qquad
-\operatorname{Tr}(\rho)=1.
-$$
-
-For a pure state $\ket{\psi}$, one has
-$$
-\rho = \ket{\psi}\bra{\psi}.
-$$
-
-The probability that a measurement associated with a PVM $P$ yields an outcome in a Borel set $A\in \mathcal{B}$ is
-$$
-\mathbb P(A) = \operatorname{Tr}(\rho\, P(A)).
-$$
-
-For example, let
-$$
-\rho = \ket{0}\langle 0|
-=
-\begin{pmatrix}
-1 & 0\\
-0 & 0
-\end{pmatrix}.
-$$
-
-Then
-$$
-\operatorname{Tr}\bigl(\rho\, P_Z(\{1\})\bigr) = 1,
-\qquad
-\operatorname{Tr}\bigl(\rho\, P_X(\{1\})\bigr) = \frac{1}{2}.
-$$
-
-\end{examples}
-
-\begin{defn}
-    \label{defn:measurement}
-    Definition of measurement:
-
-    A measurement (observation) of a system prepared in a given state produces an outcome $x$, $x$ is a physical event that is a subset of the set of all possible outcomes. For each $x$, we associate a measurement operator $M_x$ on $\mathscr{H}$.
-
-    Given the initial state (pure state, unit vector) $u$, the probability of measurement outcome $x$ is given by:
-    $$
-        p(x)=\|M_xu\|^2
-    $$
-
-    Note that to make sense of this definition, the collection of measurement operators $\{M_x\}$ must satisfy the completeness requirement:
-    $$
-        1=\sum_{x\in X} p(x)=\sum_{x\in X}\|M_xu\|^2=\sum_{x\in X}\langle M_xu,M_xu\rangle=\langle u,(\sum_{x\in X}M_x^*M_x)u\rangle
-    $$
-    So $\sum_{x\in X}M_x^*M_x=I$.
-
-\end{defn}
-
-
-Here is Table~\ref{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory} summarizing the analog of classical probability theory and non-commutative (\textit{quantum}) probability theory~\cite{Feres}:
-
-\begin{table}[H]
-    \centering
-    \renewcommand{\arraystretch}{1.5}
-    \caption{Analog of classical probability theory and non-commutative (\textit{quantum}) probability theory}
-    \label{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory}
-    {\small
-        \begin{tabular}{|p{0.5\linewidth}|p{0.5\linewidth}|}
-            \hline
-            \textbf{Classical probability}                                                                                                                      & \textbf{Non-commutative probability}                                                                                                                      \\
-            \hline
-            Sample space $\Omega$, cardinality $\vert\Omega\vert=n$, example: $\Omega=\{0,1\}$                                                                  & Complex Hilbert space $\mathscr{H}$, dimension $\dim\mathscr{H}=n$, example: $\mathscr{H}=\mathbb{C}^2$                                                   \\
-            \hline
-            Common algebra of $\mathbb{C}$ valued functions                                                                                                     & Algebra of bounded operators $\mathcal{B}(\mathscr{H})$                                                                                                   \\
-            \hline
-            $f\mapsto \bar{f}$ complex conjugation                                                                                                              & $P\mapsto P^*$ adjoint                                                                                                                                    \\
-            \hline
-            Events: indicator functions of sets                                                                                                                 & Projections: space of orthogonal projections $\mathscr{P}\subseteq\mathscr{B}(\mathscr{H})$                                                               \\
-            \hline
-            functions $f$ such that $f^2=f=\overline{f}$                                                                                                        & orthogonal projections $P$ such that $P^*=P=P^2$                                                                                                          \\
-            \hline
-            $\mathbb{R}$-valued functions $f=\overline{f}$                                                                                                      & self-adjoint operators $A=A^*$                                                                                                                            \\
-            \hline
-            $\mathbb{I}_{f^{-1}(\{\lambda\})}$ is the indicator function of the set $f^{-1}(\{\lambda\})$                                                       & $P(\lambda)$ is the orthogonal projection to eigenspace                                                                                                   \\
-            \hline
-            $f=\sum_{\lambda\in \operatorname{Range}(f)}\lambda \mathbb{I}_{f^{-1}(\{\lambda\})}$                                                               & $A=\sum_{\lambda\in \operatorname{sp}(A)}\lambda P(\lambda)$                                                                                              \\
-            \hline
-            Probability measure $\mu$ on $\Omega$                                                                                                               & Density operator $\rho$ on $\mathscr{H}$                                                                                                                  \\
-            \hline
-            Delta measure $\delta_\omega$                                                                                                                       & Pure state $\rho=\vert\psi\rangle\langle\psi\vert$                                                                                                        \\
-            \hline
-            $\mu$ is non-negative measure and $\sum_{i=1}^n\mu(\{i\})=1$                                                                                        & $\rho$ is positive semi-definite and $\operatorname{Tr}(\rho)=1$                                                                                          \\
-            \hline
-            Expected value of random variable $f$ is $\mathbb{E}_{\mu}(f)=\sum_{i=1}^n f(i)\mu(\{i\})$                                                          & Expected value of operator $A$ is $\mathbb{E}_\rho(A)=\operatorname{Tr}(\rho A)$                                                                          \\
-            \hline
-            Variance of random variable $f$ is $\operatorname{Var}_\mu(f)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))^2\mu(\{i\})$                                    & Variance of operator $A$ is $\operatorname{Var}_\rho(A)=\operatorname{Tr}(\rho A^2)-\operatorname{Tr}(\rho A)^2$                                          \\
-            \hline
-            Covariance of random variables $f$ and $g$ is $\operatorname{Cov}_\mu(f,g)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))(g(i)-\mathbb{E}_\mu(g))\mu(\{i\})$ & Covariance of operators $A$ and $B$ is $\operatorname{Cov}_\rho(A,B)=\operatorname{Tr}(\rho A\circ B)-\operatorname{Tr}(\rho A)\operatorname{Tr}(\rho B)$ \\
-            \hline
-            Composite system is given by Cartesian product of the sample spaces $\Omega_1\times\Omega_2$                                                        & Composite system is given by tensor product of the Hilbert spaces $\mathscr{H}_1\otimes\mathscr{H}_2$                                                     \\
-            \hline
-            Product measure $\mu_1\times\mu_2$ on $\Omega_1\times\Omega_2$                                                                                      & Tensor product of space $\rho_1\otimes\rho_2$ on $\mathscr{H}_1\otimes\mathscr{H}_2$                                                                      \\
-            \hline
-            Marginal distribution $\pi_*v$                                                                                                                      & Partial trace $\operatorname{Tr}_2(\rho)$                                                                                                                 \\
-            \hline
-        \end{tabular}
-    }
-    \vspace{0.5cm}
-\end{table}
-
-\section{Manifolds}
-
-In this section, we will introduce some basic definitions and theorems used in manifold theory that are relevant to our study. Assuming no prior knowledge of manifold theory but basic topology understanding. We will provide brief definitions and explanations for each term. From the most abstract Manifold definition to the Riemannian manifolds and related theorems.
-
-\subsection{Manifolds}
-
-\begin{defn}
-    \label{defn:m-manifold}
-
-An $m$-manifold is a Topological space $X$ that is
-
-\begin{enumerate}
-    \item Hausdroff: every distinct two points in $X$ can be separated by two disjoint open sets.
-    \item Second countable: $X$ has countable basis.
-    \item Every point $p$ has an open neighborhood $p\in U$ that is homeomorphic to an open subset of $\mathbb{R}^m$.
-\end{enumerate}
-\end{defn}
-
-
-\begin{examples}
-    \label{example:second_countable_space}
-    Let $X=\mathbb{R}$ and $\mathcal{B}=\{(a,b)|a,b\in \mathbb{R},a<b\}$ (collection of all open intervals with rational endpoints).
-
-    Since the rational numbers are countable, so $\mathcal{B}$ is countable.
-
-    So $\mathbb{R}$ is second countable.
-
-    Likewise, $\mathbb{R}^n$ is also second countable.
-\end{examples}
-
-\begin{examples}
-    \label{example:manifold}
-1-manifold is a curve and 2-manifold is a surface.
-\end{examples}
-
-\begin{theorem}
-    \label{Theorem of imbedded space}
-    If $X$ is a compact $m$-manifold, then $X$ can be imbedded in $\mathbb{R}^n$ for some $n$.
-\end{theorem}
-
-This theorem might save you from imagining abstract structures back to real dimension. Good news, at least you stay in some real numbers.
-
-\subsection{Smooth manifolds and Lie groups}
-
-This section is adopted from \cite{lee_introduction_2012}
-
-\begin{defn}
-    \label{defn:partial_derivative}
-
-    Let $U\subseteq \mathbb{R}^n$ and $f:U\to \mathbb{R}^n$ be a map.
-
-    For any $a=(a_1,\cdots,a_n)\in U$, $j\in \{1,\cdots,n\}$, the $j$-th partial derivative of $F$ at $a$ is defined as
-
-    $$
-    \begin{aligned}
-    \frac{\partial f}{\partial x_j}(a)&=\lim_{h\to 0}\frac{f(a_1,\cdots,a_j+h,\cdots,a_n)-f(a_1,\cdots,a_j,\cdots,a_n)}{h} \\
-    &=\lim_{h\to 0}\frac{f(a+he_j)-f(a)}{h}
-    \end{aligned}
-$$
-
-\end{defn}
-
-\begin{defn}
-    \label{defn:continuously_differentiable_map}
-    Let $U\subseteq \mathbb{R}^n$ and $f:U\to \mathbb{R}^n$ be a map.
-
-    If for any $j\in \{1,\cdots,n\}$, the $j$-th partial derivative of $f$ is continuous at $a$, then $f$ is continuously differentiable at $a$.
-
-    If $\forall a\in U$, $\frac{\partial f}{\partial x_j}$ exists and is continuous at $a$, then $f$ is continuously differentiable on $U$. or $C^1$ map. (Note that $C^0$ map is just a continuous map.)
-\end{defn} 
-
-
-\begin{defn}
-    \label{defn:smooth_map}
-    A function $f:U\to \mathbb{R}^n$ is smooth if it is of class $C^k$ for every $k\geq 0$ on $U$. Such function is called a diffeomorphism if it is also a \textbf{\texttt{bijection}} and its \textbf{\texttt{inverse is also smooth}}.
-\end{defn}
-
-
-\begin{defn}
-    \label{defn:chart}
-
-    Let $M$ be a smooth manifold. A \textbf{\texttt{chart}} is a pair $(U,\varphi)$ where $U\subseteq M$ is an open subset and $\varphi:U\to \hat{U}\subseteq \mathbb{R}^n$ is a homeomorphism (a continuous bijection map and its inverse is also continuous).
-
-    If $p\in U$ and $\varphi(p)=0$, then we say that $p$ is the origin of the chart $(U,\varphi)$.
-
-    For $p\in U$, we note that the continuous function $\varphi(p)=(x_1(p),\cdots,x_n(p))$ gives a vector in $\mathbb{R}^n$. The $(x_1(p),\cdots,x_n(p))$ is called the \textbf{\texttt{local coordinates}} of $p$ in the chart $(U,\varphi)$.
-
-\end{defn}
-
-\begin{defn}
-    \label{defn:atlas}
-    Let $M$ be a smooth manifold. An \textbf{\texttt{atlas}} is a collection of charts $\mathcal{A}=\{(U_\alpha,\phi_\alpha)\}_{\alpha\in I}$ such that $M=\bigcup_{\alpha\in I} U_\alpha$.
-
-    An atlas is said to be \textbf{\texttt{smooth}} if the transition maps $\phi_\alpha\circ \phi_\beta^{-1}:\phi_\beta(U_\alpha\cap U_\beta)\to \phi_\alpha(U_\alpha\cap U_\beta)$ are smooth for all $\alpha, \beta\in I$.
-\end{defn}
-
-
-\begin{defn}
-    \label{defn:smooth_manifold}
-    A smooth manifold is a pair $(M,\mathcal{A})$ where $M$ is a topological manifold and $\mathcal{A}$ is a smooth atlas.
-\end{defn}
-
-TODO: There is some section gaps here, from smooth manifold to smooth submersion. 
-
-Here are some additional propositions that will be helpful for our study in later sections:
-
-This one is from \cite{lee_introduction_2012} Theorem 4.26
-
-\begin{theorem}
-    \label{theorem:local_section_theorem}
-
-    Let $M$ and $N$ be smooth manifolds and $\pi:M\to N$ is a smooth map. Then $\pi$ is a smooth submersion if and only if every point of $M$ is in the image of a smooth local section of $\pi$ (a local section of $\pi$ is a map $\sigma:U\to M$ defined on some open subset $U\subseteq N$ with $\pi\circ \sigma=Id_U$).   
-\end{theorem}
-
-
-
-\section{Quantum physics and terminologies}
-
-In this section, we will introduce some terminologies and theorems used in quantum physics that are relevant to our study. Assuming no prior knowledge of quantum physics, we will provide brief definitions and explanations for each term.
-
-One might ask, what is the fundamental difference between a quantum system and a classical system, and why can we not directly apply those theorems in classical computers to a quantum computer? It turns out that quantum error-correcting codes are hard due to the following definitions and features for quantum computing.
-
-\begin{defn}
- All quantum operations can be constructed by composing four kinds of transformations: (adapted from Chapter 10 of \cite{Bengtsson_Zyczkowski_2017})
-
-  \begin{enumerate}
-    \item Unitary operations. $U(\cdot)$ for any quantum state. It is possible to apply a non-unitary operation for an open quantum system, but that is usually not the focus for quantum computing and usually leads to non-recoverable loss of information that we wish to obtain.
-    \item Extend the system. Given a quantum state $\rho\in\mathcal{H}^N$, we can extend it to a larger quantum system by "entangle" (For this report, you don't need to worry for how quantum entanglement works) it with some new states $\sigma\in \mathcal{H}^K$ (The space where the new state dwells is usually called ancilla system) and get $\rho'=\rho\otimes\sigma\in \mathcal{H}^N\otimes \mathcal{K}$.
-    \item Partial trace. Given a quantum state $\rho\in\mathcal{H}^N$ and some reference state $\sigma\in\mathcal {H}^K$, we can trace out some subsystems and get a new state $\rho'\in\mathcal{H}^{N-K}$.
-    \item Selective measurement. Given a quantum state, we measure it and get a classical bit; unlike the classical case, the measurement is a probabilistic operation. (More specifically, this is some projection to a reference state corresponding to a classical bit output. For this report, you don't need to worry about how such a result is obtained and how the reference state is constructed.)
-  \end{enumerate}
-\end{defn}
-
-
-$U(n)$ is the group of all $n\times n$ \textbf{unitary matrices} over $\mathbb{C}$, 
-
-$$
-U(n)=\{A\in \mathbb{C}^{n\times n}: A^*A=AA^*=I_n\}
-$$
-
-The uniqueness of such measurement came from the lemma below~\cite{Elizabeth_book}
-
-\begin{lemma}
-    \label{lemma:haar_measure}
-
-    Let $(U(n), \| \cdot \|, \mu)$ be a metric measure space where $\| \cdot \|$ is the Hilbert-Schmidt norm and $\mu$ is the measure function.
-
-    The Haar measure on $U(n)$ is the unique probability measure that is invariant under the action of $U(n)$ on itself.
-    
-    That is, fixing $B\in U(n)$, $\forall A\in U(n)$, $\mu(A\cdot B)=\mu(B\cdot A)=\mu(B)$.
-    
-    The Haar measure is the unique probability measure that is invariant under the action of $U(n)$ on itself.
-\end{lemma}
-
-
-\begin{defn}
-    \label{defn:pure_state}
-    Pure state:
-
-A random pure state $\varphi$ is any random variable distributed according to the unitarily invariant probability measure on the pure states $\mathcal{P}(A)$ of the system $A$, denoted by $\varphi\in_R\mathcal{P}(A)$.
-\end{defn}
-
-It is trivial that for the space of pure state, we can easily apply the Haar measure as the unitarily invariant probability measure since the space of pure state is $S^n$ for some $n$. However, for the case of mixed states, that is a bit complicated and we need to use partial tracing to defined the rank-$s$ random states.
-
-\begin{defn}
-    \label{defn:rank_s_random_state}
-    Rank-$s$ random state.
-
-    For a system $A$ and an integer $s\geq 1$, consider the distribution onn the mixed states $\mathcal{S}(A)$ of A induced by the partial trace over the second factor form the uniform distribution on pure states of $A\otimes\mathbb{C}^s$. Any random variable $\rho$ distributed as such will be called a rank-$s$ random states; denoted as $\rho\in_R \mathcal{S}_s(A)$. And $\mathcal{P}(A)=\mathcal{S}_1(A)$.
-\end{defn}
-
-
-\begin{prop}
-    \label{prop:indistinguishability}
-    Proposition of indistinguishability:
-
-    Suppose that we have two systems $u_1,u_2\in \mathscr{H}_1$, the two states are distinguishable if and only if they are orthogonal.
-\end{prop}
-
-\begin{proof}
-    Ways to distinguish the two states:
-    \begin{enumerate}
-        \item Set $X=\{0,1,2\}$ and $M_i=|u_i\rangle\langle u_i|$, $M_0=I-M_1-M_2$
-        \item Then $\{M_0,M_1,M_2\}$ is a complete collection of measurement operators on $\mathscr{H}$.
-        \item Suppose the prepared state is $u_1$, then $p(1)=\|M_1u_1\|^2=\|u_1\|^2=1$, $p(2)=\|M_2u_1\|^2=0$, $p(0)=\|M_0u_1\|^2=0$.
-    \end{enumerate}
-
-    If they are not orthogonal, then there is no choice of measurement operators to perfectly distinguish the two states.
-
-\end{proof}
-
-Intuitively, if the two states are not orthogonal, then for any measurement (projection) there exists non-zero probability of getting the same outcome for both states.
-
-\subsection{Random quantum states}
-
-First, we need to define what is a random state in a bipartite system.
-
-
-% When compiled standalone, print this chapter's references at the end.
-\ifSubfilesClassLoaded{
-    \printbibliography[title={References}]
-}
-
-\end{document}
+% chapters/chap0.tex
+\documentclass[../main.tex]{subfiles}
+
+% If this chapter is compiled *by itself*, we must load only its own .bib
+% and print its bibliography at the end of the chapter.
+\ifSubfilesClassLoaded{
+  \addbibresource{\subfix{../main.bib}}
+}
+
+\begin{document}
+
+\chapter*{Chapter 0: Brief definitions and basic concepts}
+\addcontentsline{toc}{chapter}{Chapter 0: Brief definitions and basic concepts}
+\markboth{Chapter 0: Brief definitions and basic concepts}{}
+
+As the future version of me might forget everything we have done over the summer, as I have now, I will review everything again, starting from the simplest definitions, to recall the necessary information and to explain why we are here and how we are going to proceed.
+
+This section serves as a reference for the definitions, notations, and theorems that we will use later. It can be safely skipped if you are already familiar with these definitions and theorems.
+
+But for the future self who might have no idea what I'm talking about, we will provide detailed definitions to help you understand the concepts.
+
+\section{Complex vector spaces}
+
+The main vector space we are interested in is $\mathbb{C}^n$; therefore, all the linear operators we defined are from $\mathbb{C}^n$ to $\mathbb{C}^n$.
+
+\begin{defn}
+    \label{defn:braket}
+
+    We denote a vector in a vector space as $\ket{\psi}=(z_1,\ldots,z_n)$, where $z_i\in\mathbb{C}$ (the space might also be infinite-dimensional).
+
+\end{defn}
+
+
+Here $\psi$ is just a label for the vector, and you don't need to worry about it too much. This is also called the ket, where the counterpart $\bra{\psi}$ is called the bra, used to denote the vector dual to $\psi$; such an element is a linear functional if you really want to know what that is.
+
+A few additional notations will be introduced. In this document, we will follow the notation used in the mathematics literature~\cite{axler2023linear}.
+
+\begin{itemize}
+    \item $\langle\psi|\varphi\rangle$ is the inner product between two vectors, and $\bra{\psi} A\ket{\varphi}$ is the inner product between $\ket{\psi}$ and $A\ket{\varphi}$, or equivalently between $A^*\ket{\psi}$ and $\ket{\varphi}$ (here $A^*$ is the adjoint of $A$, written $A^\dagger$ in the physics literature).
+    \item Given a complex matrix $A\in\mathbb{C}^{n\times n}$,
+          \begin{enumerate}
+              \item  $\overline{A}$ is the complex conjugate of $A$.
+                    \begin{examples}
+                        $$
+                            A=\begin{bmatrix}
+                                1+i & 2+i & 3+i \\
+                                4+i & 5+i & 6+i \\
+                                7+i & 8+i & 9+i\end{bmatrix},
+                            \overline{A}=\begin{bmatrix}
+                                1-i & 2-i & 3-i \\
+                                4-i & 5-i & 6-i \\
+                                7-i & 8-i & 9-i
+                            \end{bmatrix}
+                        $$
+                    \end{examples}
+              \item  $A^\top$ denotes the transpose of $A$.
+                    \begin{examples}
+                        $$
+                            A=\begin{bmatrix}
+                                1+i & 2+i & 3+i \\
+                                4+i & 5+i & 6+i \\
+                                7+i & 8+i & 9+i
+                            \end{bmatrix},
+                            A^\top=\begin{bmatrix}
+                                1+i & 4+i & 7+i \\
+                                2+i & 5+i & 8+i \\
+                                3+i & 6+i & 9+i
+                            \end{bmatrix}
+                        $$
+                    \end{examples}
+              \item $A^*=\overline{(A^\top)}$ denotes the complex conjugate transpose, referred to as the adjoint, or Hermitian conjugate of $A$.
+                    \begin{examples}
+                        $$
+                            A=\begin{bmatrix}
+                                1+i & 2+i & 3+i \\
+                                4+i & 5+i & 6+i \\
+                                7+i & 8+i & 9+i
+                            \end{bmatrix},
+                            A^*=\begin{bmatrix}
+                                1-i & 4-i & 7-i \\
+                                2-i & 5-i & 8-i \\
+                                3-i & 6-i & 9-i
+                            \end{bmatrix}
+                        $$
+                    \end{examples}
+              \item  $A$ is unitary if $A^* A=AA^*=I$.
+              \item  $A$ is self-adjoint (hermitian in physics literature) if $A^*=A$.
+          \end{enumerate}
+\end{itemize}
+
+\subsubsection{Motivation of Tensor product}
+
+Recall the traditional notion of the product space of two vector spaces $V$ and $W$: the space $V\times W$ is the set of all ordered pairs $(\ket{v},\ket{w})$ where $\ket{v}\in V$ and $\ket{w}\in W$.
+
+The space has dimension $\dim V+\dim W$.
+
+We want to define a vector space equipped with a notion of multiplication of two vectors from different vector spaces.
+
+That is
+
+$$
+    (\ket{v_1}+\ket{v_2})\otimes \ket{w}=(\ket{v_1}\otimes \ket{w})+(\ket{v_2}\otimes \ket{w})
+$$
+$$
+    \ket{v}\otimes (\ket{w_1}+\ket{w_2})=(\ket{v}\otimes \ket{w_1})+(\ket{v}\otimes \ket{w_2})
+$$
+
+and enables scalar multiplication by
+
+$$
+    \lambda (\ket{v}\otimes \ket{w})=(\lambda \ket{v})\otimes \ket{w}=\ket{v}\otimes (\lambda \ket{w})
+$$
+
+And we wish to build a way to associate the basis of $V$ and $W$ with the basis of $V\otimes W$. That makes the tensor product a vector space with dimension $\dim V\times \dim W$.
+
+\begin{defn}
+    \label{defn:linear_functional}
+    Definition of linear functional
+
+    A linear functional is a linear map from $V$ to $\mathbb{F}$.
+
+\end{defn}
+
+Note the difference between a linear functional and a linear map.
+
+A general linear map is a function $f: V\to W$ satisfying the following conditions:
+
+\begin{itemize}
+    \item $f(\ket{u}+\ket{v})=f(\ket{u})+f(\ket{v})$
+    \item $f(\lambda \ket{v})=\lambda f(\ket{v})$
+\end{itemize}
+
+
+\begin{defn}
+    \label{defn:bilinear_functional}
+    A bilinear functional is a function $\beta:V\times W\to \mathbb{F}$ such that $\ket{v}\mapsto \beta(\ket{v},\ket{w})$ is a linear functional on $V$ for every $\ket{w}\in W$ and $\ket{w}\mapsto \beta(\ket{v},\ket{w})$ is a linear functional on $W$ for every $\ket{v}\in V$.
+
+\end{defn}
+
+The vector space of all bilinear functionals is denoted by $\mathcal{B}(V, W)$.
+
+
+\begin{defn}
+    \label{defn:tensor_product}
+    Let $V, W$ be two vector spaces.
+
+    Let $V'$ and $W'$ be the dual spaces of $V$ and $W$, respectively, that is $V'=\{\psi:V\to \mathbb{F}\}$ and $W'=\{\phi:W\to \mathbb{F}\}$, $\psi, \phi$ are linear functionals.
+
+    The tensor product of vectors $v\in V$ and $w\in W$ is the bilinear functional $v\otimes w:V'\times W'\to \mathbb{F}$ defined, for all $(\psi,\phi)\in V'\times W'$, by
+
+    $$
+        (v\otimes w)(\psi,\phi)=\psi(v)\phi(w)
+    $$
+
+    The tensor product of two vector spaces $V$ and $W$ is the vector space $\mathcal{B}(V',W')$
+
+    Notice that a basis of this vector space is built from bases of $V$ and $W$, that is, if $\{e_i\}_{i=1}^n$ is a basis of $V$ and $\{f_j\}_{j=1}^m$ is a basis of $W$, then $\{e_i\otimes f_j\}$ is a basis of $\mathcal{B}(V', W')$.
+
+    That is, every element of $\mathcal{B}(V', W')$ can be written as a linear combination of the basis.
+
+    Since $\{e_i\}$ and $\{f_j\}$ are bases of $V$ and $W$, respectively, we can always find linear functionals $\{\phi_i\}\subset V'$ and $\{\psi_j\}\subset W'$ (the dual bases) such that $\phi_i(e_j)=\delta_{ij}$ and $\psi_j(f_i)=\delta_{ij}$.
+
+    Here $\delta_{ij}=\begin{cases}
+            1 & \text{if } i=j   \\
+            0 & \text{otherwise}
+        \end{cases}$ is the Kronecker delta.
+
+    $$
+        V\otimes W=\left\{\sum_{i=1}^n \sum_{j=1}^m a_{ij}\, e_i\otimes f_j: a_{ij}\in \mathbb{F}\right\}
+    $$
+
+\end{defn}
+
+Note that $\sum_{i=1}^n \sum_{j=1}^m a_{ij}\, e_i\otimes f_j$ is a bilinear functional that maps $V'\times W'$ to $\mathbb{F}$; it sends $(\psi,\phi)$ to $\sum_{i=1}^n \sum_{j=1}^m a_{ij}\psi(e_i)\phi(f_j)$.
+
+This enables basis-free construction of vector spaces with proper multiplication and scalar multiplication.
+
+\begin{examples}[Examples of tensor product for vectors]
+
+    Let $V = \mathbb{C}^2, W = \mathbb{C}^3$, choose bases $\{\ket{0}, \ket{1}\} \subset V, \{\ket{0}, \ket{1}, \ket{2}\} \subset W$.
+
+    $$
+    v=\begin{pmatrix}
+            v_1 \\
+            v_2
+        \end{pmatrix}=v_1\ket{0}+v_2\ket{1}\in V,w=\begin{pmatrix}
+            w_1 \\
+            w_2 \\
+            w_3
+        \end{pmatrix}=w_1\ket{0}+w_2\ket{1}+w_3\ket{2}\in W.
+    $$
+
+    Then the tensor product $v\otimes w$ is given by
+
+    $$
+        v\otimes w=\begin{pmatrix}
+            v_1 w_1 &v_1 w_2 &v_1 w_3 \\
+            v_2 w_1 &v_2 w_2 &v_2 w_3
+        \end{pmatrix}\in \mathbb{C}^{2\times 3}\cong \mathbb{C}^6
+    $$
+\end{examples}
+
+\begin{examples}[Examples of tensor product for vector spaces]
+
+Let $V = \mathbb{C}^2, W = \mathbb{C}^3$, choose bases $\{\ket{0}, \ket{1}\} \subset V, \{\ket{0}, \ket{1}, \ket{2}\} \subset W.$
+
+Then a basis of the tensor product is
+$$
+\{
+\ket{00}, \ket{01}, \ket{02},
+\ket{10}, \ket{11}, \ket{12}
+\},
+$$
+where $\ket{ij} := \ket{i}\otimes\ket{j}$.
+
+An example element of $V \otimes W$ is
+$$
+\ket{\psi}
+=
+2\,\ket{0}\otimes\ket{1}
++
+(1+i)\,\ket{1}\otimes\ket{0}
+-
+i\,\ket{1}\otimes\ket{2}.
+$$
+
+With respect to the ordered basis
+$$
+(\ket{00}, \ket{01}, \ket{02}, \ket{10}, \ket{11}, \ket{12}),
+$$
+this tensor corresponds to the coordinate vector
+$$
+\ket{\psi}
+\;\longleftrightarrow\;
+\begin{pmatrix}
+0\\
+2\\
+0\\
+1+i\\
+0\\
+-i
+\end{pmatrix}
+\in \mathbb{C}^6.
+$$
+
+Using the canonical identification
+$$
+\mathbb{C}^2 \otimes \mathbb{C}^3 \cong \mathbb{C}^{2\times 3},
+$$
+where
+$$
+\ket{i}\otimes\ket{j} \longmapsto E_{ij},
+$$
+the same tensor is represented by the matrix
+$$
+\ket{\psi}
+\;\longleftrightarrow\;
+\begin{pmatrix}
+0 & 2 & 0\\
+1+i & 0 & -i
+\end{pmatrix}.
+$$
+
+\end{examples}
+
+\begin{defn}
+    \label{defn:inner_product_on_tensor_product}
+
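+    The vector space defined by the tensor product is equipped with the unique inner product $\langle \cdot,\cdot\rangle_{V\otimes W}: (V\otimes W)\times (V\otimes W)\to \mathbb{F}$ that is defined on simple tensors by the formula below and extended by linearity in the second argument and conjugate-linearity in the first: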
+
+    $$
+        \langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle_V\langle w,x\rangle_W
+    $$
+\end{defn}
+
+In practice, we ignore the subscript of the vector space and just write $\langle v\otimes w, u\otimes x\rangle=\langle v,u\rangle\langle w,x\rangle$.
+
+\subsubsection{Partial trace}
+
+\begin{defn}
+
+\label{defn:trace}
+
+Let $T$ be a linear operator on $\mathscr{H}$, $(e_1,e_2,\cdots,e_n)$ be an orthonormal basis of $\mathscr{H}$ and $(\epsilon_1,\epsilon_2,\cdots,\epsilon_n)$ be the corresponding dual basis of the dual space $\mathscr{H}^*$, that is, $\epsilon_i(e_j)=\delta_{ij}$. Then the trace of $T$ is defined by
+
+$$
+\operatorname{Tr}(T)=\sum_{i=1}^n \epsilon_i(T(e_i))=\sum_{i=1}^n \langle e_i,T(e_i)\rangle
+$$
+
+\end{defn}
+
+This is equivalent to the sum of the diagonal elements of $T$.
+
+\begin{defn}
+    \label{defn:partial_trace}
+
+Let $T$ be a linear operator on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$, where $\mathscr{A}$ and $\mathscr{B}$ are finite-dimensional Hilbert spaces.
+
+An operator $T$ on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$ can be written as 
+
+$$
+T=\sum_{i=1}^n a_i A_i\otimes B_i
+$$
+
+where $A_i$ is a linear operator on $\mathscr{A}$ and $B_i$ is a linear operator on $\mathscr{B}$.
+
+The $\mathscr{B}$-partial trace of $T$, where $\operatorname{Tr}_{\mathscr{B}}:\mathcal{L}(\mathscr{A}\otimes \mathscr{B})\to \mathcal{L}(\mathscr{A})$, is the linear operator on $\mathscr{A}$ defined by
+
+$$
+\operatorname{Tr}_{\mathscr{B}}(T)=\sum_{i=1}^n a_i \operatorname{Tr}(B_i) A_i
+$$
+
+
+\end{defn}
+Or we can define the map $L_v: \mathscr{A}\to \mathscr{A}\otimes \mathscr{B}$ by
+
+$$
+L_v(u)=u\otimes v
+$$
+
+Note that the adjoint $L_v^*:\mathscr{A}\otimes \mathscr{B}\to \mathscr{A}$ satisfies $\langle u,L_v^*(u'\otimes v')\rangle=\langle L_v(u),u'\otimes v'\rangle=\langle u\otimes v,u'\otimes v'\rangle=\langle u,u'\rangle \langle v,v'\rangle$.
+
+Therefore, $L_v^*\sum_{j} u_j\otimes v_j=\sum_{j} \langle v,v_j\rangle u_j$.
+
+Then the partial trace of $T$ can also be defined as follows.
+
+Let $\{v_j\}$ be an orthonormal basis of $\mathscr{B}$. Then
+
+$$
+\operatorname{Tr}_{\mathscr{B}}(T)=\sum_{j} L^*_{v_j}\,T\,L_{v_j}
+$$
+
+
+\begin{defn}
+    \label{defn:partial_trace_with_respect_to_state}
+Let $T$ be a linear operator on $\mathscr{H}=\mathscr{A}\otimes \mathscr{B}$, where $\mathscr{A}$ and $\mathscr{B}$ are finite-dimensional Hilbert spaces.
+
+Let $\rho$ be a state on $\mathscr{B}$ with an orthonormal eigenbasis $\{v_j\}$ and corresponding eigenvalues $\{\lambda_j\}$.
+
+The partial trace of $T$ with respect to $\rho$ is the linear operator on $\mathscr{A}$ defined by
+
+$$
+\operatorname{Tr}_{\rho}(T)=\sum_{j} \lambda_j L^*_{v_j}\,T\,L_{v_j}
+$$
+\end{defn}
+
+
+This introduces a new model in mathematics explaining quantum mechanics: the non-commutative probability theory.
+
+\section{Non-commutative probability theory}
+
+The non-commutative probability theory is a branch of generalized probability theory that studies the probability of events in non-commutative algebras.
+
+There are several main components of the generalized probability theory; let's see how we can formulate them, comparing with the classical probability theory.
+
+First, we define the Hilbert space in case one did not make the step from the linear algebra courses like me.
+
+\begin{defn}
+    \label{defn:Hilbert_space}
+    Hilbert space:
+
+    A Hilbert space is a complete inner product space.
+\end{defn}
+
+That is, it is a vector space equipped with an inner product such that the metric space obtained from the induced norm $\|x\|=\sqrt{\langle x,x\rangle}$ is complete. Recall that complete means that every Cauchy sequence (a sequence $(x_n)$ such that for any $\epsilon>0$, there exists an $N$ such that for all $m,n\geq N$, we have $\|x_m-x_n\|<\epsilon$) converges to a limit in the space.
+
+As a side note for later use, we also define the Borel measure on a space; here we use the following definition, specialized to the spaces (manifolds) we are interested in.
+
+\begin{defn}
+    \label{defn:Borel_measure}
+    Borel measure:
+    
+    Let $X$ be a topological space, and let $\mathscr{B}(X)$ be the Borel $\sigma$-algebra of $X$ (the $\sigma$-algebra generated by the open sets of $X$). A Borel measure on $X$ is a map $\mu:\mathscr{B}(X)\to [0,\infty]$ satisfying the following properties:
+
+    \begin{enumerate}
+        \item $X \in \mathscr{B}(X)$ and $\mu(\emptyset)=0$.
+        \item Complements: if $A\in \mathscr{B}(X)$, then $A^c\in \mathscr{B}(X)$, and $\mu(A^c)=\mu(X)-\mu(A)$ whenever $\mu(A)<\infty$.
+        \item Countable additivity: if $E_1,E_2,\cdots\in \mathscr{B}(X)$ are pairwise disjoint sets, then $\bigcup_{i=1}^\infty E_i\in \mathscr{B}(X)$ and $\mu(\bigcup_{i=1}^\infty E_i)=\sum_{i=1}^\infty \mu(E_i)$.
+    \end{enumerate}
+\end{defn}
+
+In later sections, we will use the Lebesgue measure and the Haar measure in various circumstances; their detailed definitions may be introduced in those sections.
+
+\begin{examples}
+
+To introduce an example of Hilbert space we use when studying quantum mechanics, we need to introduce a common inner product used in $\mathbb{C}^n$.
+
+
+\begin{prop}
+    \label{prop:Hermitian_inner_product_with_complex_vectorspace}
+    The Hermitian inner product on the complex vector space $\C^n$ makes it a Hilbert space.
+\end{prop}
+
+\begin{proof}
+    We first verify that the Hermitian inner product
+    $$
+        \langle u,v\rangle = \sum_{i=1}^n \overline{u_i} v_i
+    $$
+    on $\C^n$ satisfies the axioms of an inner product:
+    \begin{enumerate}
+        \item \textbf{Conjugate symmetry:} For all $u,v\in\C^n$,
+              $$
+                  \langle u,v\rangle =\sum_{i=1}^n \overline{u_i} v_i=\overline{\sum_{i=1}^n \overline{v_i} u_i}=\overline{\langle v,u\rangle}.
+              $$
+        \item \textbf{Linearity:} For any $u,v,w\in\C^n$ and scalars $a,b\in\C$, we have
+              $$
+                  \langle u, av + bw\rangle = \sum_{i=1}^n \overline{u_i} (av_i + bw_i)=a\langle u,v\rangle + b\langle u,w\rangle.
+              $$
+        \item \textbf{Positive definiteness:} For every $u=(u_1,u_2,\cdots,u_n)\in\C^n$, let $u_j=a_j+b_j i$, where $a_j,b_j\in\mathbb{R}$. Then
+              $$
+                  \langle u,u\rangle = \sum_{j=1}^n \overline{u_j} u_j=\sum_{j=1}^n (a_j^2+b_j^2)\geq 0,
+              $$
+              with equality if and only if $u=0$.
+
+              Therefore, the Hermitian inner product is an inner product.
+    \end{enumerate}
+
+    Next, we show that $\C^n$ is complete with respect to the norm induced by this inner product:
+    $$
+        \|u\| = \sqrt{\langle u,u\rangle}.
+    $$
+    Since $\C^n$ is finite-dimensional, every Cauchy sequence (with respect to any norm) converges in $\C^n$. This is a standard result in finite-dimensional normed spaces, which implies that $\C^n$ is indeed complete.
+
+    Therefore, since the Hermitian inner product fulfills the inner product axioms and $\C^n$ is complete, the complex vector space $\C^n$ with the Hermitian inner product is a Hilbert space.
+\end{proof}
+
+\end{examples}
+
+Another classical example of Hilbert space is $L^2(\Omega, \mathscr{F}, P)$, where $(\Omega, \mathscr{F}, P)$ is a measure space ($\Omega$ is a set, $\mathscr{F}$ is a $\sigma$-algebra on $\Omega$, and $P$ is a measure on $\mathscr{F}$). The $L^2$ space is the space of all functions on $\Omega$ (identified when they agree $P$-almost everywhere) that are
+
+\begin{enumerate}
+    \item \textbf{square integrable}: square integrable functions are the functions $f:\Omega\to \mathbb{C}$ such that
+          $$
+              \int_\Omega |f(\omega)|^2 dP(\omega)<\infty
+          $$
+          with inner product defined by
+          $$
+              \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
+          $$
+
+    \item \textbf{complex-valued}: functions are complex-valued measurable. $f=u+v i$ is complex-valued if $u$ and $v$ are real-valued measurable.
+\end{enumerate}
+
+\begin{examples}
+    
+
+\begin{prop}
+    \label{prop:L2_space_is_a_Hilbert_space}
+    $L^2(\Omega, \mathscr{F}, P)$ is a Hilbert space.
+\end{prop}
+
+\begin{proof}
+    We check the two conditions of the Hilbert space:
+    \begin{itemize}
+        \item Completeness:
+              Let $(f_n)$ be a Cauchy sequence in $L^2(\Omega, \mathscr{F}, P)$. Then for any $\epsilon>0$, there exists an $N$ such that for all $m,n\geq N$, we have
+              $$
+                  \int_\Omega |f_m(\omega)-f_n(\omega)|^2 dP(\omega)<\epsilon^2
+              $$
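+              This means that $(f_n)$ is a Cauchy sequence in the norm of $L^2(\Omega, \mathscr{F}, P)$. By the Riesz--Fischer theorem, every such sequence converges in this norm to some $f\in L^2(\Omega, \mathscr{F}, P)$, so the space is complete.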
+        \item Inner product:
+              The inner product is defined by
+              $$
+                  \langle f,g\rangle=\int_\Omega \overline{f(\omega)}g(\omega)dP(\omega)
+              $$
+              This is a well-defined inner product on $L^2(\Omega, \mathscr{F}, P)$. We can check the properties of the inner product:
+              \begin{itemize}
+                  \item Linearity (in the second argument):
+                        $$
+                            \langle h,af+bg\rangle=a\langle h,f\rangle+b\langle h,g\rangle
+                        $$
+                  \item Conjugate symmetry:
+                        $$
+                            \langle f,g\rangle=\overline{\langle g,f\rangle}
+                        $$
+                  \item Positive definiteness:
+                        $$
+                            \langle f,f\rangle=\int_\Omega |f(\omega)|^2 dP(\omega)\geq 0,
+                        $$
+                        with equality if and only if $f=0$ $P$-almost everywhere.
+              \end{itemize}
+    \end{itemize}
+\end{proof}
+
+\end{examples}
+
+Let $\mathscr{H}$ be a Hilbert space. $\mathscr{H}$ consists of complex-valued functions on a finite set $\Omega=\{1,2,\ldots,n\}$, and the functions $(e_1,e_2,\ldots,e_n)$ form an orthonormal basis of $\mathscr{H}$. (We use Dirac notation $|k\rangle$ to denote the basis vector $e_k$~\cite{parthasarathy1992quantum}.)
+
+As an analog to the classical probability space $(\Omega,\mathscr{F},\mu)$, which consists of a sample space $\Omega$, a $\sigma$-algebra of events $\mathscr{F}$, and a probability measure $\mu$ on $\mathscr{F}$, the non-commutative probability space $(\mathscr{H},\mathscr{P},\rho)$ consists of a Hilbert space $\mathscr{H}$, the space $\mathscr{P}$ of all orthogonal projections on $\mathscr{H}$, and a state $\rho$ on $\mathscr{P}$.
+
+The detailed definition of the non-commutative probability space is given below:
+
+\begin{defn}
+    \label{defn:non-commutative_probability_space}
+    Non-commutative probability space:
+
+    A non-commutative probability space is a pair $(\mathscr{B}(\mathscr{H}),\mathscr{P})$, where $\mathscr{B}(\mathscr{H})$ is the set of all \textbf{bounded} linear operators on $\mathscr{H}$.
+
+    A linear operator $A$ on $\mathscr{H}$ is \textbf{bounded} if there exists some $M>0$ such that $\|Au\|\leq M$ for all $u$ with $\|u\|\leq 1$.
+
+    $\mathscr{P}$ is the set of all orthogonal projections on $\mathscr{H}$.
+
+    Explicitly, $\mathscr{P}=\{P\in\mathscr{B}(\mathscr{H}):P^*=P=P^2\}$.
+\end{defn}
+
+Recall that in classical probability theory, we call the initial probability distribution over the possible outcomes our \textit{state}; similarly, we need to define the \textit{state} in non-commutative probability theory.
+
+\begin{defn}
+    \label{defn:state}
+    Non-commutative probability state:
+
+    Given a non-commutative probability space $(\mathscr{B}(\mathscr{H}),\mathscr{P})$,
+
+    A state is a unit vector $\ket{\psi}$ in the Hilbert space $\mathscr{H}$, that is, $\bra{\psi}\ket{\psi}=1$.
+    
+    Every state uniquely defines a map $\rho:\mathscr{P}\to[0,1]$, $\rho(P)=\bra{\psi}P\ket{\psi}$ (commonly named as density operator) such that:
+    \begin{itemize}
+        \item $\rho(O)=0$, where $O$ is the zero projection, and $\rho(I)=1$, where $I$ is the identity projection.
+        \item If $P_1,P_2,\ldots,P_n$ are pairwise orthogonal projections (that is, $P_iP_j=O$ for $i\neq j$), then $\rho(P_1 + P_2 + \cdots + P_n) = \sum_{i=1}^n \rho(P_i)$.
+    \end{itemize}
+\end{defn}
+
+Note that the pure states are the density operators that can be represented by a unit vector $\ket{\psi}$ in the Hilbert space $\mathscr{H}$ (as $\rho=\ket{\psi}\bra{\psi}$), whereas mixed states are the density operators that cannot be represented by a unit vector in the Hilbert space $\mathscr{H}$.
+
+If $(|\psi_1\rangle,|\psi_2\rangle,\cdots,|\psi_n\rangle)$ is an orthonormal basis of $\mathscr{H}$ consisting of eigenvectors of $\rho$, for the eigenvalues $p_1,p_2,\cdots,p_n$, then $p_j\geq 0$ and $\sum_{j=1}^n p_j=1$.
+
+We can write $\rho$ as
+$$
+    \rho=\sum_{j=1}^n p_j|\psi_j\rangle\langle\psi_j|
+$$
+(Under basis $|\psi_j\rangle$, it is a diagonal matrix with $p_j$ on the diagonal.)
+
+% Then we need to introduce a theorem that ensures that every state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.
+
+% \begin{theorem}
+% 	\label{theorem:Gleason's_theorem}
+% 	Gleason's theorem (Theorem 1.1.15 in~\cite{parthasarathy2005mathematical})
+
+%     Let $\mathscr{H}$ be a Hilbert space over $\mathbb{C}$ or $\mathbb{R}$ of dimension $n\geq 3$. Let $\mu$ be a state on the space $\mathscr{P}$ of projections on $\mathscr{H}$. Then there exists a unique density operator $\rho$ such that
+%     $$
+%     \mu(P)=\operatorname{Tr}(\rho P)
+%     $$
+%     for all $P\in\mathscr{P}$. $\mathscr{P}$ is the space of all orthogonal projections on $\mathscr{H}$.
+% \end{theorem}
+
+% This proof came from~\cite{parthasarathy2005mathematical}.
+
+% \begin{proof}
+% % TODO: FILL IN THE PROOF
+% \end{proof}
+
+% This theorem is a very important theorem in non-commutative probability theory; it states that any state on the space of all orthogonal projections on $\mathscr{H}$ can be represented by a density operator.
+
+The counterpart of the random variable in the non-commutative probability theory is called an observable, which is a Hermitian operator $A$ on $\mathscr{H}$ (for all $\psi,\phi$ in the domain of $A$, we have $\langle A\psi,\phi\rangle=\langle\psi,A\phi\rangle$. This kind of operator has real eigenvalues, which ensures that the measurement outcomes and their expected values are real numbers).
+
+\begin{defn}
+    \label{defn:observable}
+    Observable:
+
+    Let $\mathcal{B}(\mathbb{R})$ be the set of all Borel sets on $\mathbb{R}$.
+
+    A (real-valued) observable (random variable) on the Hilbert space $\mathscr{H}$, denoted by $A$, is a projection-valued map (measure) $P_A:\mathcal{B}(\mathbb{R})\to\mathscr{P}(\mathscr{H})$.
+
+    It satisfies the following properties:
+    \begin{itemize}
+        \item $P_A(\emptyset)=O$ (the zero projection)
+        \item $P_A(\mathbb{R})=I$ (the identity projection)
+        \item For any sequence $A_1,A_2,\cdots,A_n\in \mathcal{B}(\mathbb{R})$, the following holds:
+              \begin{itemize}
+                  \item $P_A(\bigcup_{i=1}^n A_i)=\bigvee_{i=1}^n P_A(A_i)$
+                  \item $P_A(\bigcap_{i=1}^n A_i)=\bigwedge_{i=1}^n P_A(A_i)$
+                  \item $P_A(A^c)=I-P_A(A),\forall A\in\mathcal{B}(\mathbb{R})$
+              \end{itemize}
+    \end{itemize}
+\end{defn}
+
+If $A$ is an observable determined by the map $P_A:\mathcal{B}(\mathbb{R})\to\mathscr{P}(\mathscr{H})$, then $P_A$ is a spectral measure (a completely additive orthogonal-projection-valued measure on $\mathcal{B}(\mathbb{R})$). And every spectral measure can be represented by an observable~\cite{parthasarathy2005mathematical}.
+
+\begin{prop}
+    If $A_j$ are mutually disjoint (that is $P_A(A_i)P_A(A_j)=P_A(A_j)P_A(A_i)=O$ for $i\neq j$), then $P_A(\bigcup_{j=1}^n A_j)=\sum_{j=1}^n P_A(A_j)$
+\end{prop}
+
+\begin{defn}
+    \label{defn:probability_of_random_variable}
+    Probability of a random variable:
+
+    Let $A$ be a real-valued observable on a Hilbert space $\mathscr{H}$, and let $\rho$ be a state. The probability of observing the outcome $E\in \mathcal{B}(\mathbb{R})$ is given by:
+
+    $$
+    \mu(E)=\operatorname{Tr}(\rho P_A(E))
+    $$
+\end{defn}
+
+Restriction of a quantum state to a commutative subalgebra defines an ordinary probability measure.
+
+\begin{examples}
+Let
+$$
+Z=\begin{pmatrix}
+1 & 0\\
+0 & -1
+\end{pmatrix}.
+$$
+
+The eigenvalues of $Z$ are $+1$ and $-1$, with corresponding normalized eigenvectors
+
+$$
+\ket{0}=\begin{pmatrix}1\\0\end{pmatrix},
+\qquad
+\ket{1}=\begin{pmatrix}0\\1\end{pmatrix}.
+$$
+
+The spectral projections are
+$$
+P_Z(\{1\}) = \ket{0}\bra{0}
+=
+\begin{pmatrix}
+1 & 0\\
+0 & 0
+\end{pmatrix},
+\qquad
+P_Z(\{-1\}) =  \ket{1}\bra{1}
+=
+\begin{pmatrix}
+0 & 0\\
+0 & 1
+\end{pmatrix}.
+$$
+
+The associated projection-valued measure $P_Z$ satisfies
+$$
+P_Z(\{1,-1\}) = I,
+\qquad
+P_Z(\emptyset)=0.
+$$
+
+%==============================
+% 4. Example: X measurement and its PVM
+%==============================
+
+Let
+$$
+X=\begin{pmatrix}
+0 & 1\\
+1 & 0
+\end{pmatrix}.
+$$
+
+The normalized eigenvectors of $X$ are
+$$
+\ket{+}=\frac{1}{\sqrt{2}}\left(\ket{0}+\ket{1}\right),
+\qquad
+\ket{-}=\frac{1}{\sqrt{2}}\left(\ket{0}-\ket{1}\right),
+$$
+with eigenvalues $+1$ and $-1$, respectively.
+
+The corresponding spectral projections are
+$$
+P_X(\{1\}) = \ket{+}\bra{+}
+=
+\frac{1}{2}
+\begin{pmatrix}
+1 & 1\\
+1 & 1
+\end{pmatrix},
+$$
+$$
+P_X(\{-1\}) = \ket{-}\bra{-}
+=
+\frac{1}{2}
+\begin{pmatrix}
+1 & -1\\
+-1 & 1
+\end{pmatrix}.
+$$
+
+%==============================
+% 5. Noncommutativity of the projections
+%==============================
+
+Compute
+$$
+P_Z(\{1\})P_X(\{1\})
+=
+\begin{pmatrix}
+1 & 0\\
+0 & 0
+\end{pmatrix}
+\cdot
+\frac{1}{2}
+\begin{pmatrix}
+1 & 1\\
+1 & 1
+\end{pmatrix}
+=
+\frac{1}{2}
+\begin{pmatrix}
+1 & 1\\
+0 & 0
+\end{pmatrix}.
+$$
+
+On the other hand,
+$$
+P_X(\{1\})P_Z(\{1\})
+=
+\frac{1}{2}
+\begin{pmatrix}
+1 & 1\\
+1 & 1
+\end{pmatrix}
+\cdot
+\begin{pmatrix}
+1 & 0\\
+0 & 0
+\end{pmatrix}
+=
+\frac{1}{2}
+\begin{pmatrix}
+1 & 0\\
+1 & 0
+\end{pmatrix}.
+$$
+
+Since
+$$
+P_Z(\{1\})P_X(\{1\}) \neq P_X(\{1\})P_Z(\{1\}),
+$$
+the projections do not commute.
+
+Let $\rho$ be a density operator on $\mathbb C^2$, i.e.
+$$
+\rho \ge 0,
+\qquad
+\operatorname{Tr}(\rho)=1.
+$$
+
+For a pure state $\ket{\psi}$, one has
+$$
+\rho = \ket{\psi}\bra{\psi}.
+$$
+
+The probability that a measurement associated with a PVM $P$ yields an outcome in a Borel set $A\in \mathcal{B}(\mathbb{R})$ is
+$$
+\mathbb P(A) = \operatorname{Tr}(\rho\, P(A)).
+$$
+
+For example, let
+$$
+\rho = \ket{0}\bra{0}
+=
+\begin{pmatrix}
+1 & 0\\
+0 & 0
+\end{pmatrix}.
+$$
+
+Then
+$$
+\operatorname{Tr}\bigl(\rho\, P_Z(\{1\})\bigr) = 1,
+\qquad
+\operatorname{Tr}\bigl(\rho\, P_X(\{1\})\bigr) = \frac{1}{2}.
+$$
+
+\end{examples}
+
+\begin{defn}
+    \label{defn:measurement}
+    Definition of measurement:
+
+    A measurement (observation) of a system prepared in a given state produces an outcome $x$, where $x$ is a physical event (a subset of the set of all possible outcomes). We write $X$ for the collection of outcomes under consideration. For each $x\in X$, we associate a measurement operator $M_x$ on $\mathscr{H}$.
+
+    Given the initial state (pure state, unit vector) $u$, the probability of measurement outcome $x$ is given by:
+    $$
+        p(x)=\|M_xu\|^2
+    $$
+
+    Note that to make sense of this definition, the collection of measurement operators $\{M_x\}$ must satisfy the completeness requirement:
+    $$
+        1=\sum_{x\in X} p(x)=\sum_{x\in X}\|M_xu\|^2=\sum_{x\in X}\langle M_xu,M_xu\rangle=\langle u,(\sum_{x\in X}M_x^*M_x)u\rangle
+    $$
+    Since this must hold for every unit vector $u$, we conclude that $\sum_{x\in X}M_x^*M_x=I$.
+
+\end{defn}
+
+
+Here is Table~\ref{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory} summarizing the analog of classical probability theory and non-commutative (\textit{quantum}) probability theory~\cite{Feres}:
+
+\begin{table}[H]
+    \centering
+    \renewcommand{\arraystretch}{1.5}
+    \caption{Analog of classical probability theory and non-commutative (\textit{quantum}) probability theory}
+    \label{tab:analog_of_classical_probability_theory_and_non_commutative_probability_theory}
+    {\small
+        \begin{tabular}{|p{0.5\linewidth}|p{0.5\linewidth}|}
+            \hline
+            \textbf{Classical probability}                                                                                                                      & \textbf{Non-commutative probability}                                                                                                                      \\
+            \hline
+            Sample space $\Omega$, cardinality $\vert\Omega\vert=n$, example: $\Omega=\{0,1\}$                                                                  & Complex Hilbert space $\mathscr{H}$, dimension $\dim\mathscr{H}=n$, example: $\mathscr{H}=\mathbb{C}^2$                                                   \\
+            \hline
+            Common algebra of $\mathbb{C}$ valued functions                                                                                                     & Algebra of bounded operators $\mathcal{B}(\mathscr{H})$                                                                                                   \\
+            \hline
+            $f\mapsto \bar{f}$ complex conjugation                                                                                                              & $P\mapsto P^*$ adjoint                                                                                                                                    \\
+            \hline
+            Events: indicator functions of sets                                                                                                                 & Projections: space of orthogonal projections $\mathscr{P}\subseteq\mathscr{B}(\mathscr{H})$                                                               \\
+            \hline
+            functions $f$ such that $f^2=f=\overline{f}$                                                                                                        & orthogonal projections $P$ such that $P^*=P=P^2$                                                                                                          \\
+            \hline
+            $\mathbb{R}$-valued functions $f=\overline{f}$                                                                                                      & self-adjoint operators $A=A^*$                                                                                                                            \\
+            \hline
+            $\mathbb{I}_{f^{-1}(\{\lambda\})}$ is the indicator function of the set $f^{-1}(\{\lambda\})$                                                       & $P(\lambda)$ is the orthogonal projection to eigenspace                                                                                                   \\
+            \hline
+            $f=\sum_{\lambda\in \operatorname{Range}(f)}\lambda \mathbb{I}_{f^{-1}(\{\lambda\})}$                                                               & $A=\sum_{\lambda\in \operatorname{sp}(A)}\lambda P(\lambda)$                                                                                              \\
+            \hline
+            Probability measure $\mu$ on $\Omega$                                                                                                               & Density operator $\rho$ on $\mathscr{H}$                                                                                                                  \\
+            \hline
+            Delta measure $\delta_\omega$                                                                                                                       & Pure state $\rho=\vert\psi\rangle\langle\psi\vert$                                                                                                        \\
+            \hline
+            $\mu$ is non-negative measure and $\sum_{i=1}^n\mu(\{i\})=1$                                                                                        & $\rho$ is positive semi-definite and $\operatorname{Tr}(\rho)=1$                                                                                          \\
+            \hline
+            Expected value of random variable $f$ is $\mathbb{E}_{\mu}(f)=\sum_{i=1}^n f(i)\mu(\{i\})$                                                          & Expected value of operator $A$ is $\mathbb{E}_\rho(A)=\operatorname{Tr}(\rho A)$                                                                          \\
+            \hline
+            Variance of random variable $f$ is $\operatorname{Var}_\mu(f)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))^2\mu(\{i\})$                                    & Variance of operator $A$ is $\operatorname{Var}_\rho(A)=\operatorname{Tr}(\rho A^2)-\operatorname{Tr}(\rho A)^2$                                          \\
+            \hline
+            Covariance of random variables $f$ and $g$ is $\operatorname{Cov}_\mu(f,g)=\sum_{i=1}^n (f(i)-\mathbb{E}_\mu(f))(g(i)-\mathbb{E}_\mu(g))\mu(\{i\})$ & Covariance of operators $A$ and $B$ is $\operatorname{Cov}_\rho(A,B)=\operatorname{Tr}(\rho A\circ B)-\operatorname{Tr}(\rho A)\operatorname{Tr}(\rho B)$ \\
+            \hline
+            Composite system is given by Cartesian product of the sample spaces $\Omega_1\times\Omega_2$                                                        & Composite system is given by tensor product of the Hilbert spaces $\mathscr{H}_1\otimes\mathscr{H}_2$                                                     \\
+            \hline
+            Product measure $\mu_1\times\mu_2$ on $\Omega_1\times\Omega_2$                                                                                      & Tensor product of states $\rho_1\otimes\rho_2$ on $\mathscr{H}_1\otimes\mathscr{H}_2$                                                                     \\
+            \hline
+            Marginal distribution $\pi_*\mu$                                                                                                                    & Partial trace $\operatorname{Tr}_2(\rho)$                                                                                                                 \\
+            \hline
+        \end{tabular}
+    }
+    \vspace{0.5cm}
+\end{table}
+
+\section{Manifolds}
+
+In this section, we will introduce some basic definitions and theorems used in manifold theory that are relevant to our study. We assume no prior knowledge of manifold theory, only a basic understanding of topology. We will provide brief definitions and explanations for each term, from the most abstract manifold definition to Riemannian manifolds and related theorems.
+
+\subsection{Manifolds}
+
+\begin{defn}
+    \label{defn:m-manifold}
+
+An $m$-manifold is a topological space $X$ that is
+
+\begin{enumerate}
+    \item Hausdorff: every two distinct points in $X$ can be separated by two disjoint open sets.
+    \item Second countable: $X$ has a countable basis.
+    \item Locally Euclidean of dimension $m$: every point $p\in X$ has an open neighborhood $U\ni p$ that is homeomorphic to an open subset of $\mathbb{R}^m$.
+\end{enumerate}
+\end{defn}
+
+
+\begin{examples}
+    \label{example:second_countable_space}
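+    Let $X=\mathbb{R}$ and $\mathcal{B}=\{(a,b)|a,b\in \mathbb{Q},a<b\}$ (collection of all open intervals with rational endpoints).
+
+    Since the rational numbers are countable, $\mathcal{B}$ is countable, and every open subset of $\mathbb{R}$ is a union of elements of $\mathcal{B}$, so $\mathcal{B}$ is a basis for the topology.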
+
+    So $\mathbb{R}$ is second countable.
+
+    Likewise, $\mathbb{R}^n$ is also second countable.
+\end{examples}
+
+\begin{examples}
+    \label{example:manifold}
+A 1-manifold is a curve and a 2-manifold is a surface.
+\end{examples}
+
+\begin{theorem}
+    \label{Theorem of imbedded space}
+    If $X$ is a compact $m$-manifold, then $X$ can be imbedded in $\mathbb{R}^n$ for some $n$.
+\end{theorem}
+
+This theorem might save you from having to imagine abstract structures: every compact manifold can be viewed inside some real Euclidean space. Good news, at least you stay in some real numbers.
+
+\subsection{Smooth manifolds and Lie groups}
+
+This section is adapted from~\cite{lee_introduction_2012}.
+
+\begin{defn}
+    \label{defn:partial_derivative}
+
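+    Let $U\subseteq \mathbb{R}^n$ be an open subset and $f:U\to \mathbb{R}^n$ be a map.
+
+    For any $a=(a_1,\cdots,a_n)\in U$, $j\in \{1,\cdots,n\}$, the $j$-th partial derivative of $f$ at $a$ is defined (whenever the limit exists, with $e_j$ denoting the $j$-th standard basis vector) as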
+
+    $$
+    \begin{aligned}
+    \frac{\partial f}{\partial x_j}(a)&=\lim_{h\to 0}\frac{f(a_1,\cdots,a_j+h,\cdots,a_n)-f(a_1,\cdots,a_j,\cdots,a_n)}{h} \\
+    &=\lim_{h\to 0}\frac{f(a+he_j)-f(a)}{h}
+    \end{aligned}
+$$
+
+\end{defn}
+
+\begin{defn}
+    \label{defn:continuously_differentiable_map}
+    Let $U\subseteq \mathbb{R}^n$ and $f:U\to \mathbb{R}^n$ be a map.
+
+    If for any $j\in \{1,\cdots,n\}$, the $j$-th partial derivative of $f$ is continuous at $a$, then $f$ is continuously differentiable at $a$.
+
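+    If, for every $a\in U$ and every $j\in \{1,\cdots,n\}$, $\frac{\partial f}{\partial x_j}$ exists and is continuous at $a$, then $f$ is said to be continuously differentiable on $U$, or a $C^1$ map. More generally, $f$ is of class $C^k$ if all of its partial derivatives of order at most $k$ exist and are continuous on $U$. (Note that a $C^0$ map is just a continuous map.)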
+\end{defn} 
+
+
+\begin{defn}
+    \label{defn:smooth_map}
+    A function $f:U\to \mathbb{R}^n$ is smooth if it is of class $C^k$ for every $k\geq 0$ on $U$. Such a function is called a diffeomorphism (onto its image) if $f(U)$ is open, $f:U\to f(U)$ is a \textbf{\texttt{bijection}}, and its \textbf{\texttt{inverse is also smooth}}.
+\end{defn}
+
+
+\begin{defn}
+    \label{defn:chart}
+
+    Let $M$ be a topological $n$-manifold. A \textbf{\texttt{chart}} is a pair $(U,\varphi)$ where $U\subseteq M$ is an open subset and $\varphi:U\to \hat{U}\subseteq \mathbb{R}^n$ is a homeomorphism onto an open subset $\hat{U}$ (a continuous bijection whose inverse is also continuous).
+
+    If $p\in U$ and $\varphi(p)=0$, then we say that $p$ is the origin of the chart $(U,\varphi)$.
+
+    For $p\in U$, we note that the continuous function $\varphi(p)=(x_1(p),\cdots,x_n(p))$ gives a vector in $\mathbb{R}^n$. The $(x_1(p),\cdots,x_n(p))$ is called the \textbf{\texttt{local coordinates}} of $p$ in the chart $(U,\varphi)$.
+
+\end{defn}
+
+\begin{defn}
+    \label{defn:atlas}
+    Let $M$ be a topological manifold. An \textbf{\texttt{atlas}} is a collection of charts $\mathcal{A}=\{(U_\alpha,\phi_\alpha)\}_{\alpha\in I}$ such that $M=\bigcup_{\alpha\in I} U_\alpha$.
+
+    An atlas is said to be \textbf{\texttt{smooth}} if the transition maps $\phi_\alpha\circ \phi_\beta^{-1}:\phi_\beta(U_\alpha\cap U_\beta)\to \phi_\alpha(U_\alpha\cap U_\beta)$ are smooth for all $\alpha, \beta\in I$.
+\end{defn}
+
+
+\begin{defn}
+    \label{defn:smooth_manifold}
+    A smooth manifold is a pair $(M,\mathcal{A})$ where $M$ is a topological manifold and $\mathcal{A}$ is a smooth atlas.
+\end{defn}
+
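+\begin{defn}
+  \label{defn:differential}
+  Let $M$ and $N$ be smooth manifolds and let $F:M\to N$ be a smooth map. For each $p\in M$, the \textbf{\texttt{differential}} of $F$ at $p$ is the linear map $dF_p:T_pM\to T_{F(p)}N$ defined by $dF_p(v)(f)=v(f\circ F)$ for every $v\in T_pM$ and every $f\in C^\infty(N)$.
+\end{defn}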
+
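+\begin{defn}
+  \label{defn:smooth-submersion}
+  A smooth map $F:M\to N$ between smooth manifolds is called a \textbf{\texttt{smooth submersion}} if its differential $dF_p:T_pM\to T_{F(p)}N$ is surjective at every point $p\in M$.
+\end{defn}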
+
+Here are some additional propositions that will be helpful for our study in later sections:
+
+This one is from \cite{lee_introduction_2012} Theorem 4.26
+
+\begin{theorem}
+    \label{theorem:local_section_theorem}
+
+    Let $M$ and $N$ be smooth manifolds and let $\pi:M\to N$ be a smooth map. Then $\pi$ is a smooth submersion if and only if every point of $M$ is in the image of a smooth local section of $\pi$ (a local section of $\pi$ is a continuous map $\sigma:U\to M$ defined on some open subset $U\subseteq N$ with $\pi\circ \sigma=\mathrm{Id}_U$).
+\end{theorem}
+
+\subsection{Riemannian manifolds}
+
+\begin{defn}
+  \label{defn:riemannian-metric}
+
+  Let $M$ be a smooth manifold. A \textit{\textbf{Riemannian metric}} on $M$ is a smooth covariant 2-tensor field $g\in \mathcal{T}^2(M)$ such that for each $p\in M$, $g_p$ is an inner product on $T_pM$.
+
+  That is, $g_p$ is symmetric and bilinear, and $g_p(v,v)\geq 0$ for each $p\in M$ and each $v\in T_pM$, with equality if and only if $v=0$.
+
+\end{defn}
+
+\begin{defn}
+  \label{defn:riemannian-submersion}
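+  Suppose $(\tilde{M},\tilde{g})$ and $(M,g)$ are smooth Riemannian manifolds, and $\pi:\tilde{M}\to M$ is a smooth submersion. For each $x\in \tilde{M}$, let $H_x:=(\ker d\pi_x)^{\perp}\subseteq T_x\tilde{M}$ denote the horizontal space, i.e.\ the $\tilde{g}_x$-orthogonal complement of the kernel of the differential. Then $\pi$ is said to be a \textit{\textbf{Riemannian submersion}} if for each $x\in \tilde{M}$, the differential $d\pi_x:T_x\tilde{M}\to T_{\pi(x)}M$ restricts to a linear isometry from $H_x$ onto $T_{\pi(x)}M$.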
+
+  In other words, $\tilde{g}_x(v,w)=g_{\pi(x)}(d\pi_x(v),d\pi_x(w))$ whenever $v,w\in H_x$.
+\end{defn}
+
+\begin{theorem}
+  \label{theorem:riemannian-submersion}
+
+  Let $(\tilde{M},\tilde{g})$ be a Riemannian manifold, let $\pi:\tilde{M}\to M$ be a surjective smooth submersion, and let $G$ be a group acting on $\tilde{M}$. If the \textbf{action} is
+  \begin{enumerate}
+    \item isometric: the map $x\mapsto \varphi\cdot x$ is an isometry for each $\varphi\in G$.
+    \item vertical: every element $\varphi\in G$ takes each fiber to itself, that is $\pi(\varphi\cdot p)=\pi(p)$ for all $p\in \tilde{M}$.
+    \item transitive on fibers: for each $p,q\in \tilde{M}$ such that $\pi(p)=\pi(q)$, there exists $\varphi\in G$ such that $\varphi\cdot p = q$.
+  \end{enumerate}
+  Then there is a unique Riemannian metric on $M$ such that $\pi$ is a Riemannian submersion.
+
+\end{theorem}
+
+\begin{proof}
+For each $p\in \tilde{M}$, let
+$$
+V_p:=\ker(d\pi_p)\subseteq T_p\tilde{M}
+$$
+be the vertical space, and let
+$$
+H_p:=V_p^{\perp_{\tilde g}}
+$$
+be its $\tilde g$-orthogonal complement. Since $\pi$ is a surjective smooth submersion, each $d\pi_p:T_p\tilde M\to T_{\pi(p)}M$ is surjective, so
+$$
+T_p\tilde M = V_p\oplus H_p,
+$$
+and therefore the restriction
+$$
+d\pi_p|_{H_p}:H_p\to T_{\pi(p)}M
+$$
+is a linear isomorphism.
+
+We first show that the group action preserves the horizontal distribution. Fix $\varphi\in G$. Since the action is vertical, we have
+$$
+\pi(\varphi\cdot x)=\pi(x)\qquad\text{for all }x\in \tilde M.
+$$
+Differentiating at $p$ gives
+$$
+d\pi_{\varphi\cdot p}\circ d\varphi_p = d\pi_p.
+$$
+Hence if $v\in V_p=\ker(d\pi_p)$, then
+$$
+d\pi_{\varphi\cdot p}(d\varphi_p v)=d\pi_p(v)=0,
+$$
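+so $d\varphi_p(V_p)\subseteq V_{\varphi\cdot p}$. Since $\varphi$ acts isometrically, $d\varphi_p$ is a linear isometry; in particular it is injective, and because $\dim V_p=\dim V_{\varphi\cdot p}=\dim\tilde M-\dim M$, we get $d\varphi_p(V_p)=V_{\varphi\cdot p}$. A linear isometry preserves orthogonal complements. Therefore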
+$$
+d\varphi_p(H_p)=H_{\varphi\cdot p}.
+$$
+
+We now define a metric on $M$. Let $m\in M$, and choose any $p\in \pi^{-1}(m)$. For $u,v\in T_mM$, let $\tilde u,\tilde v\in H_p$ be the unique horizontal lifts satisfying
+$$
+d\pi_p(\tilde u)=u,\qquad d\pi_p(\tilde v)=v.
+$$
+Define
+$$
+g_m(u,v):=\tilde g_p(\tilde u,\tilde v).
+$$
+This is a symmetric bilinear form on $T_mM$, and it is positive definite because $\tilde g_p$ is positive definite on $H_p$ and $d\pi_p|_{H_p}$ is an isomorphism.
+
+It remains to show that this definition is independent of the choice of $p$ in the fiber. Suppose $p,q\in \pi^{-1}(m)$. By transitivity of the action on fibers, there exists $\varphi\in G$ such that $\varphi\cdot p=q$. Let $\tilde u_p,\tilde v_p\in H_p$ be the horizontal lifts of $u,v$ at $p$, and define
+$$
+\tilde u_q:=d\varphi_p(\tilde u_p),\qquad \tilde v_q:=d\varphi_p(\tilde v_p).
+$$
+By the previous paragraph, $\tilde u_q,\tilde v_q\in H_q$. Moreover,
+$$
+d\pi_q(\tilde u_q)
+=
+d\pi_q(d\varphi_p\tilde u_p)
+=
+d\pi_p(\tilde u_p)
+=
+u,
+$$
+and similarly $d\pi_q(\tilde v_q)=v$. Thus $\tilde u_q,\tilde v_q$ are exactly the horizontal lifts of $u,v$ at $q$. Since $\varphi$ is an isometry,
+$$
+\tilde g_q(\tilde u_q,\tilde v_q)
+=
+\tilde g_q(d\varphi_p\tilde u_p,d\varphi_p\tilde v_p)
+=
+\tilde g_p(\tilde u_p,\tilde v_p).
+$$
+Therefore $g_m(u,v)$ is independent of the chosen point $p\in \pi^{-1}(m)$, so $g$ is well defined on $M$.
+
+Next we prove that $g$ is smooth. Let $m_0\in M$. Since $\pi$ is a smooth submersion, there exists an open neighborhood $U\subseteq M$ of $m_0$ and a smooth local section
+$$
+s:U\to \tilde M
+\qquad\text{such that}\qquad
+\pi\circ s=\mathrm{id}_U.
+$$
+Over $s(U)$, the vertical bundle $V=\ker d\pi$ is a smooth subbundle of $T\tilde M$, and hence so is its orthogonal complement $H=V^\perp$. For each $x\in U$, the restriction
+$$
+d\pi_{s(x)}|_{H_{s(x)}}:H_{s(x)}\to T_xM
+$$
+is a linear isomorphism, and these isomorphisms depend smoothly on $x$. Thus they define a smooth vector bundle isomorphism
+$$
+d\pi|_H:H|_{s(U)}\to TU,
+$$
+whose inverse is also smooth.
+
+If $X,Y$ are smooth vector fields on $U$, define their horizontal lifts along $s$ by
+$$
+X_x^H:=\bigl(d\pi_{s(x)}|_{H_{s(x)}}\bigr)^{-1}(X_x),
+\qquad
+Y_x^H:=\bigl(d\pi_{s(x)}|_{H_{s(x)}}\bigr)^{-1}(Y_x).
+$$
+Then $X^H$ and $Y^H$ are smooth vector fields along $s(U)$, and by construction,
+$$
+g(X,Y)(x)=\tilde g_{s(x)}(X_x^H,Y_x^H).
+$$
+Since the right-hand side depends smoothly on $x$, it follows that $g$ is a smooth Riemannian metric on $M$.
+
+By construction, for every $p\in \tilde M$ and every $\tilde u,\tilde v\in H_p$,
+$$
+g_{\pi(p)}(d\pi_p\tilde u,d\pi_p\tilde v)=\tilde g_p(\tilde u,\tilde v).
+$$
+Thus $d\pi_p:H_p\to T_{\pi(p)}M$ is an isometry for every $p$, so $\pi:(\tilde M,\tilde g)\to (M,g)$ is a Riemannian submersion.
+
+Finally, uniqueness is immediate. Indeed, if $g'$ is another Riemannian metric on $M$ such that $\pi:(\tilde M,\tilde g)\to (M,g')$ is a Riemannian submersion, then for any $m\in M$, any $p\in \pi^{-1}(m)$, and any $u,v\in T_mM$, letting $\tilde u,\tilde v\in H_p$ denote the horizontal lifts of $u,v$, we must have
+$$
+g'_m(u,v)=\tilde g_p(\tilde u,\tilde v)=g_m(u,v).
+$$
+Hence $g'=g$.
+
+Therefore there exists a unique Riemannian metric on $M$ such that $\pi$ is a Riemannian submersion.
+\end{proof}
+
+\section{Quantum physics and terminologies}
+
+In this section, we will introduce some terminologies and theorems used in quantum physics that are relevant to our study. Assuming no prior knowledge of quantum physics, we will provide brief definitions and explanations for each term.
+
+One might ask, what is the fundamental difference between a quantum system and a classical system, and why can we not directly apply those theorems in classical computers to a quantum computer? It turns out that quantum error-correcting codes are hard due to the following definitions and features for quantum computing.
+
+\begin{defn}
+ All quantum operations can be constructed by composing four kinds of transformations: (adapted from Chapter 10 of \cite{Bengtsson_Zyczkowski_2017})
+
+  \begin{enumerate}
+    \item Unitary operations. $U(\cdot)$ for any quantum state. It is possible to apply a non-unitary operation for an open quantum system, but that is usually not the focus for quantum computing and usually leads to non-recoverable loss of information that we wish to obtain.
+    \item Extend the system. Given a quantum state $\rho\in\mathcal{H}^N$, we can extend it to a larger quantum system by ``entangling'' it (for this report, you don't need to worry about how quantum entanglement works) with some new state $\sigma\in \mathcal{H}^K$ (the space where the new state dwells is usually called the ancilla system) and get $\rho'=\rho\otimes\sigma\in \mathcal{H}^N\otimes \mathcal{H}^K$.
+    \item Partial trace. Given a quantum state $\rho\in\mathcal{H}^N$ and some reference state $\sigma\in\mathcal {H}^K$, we can trace out some subsystems and get a new state $\rho'\in\mathcal{H}^{N-K}$.
+    \item Selective measurement. Given a quantum state, we measure it and get a classical bit; unlike the classical case, the measurement is a probabilistic operation. (More specifically, this is some projection to a reference state corresponding to a classical bit output. For this report, you don't need to worry about how such a result is obtained and how the reference state is constructed.)
+  \end{enumerate}
+\end{defn}
+
+
+$U(n)$ is the group of all $n\times n$ \textbf{unitary matrices} over $\mathbb{C}$, 
+
+$$
+U(n)=\{A\in \mathbb{C}^{n\times n}: A^*A=AA^*=I_n\}
+$$
+
+The uniqueness of the invariant measure on $U(n)$ comes from the lemma below~\cite{Elizabeth_book}
+
+\begin{lemma}
+    \label{lemma:haar_measure}
+
+    Let $(U(n), \| \cdot \|, \mu)$ be a metric measure space where $\| \cdot \|$ is the Hilbert-Schmidt norm and $\mu$ is the measure function.
+
+    The Haar measure on $U(n)$ is the unique probability measure that is invariant under the action of $U(n)$ on itself.
+    
+    That is, for every $A\in U(n)$ and every Borel set $B\subseteq U(n)$, $\mu(A\cdot B)=\mu(B\cdot A)=\mu(B)$, where $A\cdot B=\{AX:X\in B\}$ and $B\cdot A=\{XA:X\in B\}$.
+    
+    The Haar measure is the unique probability measure that is invariant under the action of $U(n)$ on itself.
+\end{lemma}
+
+
+\begin{defn}
+    \label{defn:pure_state}
+    Pure state:
+
+A random pure state $\varphi$ is any random variable distributed according to the unitarily invariant probability measure on the pure states $\mathcal{P}(A)$ of the system $A$, denoted by $\varphi\in_R\mathcal{P}(A)$.
+\end{defn}
+
+For pure states, the unitarily invariant probability measure is easy to describe: the unit vectors representing pure states form a sphere $S^n$ for some $n$, and the uniform measure on this sphere, induced by the Haar measure, is unitarily invariant. For mixed states the situation is more complicated, and we need to use the partial trace to define the rank-$s$ random states.
+
+\begin{defn}
+    \label{defn:rank_s_random_state}
+    Rank-$s$ random state.
+
+    For a system $A$ and an integer $s\geq 1$, consider the distribution on the mixed states $\mathcal{S}(A)$ of $A$ induced by the partial trace over the second factor from the uniform distribution on pure states of $A\otimes\mathbb{C}^s$. Any random variable $\rho$ distributed as such will be called a rank-$s$ random state, denoted as $\rho\in_R \mathcal{S}_s(A)$. In particular, $\mathcal{P}(A)=\mathcal{S}_1(A)$.
+\end{defn}
+
+
+\begin{prop}
+    \label{prop:indistinguishability}
+    Proposition of indistinguishability:
+
+    Suppose that we have two states $u_1,u_2\in \mathscr{H}$. The two states are perfectly distinguishable if and only if they are orthogonal.
+\end{prop}
+
+\begin{proof}
+    Ways to distinguish the two states:
+    \begin{enumerate}
+        \item Set $X=\{0,1,2\}$ and $M_i=|u_i\rangle\langle u_i|$, $M_0=I-M_1-M_2$
+        \item Then $\{M_0,M_1,M_2\}$ is a complete collection of measurement operators on $\mathscr{H}$.
+        \item Suppose the prepared state is $u_1$, then $p(1)=\|M_1u_1\|^2=\|u_1\|^2=1$, $p(2)=\|M_2u_1\|^2=0$, $p(0)=\|M_0u_1\|^2=0$.
+    \end{enumerate}
+
+    If they are not orthogonal, then there is no choice of measurement operators to perfectly distinguish the two states.
+
+\end{proof}
+
+Intuitively, if the two states are not orthogonal, then for any measurement (projection) there exists non-zero probability of getting the same outcome for both states.
+
+\subsection{Random quantum states}
+
+First, we need to define what a random state in a bipartite system is.
+
+
+% When compiled standalone, print this chapter's references at the end.
+\ifSubfilesClassLoaded{
+    \printbibliography[title={References}]
+}
+
+\end{document}
diff --git a/chapters/chap1.pdf b/latex/chapters/chap1.pdf
similarity index 100%
rename from chapters/chap1.pdf
rename to latex/chapters/chap1.pdf
diff --git a/chapters/chap1.tex b/latex/chapters/chap1.tex
similarity index 97%
rename from chapters/chap1.tex
rename to latex/chapters/chap1.tex
index 752b566..b0f27c8 100644
--- a/chapters/chap1.tex
+++ b/latex/chapters/chap1.tex
@@ -1,603 +1,603 @@
-% chapters/chap1.tex
-\documentclass[../main.tex]{subfiles}
-
-% If this chapter is compiled *by itself*, we must load only its own .bib
-% and print its bibliography at the end of the chapter.
-\ifSubfilesClassLoaded{
-  \addbibresource{../main.bib}
-}
-
-\usepackage{amsmath, amsfonts, amsthm}
-\usepackage{fancyhdr,parskip}
-\usepackage{fullpage}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% add special notation supports
-\usepackage[mathscr]{euscript}
-\usepackage{mathtools}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% add image package and directory
-\usepackage{graphicx}
-\usepackage{tikz}
-\graphicspath{{../images/}}
-
-
-\begin{document}
-
-
-\chapter{Concentration of Measure And Quantum Entanglement}
-
-
-First, we will build the mathematical model describing the behavior of quantum system and why they makes sense for physicists and meaningful for general publics.
-
-\section{Motivation}
-
-First, we introduce a motivation for introducing non-commutative probability theory to the study of quantum mechanics. This section is mainly based on the book~\cite{kummer1998elements}.
-
-\subsection{Light polarization and the violation of Bell's inequality}
-
-The light which comes through a polarizer is polarized in a certain direction. If we fix the first filter and rotate the second filter, we will observe the intensity of the light will change.
-
-The light intensity decreases with $\alpha$ (the angle between the two filters). The light should vanish when $\alpha=\pi/2$.
-
-However, for a system of 3 polarizing filters $F_1,F_2,F_3$, having directions $\alpha_1,\alpha_2,\alpha_3$, if we put them on the optical bench in pairs, then we will have three random variables $P_1,P_2,P_3$.
-
-\begin{figure}[h]
-	\centering
-	\includegraphics[width=0.7\textwidth]{Filter_figure.png}
-	\caption{The light polarization experiment, image from \cite{kummer1998elements}}
-	\label{fig:Filter_figure}
-\end{figure}
-
-\begin{theorem}
-	\label{theorem:Bell's_3_variable_inequality}
-	Bell's 3 variable inequality:
-
-	For any three random variables $P_1,P_2,P_3$ in a classical probability space, we have
-
-	$$
-	\operatorname{Prob}(P_1=1,P_3=0)\leq \operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
-	$$
-\end{theorem}
-
-\begin{proof}
-	By the law of total probability there are only two possibility if we don't observe any light passing the filter pair $F_i,F_j$, it means the photon is either blocked by $F_i$ or $F_j$, it means
-
-    $$
-    \begin{aligned}
-    \operatorname{Prob}(P_1=1,P_3=0)&=\operatorname{Prob}(P_1=1,P_2=0,P_3=0)\\
-    &+\operatorname{Prob}(P_1=1,P_2=1,P_3=0)\\
-    &\leq\operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
-    \end{aligned}
-    $$
-\end{proof}
-
-However, according to our experimental measurement, for any pair of polarizers $F_i,F_j$, by the complement rule, we have
-$$
-\begin{aligned}
-\operatorname{Prob}(P_i=1,P_j=0)&=\operatorname{Prob}(P_i=1)-\operatorname{Prob}(P_i=1,P_j=1)\\
-&=\frac{1}{2}-\frac{1}{2}\cos^2(\alpha_i-\alpha_j)\\
-&=\frac{1}{2}\sin^2(\alpha_i-\alpha_j)
-\end{aligned}
-$$
-
-This leads to a contradiction if we apply the inequality to the experimental data.
-
-$$
-\frac{1}{2}\sin^2(\alpha_1-\alpha_3)\leq\frac{1}{2}\sin^2(\alpha_1-\alpha_2)+\frac{1}{2}\sin^2(\alpha_2-\alpha_3)
-$$
-
-If $\alpha_1=0,\alpha_2=\frac{\pi}{6},\alpha_3=\frac{\pi}{3}$, then
-
-$$
-\begin{aligned}
-\frac{1}{2}\sin^2(-\frac{\pi}{3})&\leq\frac{1}{2}\sin^2(-\frac{\pi}{6})+\frac{1}{2}\sin^2(\frac{\pi}{6}-\frac{\pi}{3})\\
-\frac{3}{8}&\leq\frac{1}{8}+\frac{1}{8}\\
-\frac{3}{8}&\leq\frac{1}{4}
-\end{aligned}
-$$
-
-Other revised experiments (e.g., Aspect's experiment, calcium entangled photon experiment) are also conducted and the inequality is still violated.
-
-\subsection{The true model of light polarization}
-    
-The full description of the light polarization is given below:
-
-State of polarization of a photon: $\psi=\alpha|0\rangle+\beta|1\rangle$, where $|0\rangle$ and $|1\rangle$ are the two orthogonal polarization states in $\mathbb{C}^2$.
-
-Polarization filter (generalized 0,1 valued random variable): orthogonal projection $P_\alpha$ on $\mathbb{C}^2$ corresponding to the direction $\alpha$ (operator satisfies $P_\alpha^*=P_\alpha=P_\alpha^2$).
-
-The matrix representation of $P_\alpha$ is given by
-
-$$
-P_\alpha=\begin{pmatrix}
-\cos^2(\alpha) & \cos(\alpha)\sin(\alpha)\\
-\cos(\alpha)\sin(\alpha) & \sin^2(\alpha)
-\end{pmatrix}
-$$
-
-Probability of a photon passing through the filter $P_\alpha$ is given by $\langle P_\alpha\psi,\psi\rangle$; this is $\cos^2(\alpha)$ if we set $\psi=|0\rangle$.
-
-Since the probability of a photon passing through the three filters is not commutative, it is impossible to discuss $\operatorname{Prob}(P_1=1,P_3=0)$ in the classical setting.
-
-We now show how the experimentally observed probability
-$$
-\frac{1}{2}\sin^2(\alpha_i-\alpha_j)
-$$
-arises from the operator model.
-
-Assume the incoming light is \emph{unpolarized}. It is therefore described by
-the density matrix
-$$
-\rho=\frac{1}{2} I .
-$$
-
-Let $P_{\alpha_i}$ and $P_{\alpha_j}$ be the orthogonal projections corresponding
-to the two polarization filters with angles $\alpha_i$ and $\alpha_j$.
-
-The probability that a photon passes the first filter $P_{\alpha_i}$ is given by the Born rule:
-
-$$
-\operatorname{Prob}(P_i=1)
-=\operatorname{tr}(\rho P_{\alpha_i})
-=\frac{1}{2} \operatorname{tr}(P_{\alpha_i})
-=\frac{1}{2}
-$$
-
-If the photon passes the first filter, the post-measurement state is given by the L\"uders rule:
-
-$$
-\rho \longmapsto
-\rho_i
-=\frac{P_{\alpha_i}\rho P_{\alpha_i}}{\operatorname{tr}(\rho P_{\alpha_i})}
-= P_{\alpha_i}.
-$$
-
-The probability that the photon then passes the second filter is
-
-$$
-\operatorname{Prob}(P_j=1 \mid P_i=1)
-=\operatorname{tr}(P_{\alpha_i} P_{\alpha_j})
-=\cos^2(\alpha_i-\alpha_j).
-$$
-
-Hence, the probability that the photon passes $P_{\alpha_i}$ and is then blocked by $P_{\alpha_j}$ is
-
-$$
-\begin{aligned}
-\operatorname{Prob}(P_i=1, P_j=0)
-&= \operatorname{Prob}(P_i=1)
-   - \operatorname{Prob}(P_i=1, P_j=1) \\
-&= \frac12 - \frac12 \cos^2(\alpha_i-\alpha_j) \\
-&= \frac12 \sin^2(\alpha_i-\alpha_j).
-\end{aligned}
-$$
-
-This agrees with the experimentally observed transmission probabilities, but it should be emphasized that this quantity corresponds to a \emph{sequential measurement} rather than a joint probability in the classical sense.
-
-\section{Concentration of measure phenomenon}
-
-\begin{defn}
-	$\eta$-Lipschitz function
-
-    Let $(X,\operatorname{dist}_X)$ and $(Y,\operatorname{dist}_Y)$ be two metric spaces. A function $f:X\to Y$ is said to be $\eta$-Lipschitz if there exists a constant $L\in \mathbb{R}$ such that
-    $$
-    \operatorname{dist}_Y(f(x),f(y))\leq L\operatorname{dist}_X(x,y)
-    $$
-    for all $x,y\in X$. And $\eta=\|f\|_{\operatorname{Lip}}=\inf_{L\in \mathbb{R}}L$.
-\end{defn}
-
-That basically means that the function $f$ should not change the distance between any two pairs of points in $X$ by more than a factor of $L$.
-
-This is a stronger condition than continuity, every Lipschitz function is continuous, but not every continuous function is Lipschitz.
-
-\begin{lemma}
-	\label{lemma:isoperimetric_inequality_on_sphere}
-	Isoperimetric inequality on the sphere:
-
-    Let $\sigma_n(A)$ denote the normalized area of $A$ on the $n$-dimensional sphere $S^n$. That is, $\sigma_n(A)\coloneqq\frac{\operatorname{Area}(A)}{\operatorname{Area}(S^n)}$.
-
-    Let $\epsilon>0$. Then for any subset $A\subset S^n$, given the area $\sigma_n(A)$, the spherical caps minimize the volume of the $\epsilon$-neighborhood of $A$.
-
-    Suppose $\sigma^n(\cdot)$ is the normalized volume measure on the sphere $S^n(1)$, then for any closed subset $\Omega\subset S^n(1)$, we take a metric ball $B_\Omega$ of $S^n(1)$ with $\sigma^n(B_\Omega)=\sigma^n(\Omega)$. Then we have
-
-    $$
-    \sigma^n(U_r(\Omega))\geq \sigma^n(U_r(B_\Omega))
-    $$
-
-    where $U_r(A)=\{x\in X:d(x,A)< r\}$
-\end{lemma}
-
-Intuitively, the lemma means that the spherical caps are the most efficient way to cover the sphere.
-
-Here, the efficiency is measured by the epsilon-neighborhood of the boundary of the spherical cap.
-
-To prove the lemma, we need to have a good understanding of the Riemannian geometry of the sphere. For now, let's just take the lemma for granted.
-
-\subsection{Levy's concentration theorem}
-
-\begin{theorem}
-	\label{theorem:Levy's_concentration_theorem}
-	Levy's concentration theorem:
-
-    An arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does.
-
-    That is,
-    $$
-    \mu\{x\in S^n: |f(x)-a_0|\geq\epsilon\} < \kappa_n(\epsilon)\leq 2\exp\left(-\frac{(n-1)\epsilon^2}{2}\right)
-    $$
-    where 
-    $$
-    \kappa_n(\epsilon)=\frac{\int_\epsilon^{\frac{\pi}{2}}\cos^{n-1}(t)dt}{\int_0^{\frac{\pi}{2}}\cos^{n-1}(t)dt}
-    $$
-    $a_0$ is the \textbf{Levy mean} of function $f$, that is, the level set $f^{-1}:\mathbb{R}\to S^n$ divides the sphere into equal halves, characterized by the following equality:
-    $$
-    \mu(f^{-1}(-\infty,a_0])\geq \frac{1}{2} \text{ and } \mu(f^{-1}[a_0,\infty))\geq \frac{1}{2}
-    $$
-\end{theorem}
-
-We will prove the theorem via the Maxwell-Boltzmann distribution law in this section for simplicity. ~\cite{shioya2014metricmeasuregeometry} The theorem will be discussed later in more general cases.
-
-\begin{defn}
-	\label{defn:Gaussian_measure}
-	Gaussian measure:
-
-    We denote the Gaussian measure on $\mathbb{R}^k$ as $\gamma^k$.
-
-    $$
-    d\gamma^k(x)\coloneqq\frac{1}{\sqrt{2\pi}^k}\exp(-\frac{1}{2}\|x\|^2)dx
-    $$
-    
-    $x\in \mathbb{R}^k$, $\|x\|^2=\sum_{i=1}^k x_i^2$ is the Euclidean norm, and $dx$ is the Lebesgue measure on $\mathbb{R}^k$.
-    
-\end{defn}
-
-Basically, you can consider the Gaussian measure as the normalized Lebesgue measure on $\mathbb{R}^k$ with standard deviation $1$.
-
-It also has another name, the Projective limit theorem.~\cite{romanvershyni}
-
-If $X\sim \operatorname{Unif}(S^n(\sqrt{n}))$, then for any fixed unit vector $x$ we have $\langle X,x\rangle\to N(0,1)$ in distribution as $n\to \infty$.
-
-\begin{figure}[h]
-    \centering
-    \includegraphics[width=0.8\textwidth]{../images/maxwell.png}
-    \caption{Maxwell-Boltzmann distribution law, image from \cite{romanvershyni}}
-    \label{fig:Maxwell-Boltzmann_distribution_law}
-\end{figure}
-
-\begin{lemma}
-	\label{lemma:Maxwell-Boltzmann_distribution_law}
-    Maxwell-Boltzmann distribution law:
-    
-    For any natural number $k$,
-    $$
-    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}\to \frac{d\gamma^k(x)}{dx}
-    $$
-    where $(\pi_{n,k})_*\sigma^n$ is the push-forward measure of $\sigma^n$ by $\pi_{n,k}$.
-    
-    In other words,
-    $$
-    (\pi_{n,k})_*\sigma^n\to \gamma^k\text{ weakly as }n\to \infty
-    $$
-\end{lemma}
-
-\begin{proof}
-    We denote the $n$-dimensional volume measure on $\mathbb{R}^k$ as $\operatorname{vol}_k$.
-    
-    Observe that $\pi_{n,k}^{-1}(x),x\in \mathbb{R}^k$ is isometric to $S^{n-k}(\sqrt{n-\|x\|^2})$, that is, for any $x\in \mathbb{R}^k$, $\pi_{n,k}^{-1}(x)$ is a sphere with radius $\sqrt{n-\|x\|^2}$ (by the definition of $\pi_{n,k}$).
-    
-    So,
-    $$
-    \begin{aligned}
-    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}&=\frac{\operatorname{vol}_{n-k}(\pi_{n,k}^{-1}(x))}{\operatorname{vol}_k(S^n(\sqrt{n}))}\\
-    &=\frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}\\
-    \end{aligned}
-    $$
-    as $n\to \infty$.
-    
-    Note that $\lim_{n\to \infty}(1-\frac{a}{n})^n=e^{-a}$ for any $a>0$.
-    
-    $(n-\|x\|^2)^{\frac{n-k}{2}}=\left(n(1-\frac{\|x\|^2}{n})\right)^{\frac{n-k}{2}}\to n^{\frac{n-k}{2}}\exp(-\frac{\|x\|^2}{2})$
-    
-    So
-    $$
-    \begin{aligned}
-    \frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}&=\frac{e^{-\frac{\|x\|^2}{2}}}{\int_{x\in \mathbb{R}^k}e^{-\frac{\|x\|^2}{2}}dx}\\
-    &=\frac{1}{(2\pi)^{\frac{k}{2}}}e^{-\frac{\|x\|^2}{2}}\\
-    &=\frac{d\gamma^k(x)}{dx}
-    \end{aligned}
-    $$
-\end{proof}
-
-Now we can prove Levy's concentration theorem, the proof is from~\cite{shioya2014metricmeasuregeometry}.
-
-\begin{proof}
-    Let $f_n:S^n(\sqrt{n})\to \mathbb{R}$, $n=1,2,\ldots$, be 1-Lipschitz functions.
-
-    Let $x$ and $x'$ be two given real numbers and $\gamma^1(-\infty,x]=\overline{\sigma}_\infty[-\infty,x']$, suppose $\sigma_\infty\{x'\}=0$, where $\{\sigma_i\}$ is a sequence of Borel probability measures on $\mathbb{R}$.
-
-    We want to show that, for all non-negative real numbers $\epsilon_1$ and $\epsilon_2$.
-
-    $$
-    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]\geq \gamma^1[x-\epsilon_1,x+\epsilon_2]
-    $$
-    
-    Consider the two spherical cap $\Omega_+\coloneq \{f_{n_i}\geq x'\}$ and $\Omega_-\coloneq \{f_{n_i}\leq x\}$. Note that $\Omega_+\cup \Omega_-=S^{n_i}(\sqrt{n_i})$.
-
-    It is sufficient to show that,
-
-    $$
-    U_{\epsilon_1}(\Omega_+)\cup U_{\epsilon_2}(\Omega_-)\subset \{x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2\}
-    $$
-
-    By 1-Lipschitz continuity of $f_{n_i}$, we have for all $\zeta\in U_{\epsilon_1}(\Omega_+)$, there is a point $\xi\in \Omega_+$ such that $d(\zeta,\xi)\leq \epsilon_1$. So $U_{\epsilon_1}(\Omega_+)\subset \{f_{n_i}\geq x'-\epsilon_1\}$. With the same argument, we have $U_{\epsilon_2}(\Omega_-)\subset \{f_{n_i}\leq x+\epsilon_2\}$.
-
-    So the push-forward measure of $(f_{n_i})_*\sigma^{n_i}$ of $[x'-\epsilon_1,x'+\epsilon_2]$ is
-
-    $$
-    \begin{aligned}
-    (f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]&=\sigma^{n_i}(x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2)\\
-    &\geq \sigma^{n_i}(U_{\epsilon_1}(\Omega_+)\cap U_{\epsilon_2}(\Omega_-))\\
-    &=\sigma^{n_i}(U_{\epsilon_1}(\Omega_+))+\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))-1\\
-    \end{aligned}
-    $$
-
-    By the lemma~\ref{lemma:isoperimetric_inequality_on_sphere}, we have
-
-    $$
-    \sigma^{n_i}(U_{\epsilon_1}(\Omega_+))\geq \sigma^{n_i}(U_{\epsilon_1}(B_{\Omega_+}))\quad \text{and} \quad \sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\geq \sigma^{n_i}(U_{\epsilon_2}(B_{\Omega_-}))
-    $$
-
-    By the lemma~\ref{lemma:Maxwell-Boltzmann_distribution_law}, we have
-
-    $$
-    \sigma^{n_i}(U_{\epsilon_1}(\Omega_+))+\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\to \gamma^1[x'-\epsilon_1,x'+\epsilon_2]+\gamma^1[x-\epsilon_1,x+\epsilon_2]
-    $$
-
-    Therefore,
-
-    $$
-    \begin{aligned}
-    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]&\geq \liminf_{i\to \infty}(f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]\\
-    &\geq \gamma^1[x'-\epsilon_1,\infty)\cap \gamma^1(-\infty,x+\epsilon_2]-1\\
-    &=\gamma^1[x-\epsilon_1,x+\epsilon_2]
-    \end{aligned}
-    $$
-
-\end{proof}
-
-The full proof of Levy's concentration theorem requires more digestion for cases where $\overline{\sigma}_\infty\neq \delta_{\pm\infty}$ but I don't have enough time to do so. This section may be filled in the next semester.
-
-\section{The application of the concentration of measure phenomenon in non-commutative probability theory}
-
-In quantum communication, we can pass classical bits by sending quantum states. However, by the indistinguishability (Proposition~\ref{prop:indistinguishability}) of quantum states, we cannot send an infinite number of classical bits over a single qubit. There exists a bound for zero-error classical communication rate over a quantum channel.
-
-\begin{theorem}
-	\label{theorem:Holevo_bound}
-	Holevo bound:
-
-	The maximal amount of classical information that can be transmitted by a quantum system is given by the Holevo bound. $\log_2(d)$ is the maximum amount of classical information that can be transmitted by a quantum system with $d$ levels (that is, basically, the number of qubits).
-\end{theorem}
-
-The proof of the Holevo bound can be found in~\cite{Nielsen_Chuang_2010}. In current state of the project, this theorem is not heavily used so we will not make annotated proof here.
-
-\subsection{Quantum communication}
-
-To surpass the Holevo bound, we need to use the entanglement of quantum states.
-
-\begin{defn}
-	\label{defn:Bell_state}
-	Bell state:
-        
-        The Bell states are the following four states:
-        
-        $$
-        |\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle),\quad |\Phi^-\rangle=\frac{1}{\sqrt{2}}(|00\rangle-|11\rangle)
-        $$
-        $$
-        |\Psi^+\rangle=\frac{1}{\sqrt{2}}(|01\rangle+|10\rangle),\quad |\Psi^-\rangle=\frac{1}{\sqrt{2}}(|01\rangle-|10\rangle)
-        $$
-        These are a basis of the 2-qubit Hilbert space.
-\end{defn}
-
-
-\subsection{Superdense coding and entanglement}
-
-The description of the superdense coding can be found in~\cite{gupta2015functionalanalysisquantuminformation} and~\cite{Hayden}.
-
-Suppose $A$ and $B$ share a Bell state (or other maximally entangled state) $|\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle)$, where $A$ holds the first part and $B$ holds the second part.
-        
-$A$ wishes to send 2 \textbf{classical bits} to $B$.
-
-$A$ performs one of four Pauli unitaries (some fancy quantum gates named X, Y, Z, I) on the combined state of entangled qubits $\otimes$ one qubit. Then $A$ sends the resulting one qubit to $B$.
-
-This operation extends the initial one entangled qubit to a system of one of four orthogonal Bell states.
-
-$B$ performs a measurement on the combined state of the one qubit and the entangled qubits he holds.
-
-$B$ decodes the result and obtains the 2 classical bits sent by $A$.
-
-\begin{figure}[h]
-	\centering
-	\includegraphics[width=0.8\textwidth]{superdense_coding.png}
-	\caption{Superdense coding, image from \cite{Hayden}}
-	\label{fig:superdense_coding}
-\end{figure}
-
-Note that superdense coding is a way to send 2 classical bits of information by sending 1 qubit with 1 entangled qubit. \textbf{The role of the entangled qubit} is to help them to distinguish the 4 possible states of the total 3 qubits system where 2 of them (the pair of entangled qubits) are mathematically the same.
-
-Additionally, no information can be gained by measuring a pair of entangled qubits. To send information from  $A$ to $B$, we need to physically send the qubits from $A$ to $B$. That means, we cannot send information faster than the speed of light.
-
-% TODO: FILL the description of the superdense coding here.
-
-\subsection{Hayden's concentration of measure phenomenon}
-
-The application of the concentration of measure phenomenon in the superdense coding can be realized in random sampling the entangled qubits~\cite{Hayden}:
-
-It is a theorem connecting the following mathematical structure:
-
-\begin{figure}[h]
-    \centering
-    \begin{tikzpicture}[node distance=30mm, thick,
-        main/.style={draw, draw=white},
-        towards/.style={->},
-        towards_imp/.style={->,red},
-        mutual/.style={<->}
-        ]
-        % define nodes
-        \node[main] (cp) {$\mathbb{C}P^{d_A d_B-1}$};
-        \node[main] (pa) [left of=cp] {$\mathcal{P}(A\otimes B)$};
-        \node[main] (sa) [below of=pa] {$S_A$};
-        \node[main] (rng) [right of=sa] {$[0,\infty)\subset \mathbb{R}$};
-
-        % draw edges
-        \draw[mutual] (cp) -- (pa);
-        \draw[towards] (pa) -- node[left] {$\operatorname{Tr}_B$} (sa);
-        \draw[towards_imp] (pa) -- node[above right] {$f$} (rng);
-        \draw[towards] (sa) -- node[above] {$H(\psi_A)$} (rng);
-    \end{tikzpicture}
-    \caption{Mathematical structure for Hayden's concentration of measure phenomenon}
-    \label{fig:Hayden_concentration_of_measure_phenomenon}
-\end{figure}
-
-\begin{itemize}
-    \item The red arrow is the concentration of measure effect. $f=H(\operatorname{Tr}_B(\psi))$.
-    \item $S_A$ denotes the mixed states on $A$.
-\end{itemize}
-
-To prove the concentration of measure phenomenon, we need to analyze the following elements involved in figure~\ref{fig:Hayden_concentration_of_measure_phenomenon}:
-
-    
-The existence and uniqueness of the Haar measure is a theorem in compact lie group theory. For this research topic, we will not prove it.
-
-Due to time constrains of the projects, the following lemma is demonstrated but not investigated thoroughly through the research:
-
-
-\begin{lemma}
-    \label{pages_lemma}
-
-    Page's lemma for expected entropy of mixed states
-
-    Choose a random pure state $\sigma=|\psi\rangle\langle\psi|$ from $A'\otimes A$.
-
-    The expected value of the entropy of entanglement is known and satisfies a concentration inequality known as Page's formula~\cite{Pages_conjecture,Pages_conjecture_simple_proof,Bengtsson_Zyczkowski_2017}[15.72].
-
-    $$
-    \mathbb{E}[H(\psi_A)]=\frac{1}{\ln(2)}\left(\sum_{j=d_B+1}^{d_Ad_B}\frac{1}{j}-\frac{d_A-1}{2d_B}\right) \geq \log_2(d_A)-\frac{1}{2\ln(2)}\frac{d_A}{d_B}
-    $$
-
-\end{lemma}
-
-It basically provides a lower bound for the expected entropy of entanglement. Experimentally, we can have the following result (see Figure~\ref{fig:entropy_vs_dim}):
-
-\begin{figure}[h]
-	\centering
-	\includegraphics[width=0.8\textwidth]{entropy_vs_dim.png}
-	\caption{Entropy vs dimension}
-	\label{fig:entropy_vs_dim}
-\end{figure}
-
-Then we have bound for Lipschitz constant $\eta$ of the map $S(\varphi_A): \mathcal{P}(A\otimes B)\to \R$
-
-\begin{lemma}
-    The Lipschitz constant $\eta$ of $S(\varphi_A)$ is upper bounded by $\sqrt{8}\log_2(d_A)$ for $d_A\geq 3$.
-\end{lemma}
-
-\begin{proof}
-    Consider the Lipschitz constant of the function $g:A\otimes B\to \R$ defined as $g(\varphi)=H(M(\varphi_A))$, where $M:A\otimes B\to \mathcal{P}(A)$ is any fixed complete von Neumann measurement and $H: \mathcal{P}(A)\otimes \mathcal{P}(B)\to \R$ is the Shannon entropy.
-
-    Let $\{\ket{e_j}_A\}$ be the orthonormal basis for $A$ and $\{\ket{f_k}_B\}$ be the orthonormal basis for $B$. Then we decompose the state as spectral form $\ket{\varphi}=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\varphi_{jk}\ket{e_j}_A\ket{f_k}_B$.
-
-    By unitary invariance, suppose $M_j=\ket{e_j}\bra{e_j}_A$, and define
-
-    $$
-    p_j(\varphi)=\bra{e_j}\varphi_A \ket{e_j}=\sum_{k=1}^{d_B}|\varphi_{jk}|^2
-    $$
-
-    Then 
-    
-    $$
-    g(\varphi)=H(M(\varphi_A))=-\sum_{j=1}^{d_A}p_j(\varphi)\log_2(p_j(\varphi))
-    $$
-
-    Let $h(p)=-p\log_2(p)$, $h(p)=-\frac{p\ln p}{\ln 2}$, and $h'(p)=-\frac{\ln p+1}{\ln 2}$. Let $\varphi_{jk}=x_{jk}+i y_{jk}$, then $p_j(\varphi)=\sum_{k=1}^{d_B}(x_{jk}^2+y_{jk}^2)$, $\frac{\partial p_j}{\partial x_{jk}}=2x_{jk}$, $\frac{\partial p_j}{\partial y_{jk}}=2y_{jk}$.
-
-    Therefore 
-    
-    $$
-    \frac{\partial g}{\partial x_{jk}}=\frac{\partial g}{\partial p_j}\frac{\partial p_j}{x_{jk}}=-\frac{1+\ln p_j}{\ln 2}\cdot 2x_{jk}
-    \qquad 
-    \frac{\partial g}{\partial y_{jk}}=-\frac{1+\ln p_j}{\ln 2}\cdot 2y_{jk}
-    $$
-    
-    Then the lipschitz constant of $g$ is
-
-    $$
-    \begin{aligned}
-    \eta^2&=\sup_{\langle \varphi|\varphi\rangle \leq 1}\nabla g\cdot \nabla g\\
-    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\left(\frac{\partial g}{\partial x_{jk}}\right)^2+\left(\frac{\partial g}{\partial y_{jk}}\right)^2\\
-    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\frac{4(x_{jk}^2+y_{jk}^2)}{(\ln 2)^2}[1+\ln p_j(\varphi)]^2\\
-    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\frac{4|\varphi_{jk}|^2}{(\ln 2)^2}[1+\ln p_j(\varphi)]^2\\
-    \end{aligned}
-    $$
-
-    Note that $\sum_{k=1}^{d_B}|\varphi_{jk}|^2=p_j(\varphi)$, $\nabla g\cdot \nabla g=\frac{4}{(\ln 2)^2}\sum_{j=1}^{d_A}p_j(\varphi)(1+\ln p_j(\varphi))^2$.
-
-    Since $0\leq p_j\leq 1$, we have $\ln p_j(\varphi)\leq 0$, hence $\sum_{j=0}^{d_A}p_j(\varphi)\ln p_j(\varphi)\leq 0$.
-    
-    $$
-    \begin{aligned}
-        \sum_{j=1}^{d_A}p_j(\varphi)(1+\ln p_j(\varphi))^2&=\sum_{j=1}^{d_A}p_j(\varphi)(1+2\ln p_j(\varphi)+(\ln p_j(\varphi))^2)\\
-        &=1+2\sum_{j=1}^{d_A} p_j(\varphi)\ln p_j(\varphi)+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2\\
-        &\leq 1+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2\\
-    \end{aligned}
-    $$
-
-    Thus,
-    $$
-    \begin{aligned}
-    \nabla g\cdot \nabla g&\leq \frac{4}{(\ln 2)^2}[1+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2]\\
-    &\leq \frac{4}{(\ln 2)^2}[1+(\ln d_A)^2]\\
-    &\leq 8(\log_2 d_A)^2
-    \end{aligned}
-    $$
-
-    Proving $\sum_j^{d_A} p_j(\varphi)\ln p_j(\varphi)\leq (\ln d_A)^2$ for $d_A\geq 3$ takes some efforts and we will continue that later.
-
-    Consider any two unit vectors $\ket{\varphi}$ and $\ket{\psi}$, assume $S(\varphi_A)\leq S(\psi_A)$. If we choose the measurement $M$ to be along the eigenbasis of $\varphi_A$, $H(M(\varphi_A))=S(\varphi_A)$ and we have
-
-    $$
-    S(\psi_A)-S(\varphi_A)\leq H(M(\psi_A))-H(M(\varphi_A))\leq \eta\|\ket{\psi}-\ket{\varphi}\|
-    $$
-
-    Thus the lipschitz constant of $S(\varphi_A)$ is upper bounded by $\sqrt{8}\log_2(d_A)$.
-\end{proof}
-
-From Levy's lemma, we have
-
-If we define $\beta=\frac{1}{\ln(2)}\frac{d_A}{d_B}$, then we have
-
-$$
-\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta] \leq \exp\left(-\frac{1}{8\pi^2\ln(2)}\frac{(d_Ad_B-1)\alpha^2}{(\log_2(d_A))^2}\right)
-$$
-
-where $d_B\geq d_A\geq 3$~\cite{Hayden_2006}.
-
-Experimentally, we can have the following result:
-
-As the dimension of the Hilbert space increases, the chance of getting an almost maximally entangled state increases (see Figure~\ref{fig:entropy_vs_dA}).
-
-\begin{figure}[h]
-	\centering
-	\includegraphics[width=0.8\textwidth]{entropy_vs_dA.png}
-	\caption{Entropy vs $d_A$}
-	\label{fig:entropy_vs_dA}
-\end{figure}
-
-% When compiled standalone, print this chapter's references at the end.
-\ifSubfilesClassLoaded{
-  \printbibliography[title={References for Chapter 1}]
-}
-
-\end{document}
+% chapters/chap1.tex
+\documentclass[../main.tex]{subfiles}
+
+% If this chapter is compiled *by itself*, we must load only its own .bib
+% and print its bibliography at the end of the chapter.
+\ifSubfilesClassLoaded{
+  \addbibresource{../main.bib}
+}
+
+\usepackage{amsmath, amsfonts, amsthm}
+\usepackage{fancyhdr,parskip}
+\usepackage{fullpage}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% add special notation supports
+\usepackage[mathscr]{euscript}
+\usepackage{mathtools}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% add image package and directory
+\usepackage{graphicx}
+\usepackage{tikz}
+\graphicspath{{../images/}}
+
+
+\begin{document}
+
+
+\chapter{Concentration of Measure And Quantum Entanglement}
+
+
+First, we will build the mathematical model describing the behavior of quantum systems and explain why it makes sense to physicists and is meaningful to the general public.
+
+\section{Motivation}
+
+First, we introduce a motivation for introducing non-commutative probability theory to the study of quantum mechanics. This section is mainly based on the book~\cite{kummer1998elements}.
+
+\subsection{Light polarization and the violation of Bell's inequality}
+
+The light which comes through a polarizer is polarized in a certain direction. If we fix the first filter and rotate the second filter, we will observe the intensity of the light will change.
+
+The light intensity decreases with $\alpha$ (the angle between the two filters). The light should vanish when $\alpha=\pi/2$.
+
+However, for a system of 3 polarizing filters $F_1,F_2,F_3$, having directions $\alpha_1,\alpha_2,\alpha_3$, if we put them on the optical bench in pairs, then we will have three random variables $P_1,P_2,P_3$.
+
+\begin{figure}[h]
+	\centering
+	\includegraphics[width=0.7\textwidth]{Filter_figure.png}
+	\caption{The light polarization experiment, image from \cite{kummer1998elements}}
+	\label{fig:Filter_figure}
+\end{figure}
+
+\begin{theorem}
+	\label{theorem:Bell's_3_variable_inequality}
+	Bell's 3 variable inequality:
+
+	For any three random variables $P_1,P_2,P_3$ in a classical probability space, we have
+
+	$$
+	\operatorname{Prob}(P_1=1,P_3=0)\leq \operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
+	$$
+\end{theorem}
+
+\begin{proof}
+	By the law of total probability, we split the event $\{P_1=1,P_3=0\}$ according to the value of $P_2$. The first piece is contained in the event $\{P_1=1,P_2=0\}$ and the second piece is contained in the event $\{P_2=1,P_3=0\}$, so
+
+    $$
+    \begin{aligned}
+    \operatorname{Prob}(P_1=1,P_3=0)&=\operatorname{Prob}(P_1=1,P_2=0,P_3=0)\\
+    &+\operatorname{Prob}(P_1=1,P_2=1,P_3=0)\\
+    &\leq\operatorname{Prob}(P_1=1,P_2=0)+\operatorname{Prob}(P_2=1,P_3=0)
+    \end{aligned}
+    $$
+\end{proof}
+
+However, according to our experimental measurement, for any pair of polarizers $F_i,F_j$, by the complement rule, we have
+$$
+\begin{aligned}
+\operatorname{Prob}(P_i=1,P_j=0)&=\operatorname{Prob}(P_i=1)-\operatorname{Prob}(P_i=1,P_j=1)\\
+&=\frac{1}{2}-\frac{1}{2}\cos^2(\alpha_i-\alpha_j)\\
+&=\frac{1}{2}\sin^2(\alpha_i-\alpha_j)
+\end{aligned}
+$$
+
+This leads to a contradiction if we apply the inequality to the experimental data.
+
+$$
+\frac{1}{2}\sin^2(\alpha_1-\alpha_3)\leq\frac{1}{2}\sin^2(\alpha_1-\alpha_2)+\frac{1}{2}\sin^2(\alpha_2-\alpha_3)
+$$
+
+If $\alpha_1=0,\alpha_2=\frac{\pi}{6},\alpha_3=\frac{\pi}{3}$, then
+
+$$
+\begin{aligned}
+\frac{1}{2}\sin^2(-\frac{\pi}{3})&\leq\frac{1}{2}\sin^2(-\frac{\pi}{6})+\frac{1}{2}\sin^2(\frac{\pi}{6}-\frac{\pi}{3})\\
+\frac{3}{8}&\leq\frac{1}{8}+\frac{1}{8}\\
+\frac{3}{8}&\leq\frac{1}{4}
+\end{aligned}
+$$
+
+Other revised experiments (e.g., Aspect's experiment, calcium entangled photon experiment) are also conducted and the inequality is still violated.
+
+\subsection{The true model of light polarization}
+    
+The full description of the light polarization is given below:
+
+State of polarization of a photon: $\psi=\alpha|0\rangle+\beta|1\rangle$, where $|0\rangle$ and $|1\rangle$ are the two orthogonal polarization states in $\mathbb{C}^2$.
+
+Polarization filter (generalized 0,1 valued random variable): orthogonal projection $P_\alpha$ on $\mathbb{C}^2$ corresponding to the direction $\alpha$ (operator satisfies $P_\alpha^*=P_\alpha=P_\alpha^2$).
+
+The matrix representation of $P_\alpha$ is given by
+
+$$
+P_\alpha=\begin{pmatrix}
+\cos^2(\alpha) & \cos(\alpha)\sin(\alpha)\\
+\cos(\alpha)\sin(\alpha) & \sin^2(\alpha)
+\end{pmatrix}
+$$
+
+Probability of a photon passing through the filter $P_\alpha$ is given by $\langle P_\alpha\psi,\psi\rangle$; this is $\cos^2(\alpha)$ if we set $\psi=|0\rangle$.
+
+Since the projections $P_{\alpha_i}$ corresponding to different directions do not commute in general, the order in which the filters are applied matters, and it is impossible to discuss $\operatorname{Prob}(P_1=1,P_3=0)$ in the classical setting.
+
+We now show how the experimentally observed probability
+$$
+\frac{1}{2}\sin^2(\alpha_i-\alpha_j)
+$$
+arises from the operator model.
+
+Assume the incoming light is \emph{unpolarized}. It is therefore described by
+the density matrix
+$$
+\rho=\frac{1}{2} I .
+$$
+
+Let $P_{\alpha_i}$ and $P_{\alpha_j}$ be the orthogonal projections corresponding
+to the two polarization filters with angles $\alpha_i$ and $\alpha_j$.
+
+The probability that a photon passes the first filter $P_{\alpha_i}$ is given by the Born rule:
+
+$$
+\operatorname{Prob}(P_i=1)
+=\operatorname{tr}(\rho P_{\alpha_i})
+=\frac{1}{2} \operatorname{tr}(P_{\alpha_i})
+=\frac{1}{2}
+$$
+
+If the photon passes the first filter, the post-measurement state is given by the L\"uders rule:
+
+$$
+\rho \longmapsto
+\rho_i
+=\frac{P_{\alpha_i}\rho P_{\alpha_i}}{\operatorname{tr}(\rho P_{\alpha_i})}
+= P_{\alpha_i}.
+$$
+
+The probability that the photon then passes the second filter is
+
+$$
+\operatorname{Prob}(P_j=1 \mid P_i=1)
+=\operatorname{tr}(P_{\alpha_i} P_{\alpha_j})
+=\cos^2(\alpha_i-\alpha_j).
+$$
+
+Hence, the probability that the photon passes $P_{\alpha_i}$ and is then blocked by $P_{\alpha_j}$ is
+
+$$
+\begin{aligned}
+\operatorname{Prob}(P_i=1, P_j=0)
+&= \operatorname{Prob}(P_i=1)
+   - \operatorname{Prob}(P_i=1, P_j=1) \\
+&= \frac12 - \frac12 \cos^2(\alpha_i-\alpha_j) \\
+&= \frac12 \sin^2(\alpha_i-\alpha_j).
+\end{aligned}
+$$
+
+This agrees with the experimentally observed transmission probabilities, but it should be emphasized that this quantity corresponds to a \emph{sequential measurement} rather than a joint probability in the classical sense.
+
+\section{Concentration of measure phenomenon}
+
+\begin{defn}
+	$\eta$-Lipschitz function
+
+    Let $(X,\operatorname{dist}_X)$ and $(Y,\operatorname{dist}_Y)$ be two metric spaces. A function $f:X\to Y$ is said to be Lipschitz if there exists a constant $L\geq 0$ such that
+    $$
+    \operatorname{dist}_Y(f(x),f(y))\leq L\operatorname{dist}_X(x,y)
+    $$
+    for all $x,y\in X$. The infimum of all such constants $L$ is called the Lipschitz constant of $f$ and is denoted by $\eta=\|f\|_{\operatorname{Lip}}$; in this case $f$ is said to be $\eta$-Lipschitz.
+\end{defn}
+
+That basically means that the function $f$ does not stretch the distance between any two points of $X$ by more than a factor of $L$.
+
+This is a stronger condition than continuity: every Lipschitz function is continuous, but not every continuous function is Lipschitz.
+
+\begin{lemma}
+	\label{lemma:isoperimetric_inequality_on_sphere}
+	Isoperimetric inequality on the sphere:
+
+    Let $\sigma_n(A)$ denote the normalized area of $A$ on the $n$-dimensional sphere $S^n$. That is, $\sigma_n(A)\coloneqq\frac{\operatorname{Area}(A)}{\operatorname{Area}(S^n)}$.
+
+    Let $\epsilon>0$. Then for any subset $A\subset S^n$, given the area $\sigma_n(A)$, the spherical caps minimize the volume of the $\epsilon$-neighborhood of $A$.
+
+    Suppose $\sigma^n(\cdot)$ is the normalized volume measure on the sphere $S^n(1)$, then for any closed subset $\Omega\subset S^n(1)$, we take a metric ball $B_\Omega$ of $S^n(1)$ with $\sigma^n(B_\Omega)=\sigma^n(\Omega)$. Then we have
+
+    $$
+    \sigma^n(U_r(\Omega))\geq \sigma^n(U_r(B_\Omega))
+    $$
+
+    where $U_r(A)=\{x\in X:d(x,A)< r\}$
+\end{lemma}
+
+Intuitively, the lemma means that the spherical caps are the most efficient way to cover the sphere.
+
+Here, the efficiency is measured by the epsilon-neighborhood of the boundary of the spherical cap.
+
+To prove the lemma, we need to have a good understanding of the Riemannian geometry of the sphere. For now, let's just take the lemma for granted.
+
+\subsection{Levy's concentration theorem}
+
+\begin{theorem}
+	\label{theorem:Levy's_concentration_theorem}
+	Levy's concentration theorem:
+
+    An arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does.
+
+    That is,
+    $$
+    \mu\{x\in S^n: |f(x)-a_0|\geq\epsilon\} < \kappa_n(\epsilon)\leq 2\exp\left(-\frac{(n-1)\epsilon^2}{2}\right)
+    $$
+    where 
+    $$
+    \kappa_n(\epsilon)=\frac{\int_\epsilon^{\frac{\pi}{2}}\cos^{n-1}(t)dt}{\int_0^{\frac{\pi}{2}}\cos^{n-1}(t)dt}
+    $$
+    $a_0$ is the \textbf{Levy mean} of function $f$, that is, the level set $f^{-1}(a_0)\subset S^n$ divides the sphere into equal halves, characterized by the following inequalities:
+    $$
+    \mu(f^{-1}(-\infty,a_0])\geq \frac{1}{2} \text{ and } \mu(f^{-1}[a_0,\infty))\geq \frac{1}{2}
+    $$
+\end{theorem}
+
+We will prove the theorem via the Maxwell-Boltzmann distribution law in this section for simplicity~\cite{shioya2014metricmeasuregeometry}. The theorem will be discussed later in more general cases.
+
+\begin{defn}
+	\label{defn:Gaussian_measure}
+	Gaussian measure:
+
+    We denote the Gaussian measure on $\mathbb{R}^k$ as $\gamma^k$.
+
+    $$
+    d\gamma^k(x)\coloneqq\frac{1}{\sqrt{2\pi}^k}\exp(-\frac{1}{2}\|x\|^2)dx
+    $$
+    
+    $x\in \mathbb{R}^k$, $\|x\|^2=\sum_{i=1}^k x_i^2$ is the Euclidean norm, and $dx$ is the Lebesgue measure on $\mathbb{R}^k$.
+    
+\end{defn}
+
+Basically, you can consider the Gaussian measure as the probability distribution of a standard normal random vector in $\mathbb{R}^k$: its coordinates are independent, each with mean $0$ and standard deviation $1$.
+
+The Maxwell-Boltzmann distribution law stated below is also known as the projective limit theorem~\cite{romanvershyni}.
+
+If $X\sim \operatorname{Unif}(S^n(\sqrt{n}))$, then for any fixed unit vector $x$ we have $\langle X,x\rangle\to N(0,1)$ in distribution as $n\to \infty$.
+
+\begin{figure}[h]
+    \centering
+    \includegraphics[width=0.8\textwidth]{../images/maxwell.png}
+    \caption{Maxwell-Boltzmann distribution law, image from \cite{romanvershyni}}
+    \label{fig:Maxwell-Boltzmann_distribution_law}
+\end{figure}
+
+\begin{lemma}
+	\label{lemma:Maxwell-Boltzmann_distribution_law}
+    Maxwell-Boltzmann distribution law:
+    
+    For any natural number $k$,
+    $$
+    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}\to \frac{d\gamma^k(x)}{dx}
+    $$
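+    where $\pi_{n,k}\colon S^n(\sqrt{n})\subset\mathbb{R}^{n+1}\to\mathbb{R}^k$ denotes the orthogonal projection onto the first $k$ coordinates, $(\pi_{n,k})_*\sigma^n$ is the push-forward measure of $\sigma^n$ by $\pi_{n,k}$, and the limit is taken as $n\to\infty$.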
+    
+    In other words,
+    $$
+    (\pi_{n,k})_*\sigma^n\to \gamma^k\text{ weakly as }n\to \infty
+    $$
+\end{lemma}
+
+\begin{proof}
+    We denote the $k$-dimensional volume measure by $\operatorname{vol}_k$.
+    
+    Observe that $\pi_{n,k}^{-1}(x),x\in \mathbb{R}^k$ is isometric to $S^{n-k}(\sqrt{n-\|x\|^2})$, that is, for any $x\in \mathbb{R}^k$, $\pi_{n,k}^{-1}(x)$ is a sphere with radius $\sqrt{n-\|x\|^2}$ (by the definition of $\pi_{n,k}$).
+    
+    So,
+    $$
+    \begin{aligned}
+    \frac{d(\pi_{n,k})_*\sigma^n(x)}{dx}&=\frac{\operatorname{vol}_{n-k}(\pi_{n,k}^{-1}(x))}{\operatorname{vol}_n(S^n(\sqrt{n}))}\\
+    &=\frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}\\
+    \end{aligned}
+    $$
+    as $n\to \infty$.
+    
+    Note that $\lim_{n\to \infty}(1-\frac{a}{n})^n=e^{-a}$ for any $a>0$.
+    
+    $(n-\|x\|^2)^{\frac{n-k}{2}}=\left(n(1-\frac{\|x\|^2}{n})\right)^{\frac{n-k}{2}}\to n^{\frac{n-k}{2}}\exp(-\frac{\|x\|^2}{2})$
+    
+    So
+    $$
+    \begin{aligned}
+    \frac{(n-\|x\|^2)^{\frac{n-k}{2}}}{\int_{\|x\|\leq \sqrt{n}}(n-\|x\|^2)^{\frac{n-k}{2}}dx}&\to\frac{e^{-\frac{\|x\|^2}{2}}}{\int_{x\in \mathbb{R}^k}e^{-\frac{\|x\|^2}{2}}dx}\quad(n\to\infty)\\
+    &=\frac{1}{(2\pi)^{\frac{k}{2}}}e^{-\frac{\|x\|^2}{2}}\\
+    &=\frac{d\gamma^k(x)}{dx}
+    \end{aligned}
+    $$
+\end{proof}
+
+Now we can prove Levy's concentration theorem, the proof is from~\cite{shioya2014metricmeasuregeometry}.
+
+\begin{proof}
+    Let $f_n:S^n(\sqrt{n})\to \mathbb{R}$, $n=1,2,\ldots$, be 1-Lipschitz functions.
+
+    Let $x$ and $x'$ be two given real numbers and $\gamma^1(-\infty,x]=\overline{\sigma}_\infty[-\infty,x']$, suppose $\sigma_\infty\{x'\}=0$, where $\{\sigma_i\}$ is a sequence of Borel probability measures on $\mathbb{R}$.
+
+    We want to show that, for all non-negative real numbers $\epsilon_1$ and $\epsilon_2$.
+
+    $$
+    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]\geq \gamma^1[x-\epsilon_1,x+\epsilon_2]
+    $$
+    
+    Consider the two closed sets $\Omega_+\coloneqq \{f_{n_i}\geq x'\}$ and $\Omega_-\coloneqq \{f_{n_i}\leq x'\}$. Note that $\Omega_+\cup \Omega_-=S^{n_i}(\sqrt{n_i})$.
+
+    It is sufficient to show that,
+
+    $$
+    U_{\epsilon_1}(\Omega_+)\cap U_{\epsilon_2}(\Omega_-)\subset \{x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2\}
+    $$
+
+    By 1-Lipschitz continuity of $f_{n_i}$, we have for all $\zeta\in U_{\epsilon_1}(\Omega_+)$, there is a point $\xi\in \Omega_+$ such that $d(\zeta,\xi)\leq \epsilon_1$, hence $f_{n_i}(\zeta)\geq f_{n_i}(\xi)-\epsilon_1\geq x'-\epsilon_1$. So $U_{\epsilon_1}(\Omega_+)\subset \{f_{n_i}\geq x'-\epsilon_1\}$. With the same argument, we have $U_{\epsilon_2}(\Omega_-)\subset \{f_{n_i}\leq x'+\epsilon_2\}$.
+
+    So the push-forward measure of $(f_{n_i})_*\sigma^{n_i}$ of $[x'-\epsilon_1,x'+\epsilon_2]$ is
+
+    $$
+    \begin{aligned}
+    (f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]&=\sigma^{n_i}(x'-\epsilon_1\leq f_{n_i}\leq x'+\epsilon_2)\\
+    &\geq \sigma^{n_i}(U_{\epsilon_1}(\Omega_+)\cap U_{\epsilon_2}(\Omega_-))\\
+    &=\sigma^{n_i}(U_{\epsilon_1}(\Omega_+))+\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))-1\\
+    \end{aligned}
+    $$
+
+    By the lemma~\ref{lemma:isoperimetric_inequality_on_sphere}, we have
+
+    $$
+    \sigma^{n_i}(U_{\epsilon_1}(\Omega_+))\geq \sigma^{n_i}(U_{\epsilon_1}(B_{\Omega_+}))\quad \text{and} \quad \sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\geq \sigma^{n_i}(U_{\epsilon_2}(B_{\Omega_-}))
+    $$
+
+    By the lemma~\ref{lemma:Maxwell-Boltzmann_distribution_law}, we have
+
+    $$
+    \liminf_{i\to\infty}\sigma^{n_i}(U_{\epsilon_1}(\Omega_+))\geq \gamma^1[x-\epsilon_1,\infty)\quad \text{and} \quad \liminf_{i\to\infty}\sigma^{n_i}(U_{\epsilon_2}(\Omega_-))\geq \gamma^1(-\infty,x+\epsilon_2]
+    $$
+
+    Therefore,
+
+    $$
+    \begin{aligned}
+    \sigma_\infty[x'-\epsilon_1,x'+\epsilon_2]&\geq \liminf_{i\to \infty}(f_{n_i})_*\sigma^{n_i}[x'-\epsilon_1,x'+\epsilon_2]\\
+    &\geq \gamma^1[x-\epsilon_1,\infty)+\gamma^1(-\infty,x+\epsilon_2]-1\\
+    &=\gamma^1[x-\epsilon_1,x+\epsilon_2]
+    \end{aligned}
+    $$
+
+\end{proof}
+
+The full proof of Levy's concentration theorem requires more digestion for cases where $\overline{\sigma}_\infty\neq \delta_{\pm\infty}$ but I don't have enough time to do so. This section may be filled in the next semester.
+
+\section{The application of the concentration of measure phenomenon in non-commutative probability theory}
+
+In quantum communication, we can pass classical bits by sending quantum states. However, by the indistinguishability (Proposition~\ref{prop:indistinguishability}) of quantum states, we cannot send an infinite number of classical bits over a single qubit. There exists a bound for zero-error classical communication rate over a quantum channel.
+
+\begin{theorem}
+	\label{theorem:Holevo_bound}
+	Holevo bound:
+
+	The maximal amount of classical information that can be transmitted by a quantum system is given by the Holevo bound. $\log_2(d)$ is the maximum amount of classical information that can be transmitted by a quantum system with $d$ levels (that is, basically, the number of qubits).
+\end{theorem}
+
+The proof of the Holevo bound can be found in~\cite{Nielsen_Chuang_2010}. In current state of the project, this theorem is not heavily used so we will not make annotated proof here.
+
+\subsection{Quantum communication}
+
+To surpass the Holevo bound, we need to use the entanglement of quantum states.
+
+\begin{defn}
+	\label{defn:Bell_state}
+	Bell state:
+        
+        The Bell states are the following four states:
+        
+        $$
+        |\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle),\quad |\Phi^-\rangle=\frac{1}{\sqrt{2}}(|00\rangle-|11\rangle)
+        $$
+        $$
+        |\Psi^+\rangle=\frac{1}{\sqrt{2}}(|01\rangle+|10\rangle),\quad |\Psi^-\rangle=\frac{1}{\sqrt{2}}(|01\rangle-|10\rangle)
+        $$
+        These are a basis of the 2-qubit Hilbert space.
+\end{defn}
+
+
+\subsection{Superdense coding and entanglement}
+
+The description of the superdense coding can be found in~\cite{gupta2015functionalanalysisquantuminformation} and~\cite{Hayden}.
+
+Suppose $A$ and $B$ share a Bell state (or other maximally entangled state) $|\Phi^+\rangle=\frac{1}{\sqrt{2}}(|00\rangle+|11\rangle)$, where $A$ holds the first part and $B$ holds the second part.
+        
+$A$ wishes to send 2 \textbf{classical bits} to $B$.
+
+$A$ applies one of the four Pauli unitaries (the quantum gates named $I$, $X$, $Y$, $Z$), chosen according to the two classical bits, to her half of the entangled pair. Then $A$ sends her qubit to $B$.
+
+This operation maps the shared state $|\Phi^+\rangle$ to one of the four mutually orthogonal Bell states (up to a global phase).
+
+$B$ performs a measurement in the Bell basis on the two qubits he now holds: the qubit received from $A$ and his own half of the entangled pair.
+
+$B$ decodes the result and obtains the 2 classical bits sent by $A$.
+
+\begin{figure}[h]
+	\centering
+	\includegraphics[width=0.8\textwidth]{superdense_coding.png}
+	\caption{Superdense coding, image from \cite{Hayden}}
+	\label{fig:superdense_coding}
+\end{figure}
+
+Note that superdense coding is a way to send 2 classical bits of information by sending 1 qubit with the help of 1 pre-shared entangled pair. \textbf{The role of the entangled qubit} is to make the 4 possible states of the total 2-qubit system mutually orthogonal, so that $B$ can distinguish them perfectly, even though $A$ only acts on her own qubit.
+
+Additionally, no information can be gained by measuring a pair of entangled qubits. To send information from  $A$ to $B$, we need to physically send the qubits from $A$ to $B$. That means, we cannot send information faster than the speed of light.
+
+% TODO: FILL the description of the superdense coding here.
+
+\subsection{Hayden's concentration of measure phenomenon}
+
+The application of the concentration of measure phenomenon in the superdense coding can be realized in random sampling the entangled qubits~\cite{Hayden}:
+
+It is a theorem connecting the following mathematical structure:
+
+\begin{figure}[h]
+    \centering
+    \begin{tikzpicture}[node distance=30mm, thick,
+        main/.style={draw, draw=white},
+        towards/.style={->},
+        towards_imp/.style={->,red},
+        mutual/.style={<->}
+        ]
+        % define nodes
+        \node[main] (cp) {$\mathbb{C}P^{d_A d_B-1}$};
+        \node[main] (pa) [left of=cp] {$\mathcal{P}(A\otimes B)$};
+        \node[main] (sa) [below of=pa] {$S_A$};
+        \node[main] (rng) [right of=sa] {$[0,\infty)\subset \mathbb{R}$};
+
+        % draw edges
+        \draw[mutual] (cp) -- (pa);
+        \draw[towards] (pa) -- node[left] {$\operatorname{Tr}_B$} (sa);
+        \draw[towards_imp] (pa) -- node[above right] {$f$} (rng);
+        \draw[towards] (sa) -- node[above] {$H(\psi_A)$} (rng);
+    \end{tikzpicture}
+    \caption{Mathematical structure for Hayden's concentration of measure phenomenon}
+    \label{fig:Hayden_concentration_of_measure_phenomenon}
+\end{figure}
+
+\begin{itemize}
+    \item The red arrow is the concentration of measure effect. $f=H(\operatorname{Tr}_B(\psi))$.
+    \item $S_A$ denotes the mixed states on $A$.
+\end{itemize}
+
+To prove the concentration of measure phenomenon, we need to analyze the following elements involved in figure~\ref{fig:Hayden_concentration_of_measure_phenomenon}:
+
+    
+The existence and uniqueness of the Haar measure is a theorem in compact Lie group theory. For this research topic, we will not prove it.
+
+Due to the time constraints of the project, the following lemma is stated but not investigated thoroughly in this research:
+
+
+\begin{lemma}
+    \label{pages_lemma}
+
+    Page's lemma for expected entropy of mixed states
+
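+    Choose a random pure state $\sigma=|\psi\rangle\langle\psi|$ from $A\otimes B$, where $\dim A=d_A\leq d_B=\dim B$, and let $\psi_A=\operatorname{Tr}_B(\sigma)$ denote its reduced state on $A$.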
+
+    The expected value of the entropy of entanglement is known exactly; it is given by Page's formula and admits the following lower bound~\cite{Pages_conjecture,Pages_conjecture_simple_proof,Bengtsson_Zyczkowski_2017}[15.72].
+
+    $$
+    \mathbb{E}[H(\psi_A)]=\frac{1}{\ln(2)}\left(\sum_{j=d_B+1}^{d_Ad_B}\frac{1}{j}-\frac{d_A-1}{2d_B}\right) \geq \log_2(d_A)-\frac{1}{2\ln(2)}\frac{d_A}{d_B}
+    $$
+
+\end{lemma}
+
+It basically provides a lower bound for the expected entropy of entanglement. Experimentally, we can have the following result (see Figure~\ref{fig:entropy_vs_dim}):
+
+\begin{figure}[h]
+	\centering
+	\includegraphics[width=0.8\textwidth]{entropy_vs_dim.png}
+	\caption{Entropy vs dimension}
+	\label{fig:entropy_vs_dim}
+\end{figure}
+
+Then we have bound for Lipschitz constant $\eta$ of the map $S(\varphi_A): \mathcal{P}(A\otimes B)\to \R$
+
+\begin{lemma}
+    The Lipschitz constant $\eta$ of $S(\varphi_A)$ is upper bounded by $\sqrt{8}\log_2(d_A)$ for $d_A\geq 3$.
+\end{lemma}
+
+\begin{proof}
+    Consider the Lipschitz constant of the function $g:A\otimes B\to \R$ defined as $g(\varphi)=H(M(\varphi_A))$, where $M:A\otimes B\to \mathcal{P}(A)$ is any fixed complete von Neumann measurement and $H: \mathcal{P}(A)\otimes \mathcal{P}(B)\to \R$ is the Shannon entropy.
+
+    Let $\{\ket{e_j}_A\}$ be the orthonormal basis for $A$ and $\{\ket{f_k}_B\}$ be the orthonormal basis for $B$. Then we decompose the state as spectral form $\ket{\varphi}=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\varphi_{jk}\ket{e_j}_A\ket{f_k}_B$.
+
+    By unitary invariance, suppose $M_j=\ket{e_j}\bra{e_j}_A$, and define
+
+    $$
+    p_j(\varphi)=\bra{e_j}\varphi_A \ket{e_j}=\sum_{k=1}^{d_B}|\varphi_{jk}|^2
+    $$
+
+    Then 
+    
+    $$
+    g(\varphi)=H(M(\varphi_A))=-\sum_{j=1}^{d_A}p_j(\varphi)\log_2(p_j(\varphi))
+    $$
+
+    Let $h(p)=-p\log_2(p)$, $h(p)=-\frac{p\ln p}{\ln 2}$, and $h'(p)=-\frac{\ln p+1}{\ln 2}$. Let $\varphi_{jk}=x_{jk}+i y_{jk}$, then $p_j(\varphi)=\sum_{k=1}^{d_B}(x_{jk}^2+y_{jk}^2)$, $\frac{\partial p_j}{\partial x_{jk}}=2x_{jk}$, $\frac{\partial p_j}{\partial y_{jk}}=2y_{jk}$.
+
+    Therefore 
+    
+    $$
+    \frac{\partial g}{\partial x_{jk}}=\frac{\partial g}{\partial p_j}\frac{\partial p_j}{\partial x_{jk}}=-\frac{1+\ln p_j}{\ln 2}\cdot 2x_{jk}
+    \qquad 
+    \frac{\partial g}{\partial y_{jk}}=-\frac{1+\ln p_j}{\ln 2}\cdot 2y_{jk}
+    $$
+    
+    Then the Lipschitz constant of $g$ is
+
+    $$
+    \begin{aligned}
+    \eta^2&=\sup_{\langle \varphi|\varphi\rangle \leq 1}\nabla g\cdot \nabla g\\
+    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\left(\frac{\partial g}{\partial x_{jk}}\right)^2+\left(\frac{\partial g}{\partial y_{jk}}\right)^2\\
+    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\frac{4(x_{jk}^2+y_{jk}^2)}{(\ln 2)^2}[1+\ln p_j(\varphi)]^2\\
+    &=\sum_{j=1}^{d_A}\sum_{k=1}^{d_B}\frac{4|\varphi_{jk}|^2}{(\ln 2)^2}[1+\ln p_j(\varphi)]^2\\
+    \end{aligned}
+    $$
+
+    Note that $\sum_{k=1}^{d_B}|\varphi_{jk}|^2=p_j(\varphi)$, $\nabla g\cdot \nabla g=\frac{4}{(\ln 2)^2}\sum_{j=1}^{d_A}p_j(\varphi)(1+\ln p_j(\varphi))^2$.
+
+    Since $0\leq p_j\leq 1$, we have $\ln p_j(\varphi)\leq 0$, hence $\sum_{j=1}^{d_A}p_j(\varphi)\ln p_j(\varphi)\leq 0$.
+    
+    $$
+    \begin{aligned}
+        \sum_{j=1}^{d_A}p_j(\varphi)(1+\ln p_j(\varphi))^2&=\sum_{j=1}^{d_A}p_j(\varphi)(1+2\ln p_j(\varphi)+(\ln p_j(\varphi))^2)\\
+        &=1+2\sum_{j=1}^{d_A} p_j(\varphi)\ln p_j(\varphi)+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2\\
+        &\leq 1+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2\\
+    \end{aligned}
+    $$
+
+    Thus,
+    $$
+    \begin{aligned}
+    \nabla g\cdot \nabla g&\leq \frac{4}{(\ln 2)^2}[1+\sum_{j=1}^{d_A}p_j(\varphi)(\ln p_j(\varphi))^2]\\
+    &\leq \frac{4}{(\ln 2)^2}[1+(\ln d_A)^2]\\
+    &\leq 8(\log_2 d_A)^2
+    \end{aligned}
+    $$
+
+    Proving $\sum_{j=1}^{d_A} p_j(\varphi)(\ln p_j(\varphi))^2\leq (\ln d_A)^2$ for $d_A\geq 3$ takes some effort and we will continue that later.
+
+    Consider any two unit vectors $\ket{\varphi}$ and $\ket{\psi}$, assume $S(\varphi_A)\leq S(\psi_A)$. If we choose the measurement $M$ to be along the eigenbasis of $\varphi_A$, $H(M(\varphi_A))=S(\varphi_A)$ and we have
+
+    $$
+    S(\psi_A)-S(\varphi_A)\leq H(M(\psi_A))-H(M(\varphi_A))\leq \eta\|\ket{\psi}-\ket{\varphi}\|
+    $$
+
+    Thus the Lipschitz constant of $S(\varphi_A)$ is upper bounded by $\sqrt{8}\log_2(d_A)$.
+\end{proof}
+
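+Combining Levy's lemma with Page's lemma (Lemma~\ref{pages_lemma}) and the Lipschitz bound above, we obtain the following concentration inequality.
+
+If we define $\beta=\frac{1}{\ln(2)}\frac{d_A}{d_B}$, then we have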
+
+$$
+\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta] \leq \exp\left(-\frac{1}{8\pi^2\ln(2)}\frac{(d_Ad_B-1)\alpha^2}{(\log_2(d_A))^2}\right)
+$$
+
+where $d_B\geq d_A\geq 3$~\cite{Hayden_2006}.
+
+Experimentally, we can have the following result:
+
+As the dimension of the Hilbert space increases, the chance of getting an almost maximally entangled state increases (see Figure~\ref{fig:entropy_vs_dA}).
+
+\begin{figure}[h]
+	\centering
+	\includegraphics[width=0.8\textwidth]{entropy_vs_dA.png}
+	\caption{Entropy vs $d_A$}
+	\label{fig:entropy_vs_dA}
+\end{figure}
+
+% When compiled standalone, print this chapter's references at the end.
+\ifSubfilesClassLoaded{
+  \printbibliography[title={References for Chapter 1}]
+}
+
+\end{document}
diff --git a/latex/chapters/chap2.pdf b/latex/chapters/chap2.pdf
new file mode 100644
index 0000000..d998479
Binary files /dev/null and b/latex/chapters/chap2.pdf differ
diff --git a/chapters/chap2.tex b/latex/chapters/chap2.tex
similarity index 86%
rename from chapters/chap2.tex
rename to latex/chapters/chap2.tex
index 3f1cc72..d2a828c 100644
--- a/chapters/chap2.tex
+++ b/latex/chapters/chap2.tex
@@ -1,266 +1,244 @@
-% chapters/chap2.tex
-\documentclass[../main.tex]{subfiles}
-
-\ifSubfilesClassLoaded{
-  \addbibresource{../main.bib}
-}
-
-\begin{document}
-
-\chapter{Levy's family and observable diameters}
-
-In this section, we will explore how the results from Hayden's concentration of measure theorem can be understood in terms of observable diameters from Gromov's perspective and what properties it reveals for entropy functions.
-
-We will try to use the results from previous sections to estimate the observable diameter for complex projective spaces.
-
-\section{Observable diameters}
-
-Recall from previous sections, an arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does.
-
-\begin{defn}
-  \label{defn:mm-space}
-
-  Let $X$ be a topological space with the following:
-
-  \begin{enumerate}
-    \item $X$ is a complete (every Cauchy sequence converges)
-    \item $X$ is a metric space with metric $d_X$
-    \item $X$ has a Borel probability measure $\mu_X$
-  \end{enumerate}
-
-  Then $(X,d_X,\mu_X)$ is a \textbf{metric measure space}.
-\end{defn}
-
-\begin{defn}
-  \label{defn:diameter}
-
-  Let $(X,d_X)$ be a metric space. The \textbf{diameter} of a set $A\subset X$ is defined as
-  $$
-  \diam(A)=\sup_{x,y\in A}d_X(x,y).
-  $$
-\end{defn}
-
-\begin{defn}
-  \label{defn:partial-diameter}
-
-  Let $(X,d_X,\mu_X)$ be a metric measure space, For any real number $\alpha\leq 1$, the \textbf{partial diameter} of $X$ is defined as
-  $$
-  \diam(A;\alpha)=\inf_{A\subseteq X|\mu_X(A)\geq \alpha}\diam(A).
-  $$
-\end{defn}
-
-This definition generalize the relation between the measure and metric in the metric-measure space. Intuitively, the space with smaller partial diameter can take more volume with the same diameter constrains.
-
-However, in higher dimensions, the volume may tend to concentrates more around a small neighborhood of the set, as we see in previous chapters with high dimensional sphere as example. We can safely cut $\kappa>0$ volume to significantly reduce the diameter, this yields better measure for concentration for shapes in spaces with high dimension.
-
-
-\begin{defn}
-  \label{defn:observable-diameter}
-  Let $X$ be a metric-measure space, $Y$ be a metric space, and $f:X\to Y$ be a 1-Lipschitz function. Then $f_*\mu_X=\mu_Y$ is a push forward measure on $Y$.
-  
-  For any real number $\kappa>0$, the \textbf{$\kappa$-observable diameter with screen $Y$} is defined as
-
-  $$
-  \obdiam_Y(X;\kappa)=\sup\{\diam(f_*\mu_X;1-\kappa)\}
-  $$
-
-  And the \textbf{obbservable diameter with screen $Y$} is defined as
-
-  $$
-  \obdiam_Y(X)=\inf_{\kappa>0}\max\{\obdiam_Y(X;\kappa)\}
-  $$
-
-  If $Y=\R$, we call it the \textbf{observable diameter}.
-
-\end{defn}
-
-If we collapse it naively via
-$$
-  \inf_{\kappa>0}\obdiam_Y(X;\kappa),
-$$
-we typically get something degenerate: as $\kappa\to 1$, the condition ``mass $\ge 1-\kappa$'' becomes almost empty space, so $\diam(\nu;1-\kappa)$ can be forced to be $0$ (take a tiny set of positive mass), and hence the infimum tends to $0$ for essentially any non-atomic space.
-
-This is why one either:
-\begin{enumerate}
-  \item keeps $\obdiam_Y(X;\kappa)$ as a \emph{function of $\kappa$} (picking $\kappa$ to be small but not $0$), or
-  \item if one insists on a single number, balances ``spread'' against ``exceptional mass'' by defining $\obdiam_Y(X)=\inf_{\kappa>0}\max\{\obdiam_Y(X;\kappa),\kappa\}$ as above.
-\end{enumerate}
-The point of the $\max\{\cdot,\kappa\}$ is that it prevents cheating by taking $\kappa$ close to $1$: if $\kappa$ is large then the maximum is large regardless of how small $\obdiam_Y(X;\kappa)$ is, so the infimum is forced to occur where the exceptional mass and the observable spread are small.
-
-Few additional proposition in \cite{shioya2014metricmeasuregeometry} will help us to estimate the observable diameter for complex projective spaces.
-
-\begin{prop}
-  \label{prop:observable-diameter-domination}
-  Let $X$ and $Y$ be two metric-measure spaces and $\kappa>0$, and let $f:Y\to X$ be a 1-Lipschitz function ($Y$ dominates $X$, denoted as $X\prec Y$) then:
-
-  \begin{enumerate}
-    \item
-    $$
-    \diam(X,1-\kappa)\leq \diam(Y,1-\kappa)
-    $$
-    \item $\obdiam(X;-\kappa)\leq \diam(X;1-\kappa)$, and $\obdiam(X)$ is finite.
-    \item
-    $$
-    \obdiam(X;-\kappa)\leq \obdiam(Y;-\kappa)
-    $$
-  \end{enumerate}
-\end{prop}
-
-\begin{proof}
-  Since $f$ is 1-Lipschitz, we have $f_*\mu Y=\mu_X$. Let $A$ be any Borel set of $Y$ with $\mu_Y(A)\geq 1-\kappa$ and $\overline{f(A)}$ be the closure of $f(A)$ in $X$. We have $\mu_X(\overline{f(A)})=\mu_Y(f^{-1}(\overline{f(A)}))\geq \mu_Y(A)\geq 1-\kappa$ and by the 1-lipschitz property, $\diam(\overline{f(A)})\leq \diam(A)$, so $\diam(X;1-\kappa)\leq \diam(A)\leq \diam(Y;1-\kappa)$.
-
-  Let $g:X\to \R$ be any 1-lipschitz function, since $(\R,|\cdot|,g_*\mu_X)$ is dominated by $X$, $\diam(\R;1-\kappa)\leq \diam(X;1-\kappa)$. Therefore, $\obdiam(X;-\kappa)\leq \diam(X;1-\kappa)$.
- 
-  and
-  $$
-  \diam(g_*\mu_X;1-\kappa)\leq \diam((f\circ g)_*\mu_Y;1-\kappa)\leq \obdiam(Y;1-\kappa)
-  $$
-\end{proof}
-
-\begin{prop}
-  \label{prop:observable-diameter-scale}
-  Let $X$ be an metric-measure space. Then for any real number $t>0$, we have
-  
-  $$
-  \obdiam(tX;-\kappa)=t\obdiam(X;-\kappa)
-  $$
-
-  Where $tX=(X,tdX,\mu X)$.
-\end{prop}
-
-\begin{proof}
-  $$
-  \begin{aligned}
-    \obdiam(tX;-\kappa)&=\sup\{\diam(f_*\mu_X;1-\kappa)|f:tX\to \R \text{ is 1-Lipschitz}\}\\
-    &=\sup\{\diam(f_*\mu_X;1-\kappa)|t^{-1}f:X\to \R \text{ is 1-Lipschitz}\}\\
-    &=\sup\{\diam((tg)_*\mu_X;1-\kappa)|g:X\to \R \text{ is 1-Lipschitz}\}\\
-    &=t\sup\{\diam(g_*\mu_X;1-\kappa)|g:X\to \R \text{ is 1-Lipschitz}\}\\
-    &=t\obdiam(X;-\kappa)
-  \end{aligned}
-  $$
-\end{proof}
-
-\subsection{Observable diameter for class of spheres}
-
-In this section, we will try to use the results from previous sections to estimate the observable diameter for class of spheres.
-
-\begin{theorem}
-  \label{thm:observable-diameter-sphere}
-  For any real number $\kappa$ with $0<\kappa<1$, we have
-  $$
-  \obdiam(S^n(1);-\kappa)=O(\sqrt{n})
-  $$
-\end{theorem}
-
-\begin{proof}
-  First, recall that by maxwell boltzmann distribution, we have that for any $n>0$, let $I(r)$ denote the measure of standard gaussian measure on the interval $[0,r]$. Then we have
-  
-  $$
-  \begin{aligned}
-    \lim_{n\to \infty} \obdiam(S^n(\sqrt{n});-\kappa)&=\lim_{n\to \infty} \sup\{\diam((\pi_{n,k})_*\sigma^n;1-\kappa)|\pi_{n,k} \text{ is 1-Lipschitz}\}\\
-    &=\lim_{n\to \infty} \sup\{\diam(\gamma^1;1-\kappa)|\gamma^1 \text{ is the standard gaussian measure}\}\\
-    &=\diam(\gamma^1;1-\kappa)\\
-    &=2I^{-1}(\frac{1-\kappa}{2})\text { cutting the extremum for normal distribution}\\
-  \end{aligned}
-  $$
-
-  By proposition \ref{prop:observable-diameter-scale}, we have
-
-  $$
-  \obdiam(S^n(\sqrt{n});-\kappa)=\sqrt{n}\obdiam(S^n(1);-\kappa)
-  $$
-
-  So $\obdiam(S^n(1);-\kappa)=\sqrt{n}(2I^{-1}(\frac{1-\kappa}{2}))=O(\sqrt{n})$.
-\end{proof}
-
-From the previous discussion, we see that the only remaining for finding observable diameter of $\C P^n$ is to find the lipchitz function that is isometric with consistent push-forward measure.
-
-To find such metric, we need some additional results.
-
-\begin{defn}
-  \label{defn:riemannian-metric}
-
-  Let $M$ be a smooth manifold. A \textit{\textbf{Riemannian metric}} on $M$ is a smooth covariant tensor field $g\in \mathcal{T}^2(M)$ such that for each $p\in M$, $g_p$ is an inner product on $T_pM$.
-
-  $g_p(v,v)\geq 0$ for each $p\in M$ and each $v\in T_pM$. equality holds if and only if $v=0$.
-
-\end{defn}
-
-TODO: There is a hidden chapter on group action on manifolds, can you find that?
-
-\begin{theorem}
-  \label{theorem:riemannian-submersion}
-
-  Let $(\tilde{M},\tilde{g})$ be a Riemannian manifold, let $\pi:\tilde{M}\to M$ be a surjective smooth submersion, and let $G$ be a group acting on $\tilde{M}$. If the \textbf{action} is
-  \begin{enumerate}
-    \item isometric: the map $x\mapsto \varphi\cdot x$ is an isometry for each $\varphi\in G$.
-    \item vertical: every element $\varphi\in G$ takes each fiber to itself, that is $\pi(\varphi\cdot p)=\pi(p)$ for all $p\in \tilde{M}$.
-    \item transitive on fibers: for each $p,q\in \tilde{M}$ such that $\pi(p)=\pi(q)$, there exists $\varphi\in G$ such that $\varphi\cdot p = q$.
-  \end{enumerate}
-  Then there is a unique Riemannian metric on $M$ such that $\pi$ is a Riemannian submersion.
-
-\end{theorem}
-
-A natural measure for $\C P^n$ is the normalized volume measure on $\C P^n$ induced from the Fubini-Study metric. \cite{lee_introduction_2018} Example 2.30
-
-\begin{defn}
-  \label{defn:fubini-study-metric}
-
-  Let $n$ be a positive integer, and consider the complex projective space $\C P^n$ defined as the quotient space of $\C^{n+1}$ by the equivalence relation $z\sim z'$ if there exists $\lambda \in \C$ such that $z=\lambda z'$. The map $\pi:\C^{n+1}\setminus\{0\}\to \C P^n$ sending each point in $\C^{n+1}\setminus\{0\}$ to its span is surjective smooth submersion.
-
-  Identifying $\C^{n+1}$ with $\R^{2n+2}$ with its Euclidean metric, we can view the unit sphere $S^{2n+1}$ with its roud metric $\mathring{g}$ as an embedded Riemannian submanifold of $\C^{n+1}\setminus\{0\}$. Let $p:S^{2n+1}\to \C P^n$ denote the restriction of the map $\pi$. Then $p$ is smooth, and its is surjective, because every 1-dimensional complex subspace contains elements of unit norm.
-
-\end{defn}
-
-There are many additional properties for such construction, we will check them just for curiosity.
-
-We need to show that it is a submersion.
-
-\begin{proof}
-   Let $z_0\in S^{2n+1}$ and set $\zeta_0=p(z_0)\in \C P^n$. Since $\pi$ is a smooth submersion, it has a smooth local section $\sigma: U\to \C^{n+1}$ defined on a neighborhood $U$ of $\zeta_0$ and satisfying $\sigma(\zeta_0)=z_0$ by the local section theorem (Theorem \ref{theorem:local_section_theorem}). Let $v:\C^{n+1}\setminus\{0\}\to S^{2n+1}$ be the radial projection on to the sphere:
-
-   $$
-   v(z)=\frac{z}{|z|}
-   $$
-
-   Since dividing an element of $\C^{n+1}$ by a nonzero scalar does not change its span, it follows that $p\circ v=\pi$. Therefore, if we set $\tilde{\sigma}=v\circ \sigma$, then $\tilde{\sigma}$ is a smooth local section of $p$. Apply the local section theorem (Theorem \ref{theorem:local_section_theorem}) again, this shows that $p$ is a submersion.
-
-   Define an action of $S^1$ on $S^{2n+1}$ by complex multiplication:
-
-   $$
-   \lambda (z^1,z^2,\ldots,z^{n+1})=(\lambda z^1,\lambda z^2,\ldots,\lambda z^{n+1})
-   $$
-
-   for $\lambda\in S^1$ (viewed as complex number of norm 1) and $z=(z^1,z^2,\ldots,z^{n+1})\in S^{2n+1}$. This is easily seen to be isometric, vertical, and transitive on fibers of $p$.
-
-   By (Theorem \ref{theorem:riemannian-submersion}). Therefore, there is a unique metric on $\C P^n$ such that the map $p:S^{2n+1}\to \C P^n$ is a Riemannian submersion. This metric is called the Fubini-study metric.
-\end{proof}
-
-\subsection{Observable diameter for complex projective spaces}
-
-Using the projection map and Hopf's fibration, we can estimate the observable diameter for complex projective spaces from the observable diameter of spheres.
-
-\begin{theorem}
-  \label{thm:observable-diameter-complex-projective-space}
-  For any real number $\kappa$ with $0<\kappa<1$, we have
-  $$
-  \obdiam(\mathbb{C}P^n(1);-\kappa)\leq O(\sqrt{n})
-  $$
-\end{theorem}
-
-\begin{proof}
-  Recall from Example 2.30 in \cite{lee_introduction_2018}, the Hopf fibration $f_n:S^{2n+1}(1)\to \C P^n$ is 1-Lipschitz continuous with respect to the Fubini-Study metric on $\C P^n$. and the push-forward $(f_n)_*\sigma^{2n+1}$ coincides with the normalized volume measure on $\C P^n$ induced from the Fubini-Study metric.
-
-  By proposition \ref{prop:observable-diameter-domination}, we have $\obdiam(\mathbb{C}P^n(1);-\kappa)\leq \obdiam(S^{2n+1}(1);-\kappa)\leq O(\sqrt{n})$.
-
-\end{proof}
-
-\section{Example for concentration of measure and observable diameter}
-
-In this section, we wish to use observable diameter to estimate the statics of thermal dynamics of some classical systems.
-
-\ifSubfilesClassLoaded{
-  \printbibliography[title={References}]
-}
-
-\end{document}
+% chapters/chap2.tex
+\documentclass[../main.tex]{subfiles}
+
+\ifSubfilesClassLoaded{
+  \addbibresource{../main.bib}
+}
+
+\begin{document}
+
+\chapter{Levy's family and observable diameters}
+
+In this section, we will explore how the results from Hayden's concentration of measure theorem can be understood in terms of observable diameters from Gromov's perspective and what properties it reveals for entropy functions.
+
+We will try to use the results from previous sections to estimate the observable diameter for complex projective spaces.
+
+\section{Observable diameters}
+
+Recall from previous sections, an arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does.
+
+\begin{defn}
+  \label{defn:mm-space}
+
+  Let $X$ be a topological space with the following:
+
+  \begin{enumerate}
+    \item $X$ is complete (every Cauchy sequence converges)
+    \item $X$ is a metric space with metric $d_X$
+    \item $X$ has a Borel probability measure $\mu_X$
+  \end{enumerate}
+
+  Then $(X,d_X,\mu_X)$ is a \textbf{metric measure space}.
+\end{defn}
+
+\begin{defn}
+  \label{defn:diameter}
+
+  Let $(X,d_X)$ be a metric space. The \textbf{diameter} of a set $A\subset X$ is defined as
+  $$
+  \diam(A)=\sup_{x,y\in A}d_X(x,y).
+  $$
+\end{defn}
+
+\begin{defn}
+  \label{defn:partial-diameter}
+
+  Let $(X,d_X,\mu_X)$ be a metric measure space. For any real number $\alpha\leq 1$, the \textbf{partial diameter} of $X$ is defined as
+  $$
+  \diam(X;\alpha)=\inf\{\diam(A)\mid A\subseteq X \text{ Borel},\ \mu_X(A)\geq \alpha\}.
+  $$
+\end{defn}
+
+This definition generalizes the relation between the measure and the metric in a metric-measure space. Intuitively, a space with smaller partial diameter can fit more volume within the same diameter constraint.
+
+However, in higher dimensions, the volume tends to concentrate around a small neighborhood of a set, as we saw in previous chapters with the high-dimensional sphere as an example. We can safely discard a set of volume $\kappa>0$ to significantly reduce the diameter; this yields a better measure of concentration for shapes in high-dimensional spaces.
+
+
+\begin{defn}
+  \label{defn:observable-diameter}
+  Let $X$ be a metric-measure space, $Y$ be a metric space, and $f:X\to Y$ be a 1-Lipschitz function. Then $f_*\mu_X=\mu_Y$ is a push forward measure on $Y$.
+  
+  For any real number $\kappa>0$, the \textbf{$\kappa$-observable diameter with screen $Y$} is defined as
+
+  $$
+  \obdiam_Y(X;\kappa)=\sup\{\diam(f_*\mu_X;1-\kappa)\mid f:X\to Y \text{ is 1-Lipschitz}\}
+  $$
+
+  The \textbf{observable diameter with screen $Y$} is then defined as
+
+  $$
+  \obdiam_Y(X)=\inf_{\kappa>0}\max\{\obdiam_Y(X;\kappa),\kappa\}
+  $$
+
+  If $Y=\R$, we call it the \textbf{observable diameter}.
+
+\end{defn}
+
+If we collapse it naively via
+$$
+  \inf_{\kappa>0}\obdiam_Y(X;\kappa),
+$$
+we typically get something degenerate: as $\kappa\to 1$, the condition ``mass $\ge 1-\kappa$'' becomes almost vacuous, so $\diam(\nu;1-\kappa)$ can be made arbitrarily small (take a tiny set of positive mass), and hence the infimum tends to $0$ for essentially any non-atomic space.
+
+This is why one either:
+\begin{enumerate}
+  \item keeps $\obdiam_Y(X;\kappa)$ as a \emph{function of $\kappa$} (picking $\kappa$ to be small but not $0$), or
+  \item if one insists on a single number, balances ``spread'' against ``exceptional mass'' by defining $\obdiam_Y(X)=\inf_{\kappa>0}\max\{\obdiam_Y(X;\kappa),\kappa\}$ as above.
+\end{enumerate}
+The point of the $\max\{\cdot,\kappa\}$ is that it prevents cheating by taking $\kappa$ close to $1$: if $\kappa$ is large then the maximum is large regardless of how small $\obdiam_Y(X;\kappa)$ is, so the infimum is forced to occur where the exceptional mass and the observable spread are small.
+
+A few additional propositions from \cite{shioya2014metricmeasuregeometry} will help us estimate the observable diameter of complex projective spaces.
+
+\begin{prop}
+  \label{prop:observable-diameter-domination}
+  Let $X$ and $Y$ be two metric-measure spaces and $\kappa>0$, and let $f:Y\to X$ be a 1-Lipschitz function with $f_*\mu_Y=\mu_X$ (that is, $Y$ dominates $X$, denoted $X\prec Y$). Then:
+
+  \begin{enumerate}
+    \item
+    $
+    \diam(X;1-\kappa)\leq \diam(Y;1-\kappa)
+    $
+    \item $\obdiam(X;-\kappa)\leq \diam(X;1-\kappa)$, and $\obdiam(X)$ is finite.
+    \item
+    $
+    \obdiam(X;-\kappa)\leq \obdiam(Y;-\kappa)
+    $
+  \end{enumerate}
+\end{prop}
+
+\begin{proof}
+  Since $Y$ dominates $X$ via $f$, we have $f_*\mu_Y=\mu_X$. Let $A$ be any Borel set of $Y$ with $\mu_Y(A)\geq 1-\kappa$ and let $\overline{f(A)}$ be the closure of $f(A)$ in $X$. We have $\mu_X(\overline{f(A)})=\mu_Y(f^{-1}(\overline{f(A)}))\geq \mu_Y(A)\geq 1-\kappa$ and, by the 1-Lipschitz property, $\diam(\overline{f(A)})\leq \diam(A)$, so $\diam(X;1-\kappa)\leq \diam(\overline{f(A)})\leq \diam(A)$. Taking the infimum over all such $A$ gives $\diam(X;1-\kappa)\leq \diam(Y;1-\kappa)$.
+
+  Let $g:X\to \R$ be any 1-Lipschitz function. Since $(\R,|\cdot|,g_*\mu_X)$ is dominated by $X$, the first part gives $\diam(g_*\mu_X;1-\kappa)\leq \diam(X;1-\kappa)$. Taking the supremum over all such $g$, we get $\obdiam(X;-\kappa)\leq \diam(X;1-\kappa)$.
+ 
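+  Finally, the third claim follows by taking the supremum over $g$ in the following estimate, which holds because $g\circ f:Y\to \R$ is 1-Lipschitz and $(g\circ f)_*\mu_Y=g_*\mu_X$:
+  $$
+  \diam(g_*\mu_X;1-\kappa)=\diam((g\circ f)_*\mu_Y;1-\kappa)\leq \obdiam(Y;-\kappa).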
+  $$
+\end{proof}
+
+\begin{prop}
+  \label{prop:observable-diameter-scale}
+  Let $X$ be a metric-measure space and $\kappa>0$. Then for any real number $t>0$, we have
+  
+  $$
+  \obdiam(tX;-\kappa)=t\obdiam(X;-\kappa)
+  $$
+
+  where $tX=(X,t\,d_X,\mu_X)$ denotes $X$ with its metric scaled by $t$.
+\end{prop}
+
+\begin{proof}
+  $$
+  \begin{aligned}
+    \obdiam(tX;-\kappa)&=\sup\{\diam(f_*\mu_X;1-\kappa)|f:tX\to \R \text{ is 1-Lipschitz}\}\\
+    &=\sup\{\diam(f_*\mu_X;1-\kappa)|t^{-1}f:X\to \R \text{ is 1-Lipschitz}\}\\
+    &=\sup\{\diam((tg)_*\mu_X;1-\kappa)|g:X\to \R \text{ is 1-Lipschitz}\}\\
+    &=t\sup\{\diam(g_*\mu_X;1-\kappa)|g:X\to \R \text{ is 1-Lipschitz}\}\\
+    &=t\obdiam(X;-\kappa)
+  \end{aligned}
+  $$
+\end{proof}
+
+\subsection{Observable diameter for class of spheres}
+
+In this section, we will try to use the results from previous sections to estimate the observable diameter for class of spheres.
+
+\begin{theorem}
+  \label{thm:observable-diameter-sphere}
+  For any real number $\kappa$ with $0<\kappa<1$, we have
+  $$
+  \obdiam(S^n(1);-\kappa)=O(n^{-1/2})
+  $$
+\end{theorem}
+
+\begin{proof}
+  First, recall the Maxwell--Boltzmann distribution law, and for $r\geq 0$ let $I(r)$ denote the standard Gaussian measure of the interval $[0,r]$. Then we have
+  
+  $$
+  \begin{aligned}
+    \lim_{n\to \infty} \obdiam(S^n(\sqrt{n});-\kappa)&=\lim_{n\to \infty} \sup\{\diam((\pi_{n,k})_*\sigma^n;1-\kappa)|\pi_{n,k} \text{ is 1-Lipschitz}\}\\
+    &=\lim_{n\to \infty} \sup\{\diam(\gamma^1;1-\kappa)|\gamma^1 \text{ is the standard gaussian measure}\}\\
+    &=\diam(\gamma^1;1-\kappa)\\
+    &=2I^{-1}(\frac{1-\kappa}{2})\quad\text{(cutting off both tails of the normal distribution)}
+  \end{aligned}
+  $$
+
+  By proposition \ref{prop:observable-diameter-scale}, we have
+
+  $$
+  \obdiam(S^n(\sqrt{n});-\kappa)=\sqrt{n}\obdiam(S^n(1);-\kappa)
+  $$
+
+  So $\obdiam(S^n(1);-\kappa)=\frac{1}{\sqrt{n}}\obdiam(S^n(\sqrt{n});-\kappa)=\frac{1}{\sqrt{n}}\left(2I^{-1}(\frac{1-\kappa}{2})+o(1)\right)=O(n^{-1/2})$.
+\end{proof}
+
+From the previous discussion, we see that the only remaining step for estimating the observable diameter of $\C P^n$ is to find a 1-Lipschitz map from a sphere onto $\C P^n$ with a consistent push-forward measure.
+
+To find such a metric, we need some additional results from previous sections.
+
+A natural measure for $\C P^n$ is the normalized volume measure on $\C P^n$ induced from the Fubini-Study metric (see \cite[Example 2.30]{lee_introduction_2018}).
+
+\begin{defn}
+  \label{defn:fubini-study-metric}
+
+  Let $n$ be a positive integer, and consider the complex projective space $\C P^n$ defined as the quotient space of $\C^{n+1}\setminus\{0\}$ by the equivalence relation $z\sim z'$ if there exists $\lambda \in \C\setminus\{0\}$ such that $z=\lambda z'$. The map $\pi:\C^{n+1}\setminus\{0\}\to \C P^n$ sending each point in $\C^{n+1}\setminus\{0\}$ to its span is a surjective smooth submersion.
+
+  Identifying $\C^{n+1}$ with $\R^{2n+2}$ with its Euclidean metric, we can view the unit sphere $S^{2n+1}$ with its round metric $\mathring{g}$ as an embedded Riemannian submanifold of $\C^{n+1}\setminus\{0\}$. Let $p:S^{2n+1}\to \C P^n$ denote the restriction of the map $\pi$. Then $p$ is smooth, and it is surjective, because every 1-dimensional complex subspace contains elements of unit norm.
+
+\end{defn}
+
+There are many additional properties of this construction; we will check them just out of curiosity.
+
+We need to show that it is a submersion.
+
+\begin{proof}
+   Let $z_0\in S^{2n+1}$ and set $\zeta_0=p(z_0)\in \C P^n$. Since $\pi$ is a smooth submersion, it has a smooth local section $\sigma: U\to \C^{n+1}\setminus\{0\}$ defined on a neighborhood $U$ of $\zeta_0$ and satisfying $\sigma(\zeta_0)=z_0$ by the local section theorem (Theorem \ref{theorem:local_section_theorem}). Let $v:\C^{n+1}\setminus\{0\}\to S^{2n+1}$ be the radial projection onto the sphere:
+
+   $$
+   v(z)=\frac{z}{|z|}
+   $$
+
+   Since dividing an element of $\C^{n+1}\setminus\{0\}$ by a nonzero scalar does not change its span, it follows that $p\circ v=\pi$. Therefore, if we set $\tilde{\sigma}=v\circ \sigma$, then $\tilde{\sigma}$ is a smooth local section of $p$. Applying the local section theorem (Theorem \ref{theorem:local_section_theorem}) again shows that $p$ is a submersion.
+
+   Define an action of $S^1$ on $S^{2n+1}$ by complex multiplication:
+
+   $$
+   \lambda (z^1,z^2,\ldots,z^{n+1})=(\lambda z^1,\lambda z^2,\ldots,\lambda z^{n+1})
+   $$
+
+   for $\lambda\in S^1$ (viewed as a complex number of norm 1) and $z=(z^1,z^2,\ldots,z^{n+1})\in S^{2n+1}$. This action is easily seen to be isometric, vertical, and transitive on fibers of $p$.
+
+   Therefore, by Theorem \ref{theorem:riemannian-submersion}, there is a unique Riemannian metric on $\C P^n$ such that the map $p:S^{2n+1}\to \C P^n$ is a Riemannian submersion. This metric is called the Fubini-Study metric.
+\end{proof}
+
+\subsection{Observable diameter for complex projective spaces}
+
+Using the projection map and Hopf's fibration, we can estimate the observable diameter for complex projective spaces from the observable diameter of spheres.
+
+\begin{theorem}
+  \label{thm:observable-diameter-complex-projective-space}
+  For any real number $\kappa$ with $0<\kappa<1$, we have
+  $$
+  \obdiam(\mathbb{C}P^n(1);-\kappa)\leq O(n^{-1/2})
+  $$
+\end{theorem}
+
+\begin{proof}
+  Recall from Example 2.30 in \cite{lee_introduction_2018} that the Hopf fibration $f_n:S^{2n+1}(1)\to \C P^n$ is 1-Lipschitz continuous with respect to the Fubini-Study metric on $\C P^n$, and the push-forward $(f_n)_*\sigma^{2n+1}$ coincides with the normalized volume measure on $\C P^n$ induced from the Fubini-Study metric.
+
+  By proposition \ref{prop:observable-diameter-domination}, we have $\obdiam(\mathbb{C}P^n(1);-\kappa)\leq \obdiam(S^{2n+1}(1);-\kappa)$. By proposition \ref{prop:observable-diameter-scale} and the Gaussian limit for $S^{m}(\sqrt{m})$ with $m=2n+1$, the right-hand side equals $\frac{1}{\sqrt{2n+1}}\obdiam(S^{2n+1}(\sqrt{2n+1});-\kappa)=O(n^{-1/2})$.
+
+\end{proof}
+
+\section{Using the entropy function as an estimator of the observable diameter of complex projective spaces}
+
+In this section, we wish to use the observable diameter to estimate the statistics of the thermodynamics of some classical systems.
+
+
+
+\ifSubfilesClassLoaded{
+  \printbibliography[title={References}]
+}
+
+\end{document}
diff --git a/chapters/chap3.pdf b/latex/chapters/chap3.pdf
similarity index 100%
rename from chapters/chap3.pdf
rename to latex/chapters/chap3.pdf
diff --git a/chapters/chap3.tex b/latex/chapters/chap3.tex
similarity index 91%
rename from chapters/chap3.tex
rename to latex/chapters/chap3.tex
index 074bc83..a2ffaad 100644
--- a/chapters/chap3.tex
+++ b/latex/chapters/chap3.tex
@@ -1,52 +1,57 @@
-% chapters/chap2.tex
-\documentclass[../main.tex]{subfiles}
-
-\ifSubfilesClassLoaded{
-  \addbibresource{../main.bib}
-}
-
-\begin{document}
-
-\chapter{Seigel-Bargmann Space}
-
-In this chapter, we will collect ideas and other perspective we have understanding the concentration of measure phenomenon. Especially with symmetric product of $\C P^1$ and see how it relates to Riemman surfaces and Seigel-Bargmann spaces.
-
-\begin{figure}[h]
-    \centering
-    \begin{tikzpicture}[node distance=40mm, thick,
-        main/.style={draw, draw=white},
-        towards/.style={->},
-        towards_imp/.style={<->,red},
-        mutual/.style={<->}
-        ]
-        \node[main] (cp) {$\mathbb{C}P^{n}$};
-        \node[main] (c) [below of=cp] {$\mathbb{C}^{n+1}$};
-        \node[main] (p) [right of=cp] {$\mathbb{P}^n$};
-        \node[main] (sym) [below of=p] {$\operatorname{Sym}_n(\mathbb{C}P^1)$};
-        % draw edges
-        \draw[towards] (c) -- (cp) node[midway, left] {$z\sim \lambda z$};
-        \draw[towards] (c) -- (p) node[midway, fill=white] {$w(z)=\sum_{i=0}^n Z_i z^i$};
-        \draw[towards_imp] (cp) -- (p) node[midway, above] {$w(z)\sim w(\lambda z)$};
-        \draw[mutual] (p) -- (sym) node[midway, right] {root of $w(z)$};
-    \end{tikzpicture}
-    \caption{Majorana stellar representation}
-    \label{fig:majorana_stellar_representation}
-\end{figure}
-
-Basically, there is a bijection between the complex projective space $\mathbb{C}P^n$ and the set of roots of a polynomial of degree $n$.
-
-We can use a symmetric group of permutations of $n$ complex numbers (or $S^2$) to represent the $\mathbb{C}P^n$, that is, $\mathbb{C}P^n=S^2\times S^2\times \cdots \times S^2/S_n$.
-
-One might be interested in the random sampling over the $\operatorname{Sym}_n(\mathbb{C}P^1)$ and the concentration of measure phenomenon on that.
-
-\section{Majorana stellar representation of the quantum state}
-
-\section{Space of complex valued functions and pure states}
-
-
-
-\ifSubfilesClassLoaded{
-  \printbibliography[title={References for Chapter 2}]
-}
-
-\end{document}
+% chapters/chap3.tex
+\documentclass[../main.tex]{subfiles}
+
+\ifSubfilesClassLoaded{
+  \addbibresource{../main.bib}
+}
+
+\begin{document}
+
+\chapter{Segal-Bargmann Space}
+
+In this chapter, we will collect ideas and other perspectives for understanding the concentration of measure phenomenon, especially via the symmetric product of $\C P^1$, and see how it relates to Riemann surfaces and Segal-Bargmann spaces.
+
+\begin{figure}[h]
+    \centering
+    \begin{tikzpicture}[node distance=40mm, thick,
+        main/.style={draw, draw=white},
+        towards/.style={->},
+        towards_imp/.style={<->,red},
+        mutual/.style={<->}
+        ]
+        \node[main] (cp) {$\mathbb{C}P^{n}$};
+        \node[main] (c) [below of=cp] {$\mathbb{C}^{n+1}$};
+        \node[main] (p) [right of=cp] {$\mathbb{P}^n$};
+        \node[main] (sym) [below of=p] {$\operatorname{Sym}_n(\mathbb{C}P^1)$};
+        % draw edges
+        \draw[towards] (c) -- (cp) node[midway, left] {$z\sim \lambda z$};
+        \draw[towards] (c) -- (p) node[midway, fill=white] {$w(z)=\sum_{i=0}^n Z_i z^i$};
+        \draw[towards_imp] (cp) -- (p) node[midway, above] {$w(z)\sim w(\lambda z)$};
+        \draw[mutual] (p) -- (sym) node[midway, right] {root of $w(z)$};
+    \end{tikzpicture}
+    \caption{Majorana stellar representation}
+    \label{fig:majorana_stellar_representation}
+\end{figure}
+
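+Basically, there is a bijection between the complex projective space $\mathbb{C}P^n$ and the unordered $n$-tuples of roots (counted with multiplicity, with roots at $\infty$ when the degree drops below $n$) of nonzero polynomials of degree at most $n$.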
+
+Equivalently, we can represent $\mathbb{C}P^n$ as the quotient of $n$ copies of $S^2\cong\mathbb{C}P^1$ by the symmetric group $S_n$ permuting the factors, that is, $\mathbb{C}P^n\cong(S^2\times S^2\times \cdots \times S^2)/S_n$.
+
+One might be interested in the random sampling over the $\operatorname{Sym}_n(\mathbb{C}P^1)$ and the concentration of measure phenomenon on that.
+
+\section{Majorana stellar representation of the quantum state}
+
+\begin{defn}
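+    Let $n$ be a positive integer. The Majorana stellar representation of a quantum state $[Z_0:\cdots:Z_n]\in\mathbb{C}P^n$ is the unordered collection of the $n$ roots in $\mathbb{C}P^1\cong S^2$ (counted with multiplicity, including roots at $\infty$) of the polynomial $w(z)=\sum_{i=0}^n Z_i z^i$.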
+
+    
+\end{defn}
+\section{Space of complex valued functions and pure states}
+
+
+
+\ifSubfilesClassLoaded{
+  \printbibliography[title={References for Chapter 2}]
+}
+
+\end{document}
diff --git a/images/Filter_figure.png b/latex/images/Filter_figure.png
similarity index 100%
rename from images/Filter_figure.png
rename to latex/images/Filter_figure.png
diff --git a/images/Superdense_coding.png b/latex/images/Superdense_coding.png
similarity index 100%
rename from images/Superdense_coding.png
rename to latex/images/Superdense_coding.png
diff --git a/images/entropy_vs_dA.png b/latex/images/entropy_vs_dA.png
similarity index 100%
rename from images/entropy_vs_dA.png
rename to latex/images/entropy_vs_dA.png
diff --git a/images/entropy_vs_deviate.png b/latex/images/entropy_vs_deviate.png
similarity index 100%
rename from images/entropy_vs_deviate.png
rename to latex/images/entropy_vs_deviate.png
diff --git a/images/entropy_vs_dim.png b/latex/images/entropy_vs_dim.png
similarity index 100%
rename from images/entropy_vs_dim.png
rename to latex/images/entropy_vs_dim.png
diff --git a/images/maxwell.png b/latex/images/maxwell.png
similarity index 100%
rename from images/maxwell.png
rename to latex/images/maxwell.png
diff --git a/main.bib b/latex/main.bib
similarity index 100%
rename from main.bib
rename to latex/main.bib
diff --git a/main.pdf b/latex/main.pdf
similarity index 100%
rename from main.pdf
rename to latex/main.pdf
diff --git a/main.tex b/latex/main.tex
similarity index 96%
rename from main.tex
rename to latex/main.tex
index cabebc8..a8606b6 100644
--- a/main.tex
+++ b/latex/main.tex
@@ -1,112 +1,112 @@
-% main.tex
-\documentclass[11pt]{book}
-
-% --- Math + structure ---
-\usepackage{amsmath,amssymb,amsthm}
-\usepackage{hyperref}
-\usepackage{subfiles} % allows chapters to compile independently
-
-% --- Formatting ---
-\usepackage{fancyhdr,parskip}
-\usepackage{fullpage}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% add special notation supports
-\usepackage[mathscr]{euscript}
-\usepackage{mathtools}
-\usepackage{braket}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% add image package and directory
-\usepackage{graphicx}
-\usepackage{tikz}
-\graphicspath{{./images/}}
-% dependency graph
-\usetikzlibrary{trees,positioning,arrows.meta,backgrounds}
-% floating graph
-\usepackage{float}
-
-% --- Bibliography: biblatex + biber ---
-\usepackage[
-  backend=biber,
-  style=alphabetic,
-  sorting=nyt,
-  giveninits=true
-]{biblatex}
-
-% --- Beamer-like blocks (printer-friendly) ---
-\usepackage[most]{tcolorbox}
-\usepackage{xcolor}
-
-% A dedicated "Examples" block (optional convenience wrapper)
-\newtcolorbox{examples}[1][Example]{%
-  enhanced,
-  breakable,
-  colback=white,
-  colframe=black!90,
-  coltitle=white,            % title text color
-  colbacktitle=black!90,      % <<< grey 80 title bar
-  boxrule=0.6pt,
-  arc=1.5mm,
-  left=1.2mm,right=1.2mm,top=1.0mm,bottom=1.0mm,
-  fonttitle=\bfseries,
-  title=#1
-}
-
-
-% In the assembled book, we load *all* chapter bib files here,
-% and print one combined bibliography at the end.
-
-\addbibresource{main.bib}
-
-%%
-% Some convenient commands if you need to use integrals
-\newcommand{\is}{\hspace{2pt}}
-\newcommand{\dx}{\is dx}
-
-
-%%%%%%%%%%%%%%%%%%%%%%
-% These are commands you can use that will generate nice things in TeX.  Feel free to define your own, too.
-\newcommand{\Z}{\mathbb{Z}} % integers
-\newcommand{\Q}{\mathbb{Q}} % rationals
-\newcommand{\R}{\mathbb{R}} % reals
-\newcommand{\C}{\mathbb{C}} % complex numbers
-\newcommand{\ds}{\displaystyle} % invoke "display style", which makes fractions come out big, etc.
-\newcommand{\charac}{\operatorname{char}} % characteristic of a field
-\newcommand{\st}{\ensuremath{\,:\,}} % Makes the colon in set-builder notation space properly
-
-%%%%%%%%%%%%%%%%%%%%%%
-% These commands are for convenient notation for the concentration of measure theorem
-\newcommand{\obdiam}{\operatorname{ObserDiam}}
-\newcommand{\diam}{\operatorname{diam}}
-
-
-%%%%%%%%%%%%%%%%%%%%%%
-% These commands create theorem-like environments.
-\newtheorem{theorem}{Theorem}
-\newtheorem{lemma}[theorem]{Lemma}
-\newtheorem{prop}[theorem]{Proposition}
-\newtheorem{defn}[theorem]{Definition}
-
-\title{Concentration of Measure And Quantum Entanglement}
-\author{Zheyuan Wu}
-\date{\today}
-
-\begin{document}
-\frontmatter
-\maketitle
-\tableofcontents
-\mainmatter
-
-% Each chapter is in its own file and included as a subfile.
-% \subfile{preface}
-\subfile{chapters/chap0}
-\subfile{chapters/chap1}
-\subfile{chapters/chap2}
-% \subfile{chapters/chap3}
-
-\backmatter
-\cleardoublepage
-\printbibliography[title={References}]
-
-\end{document}
+% main.tex
+\documentclass[11pt]{book}
+
+% --- Math + structure ---
+\usepackage{amsmath,amssymb,amsthm}
+\usepackage{hyperref}
+\usepackage{subfiles} % allows chapters to compile independently
+
+% --- Formatting ---
+\usepackage{fancyhdr,parskip}
+\usepackage{fullpage}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% add special notation supports
+\usepackage[mathscr]{euscript}
+\usepackage{mathtools}
+\usepackage{braket}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% add image package and directory
+\usepackage{graphicx}
+\usepackage{tikz}
+\graphicspath{{./images/}}
+% dependency graph
+\usetikzlibrary{trees,positioning,arrows.meta,backgrounds}
+% floating graph
+\usepackage{float}
+
+% --- Bibliography: biblatex + biber ---
+\usepackage[
+  backend=biber,
+  style=alphabetic,
+  sorting=nyt,
+  giveninits=true
+]{biblatex}
+
+% --- Beamer-like blocks (printer-friendly) ---
+\usepackage[most]{tcolorbox}
+\usepackage{xcolor}
+
+% A dedicated "Examples" block (optional convenience wrapper)
+\newtcolorbox{examples}[1][Example]{%
+  enhanced,
+  breakable,
+  colback=white,
+  colframe=black!90,
+  coltitle=white,            % title text color
+  colbacktitle=black!90,      % title bar background: 90% black (near-black)
+  boxrule=0.6pt,
+  arc=1.5mm,
+  left=1.2mm,right=1.2mm,top=1.0mm,bottom=1.0mm,
+  fonttitle=\bfseries,
+  title=#1
+}
+
+
+% In the assembled book, we load *all* chapter bib files here,
+% and print one combined bibliography at the end.
+
+\addbibresource{main.bib}
+
+%%
+% Some convenient commands if you need to use integrals
+\newcommand{\is}{\hspace{2pt}}
+\newcommand{\dx}{\is dx}
+
+
+%%%%%%%%%%%%%%%%%%%%%%
+% These are commands you can use that will generate nice things in TeX.  Feel free to define your own, too.
+\newcommand{\Z}{\mathbb{Z}} % integers
+\newcommand{\Q}{\mathbb{Q}} % rationals
+\newcommand{\R}{\mathbb{R}} % reals
+\newcommand{\C}{\mathbb{C}} % complex numbers
+\newcommand{\ds}{\displaystyle} % invoke "display style", which makes fractions come out big, etc.
+\newcommand{\charac}{\operatorname{char}} % characteristic of a field
+\newcommand{\st}{\ensuremath{\,:\,}} % Makes the colon in set-builder notation space properly
+
+%%%%%%%%%%%%%%%%%%%%%%
+% These commands are for convenient notation for the concentration of measure theorem
+\newcommand{\obdiam}{\operatorname{ObserDiam}}
+\newcommand{\diam}{\operatorname{diam}}
+
+
+%%%%%%%%%%%%%%%%%%%%%%
+% These commands create theorem-like environments.
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}[theorem]{Lemma}
+\newtheorem{prop}[theorem]{Proposition}
+\newtheorem{defn}[theorem]{Definition}
+
+\title{Concentration of Measure And Quantum Entanglement}
+\author{Zheyuan Wu}
+\date{\today}
+
+\begin{document}
+\frontmatter
+\maketitle
+\tableofcontents
+\mainmatter
+
+% Each chapter is in its own file and included as a subfile.
+% \subfile{preface}
+\subfile{chapters/chap0}
+\subfile{chapters/chap1}
+\subfile{chapters/chap2}
+% \subfile{chapters/chap3}
+
+\backmatter
+\cleardoublepage
+\printbibliography[title={References}]
+
+\end{document}
diff --git a/preface.pdf b/latex/preface.pdf
similarity index 100%
rename from preface.pdf
rename to latex/preface.pdf
diff --git a/preface.tex b/latex/preface.tex
similarity index 98%
rename from preface.tex
rename to latex/preface.tex
index 035673e..c9c1b14 100644
--- a/preface.tex
+++ b/latex/preface.tex
@@ -1,86 +1,86 @@
-% preface.tex
-\documentclass[main.tex]{subfiles}
-
-\ifSubfilesClassLoaded{
-  \addbibresource{main.bib}
-}
-
-\begin{document}
-
-\chapter*{Preface}
-\addcontentsline{toc}{chapter}{Preface}
-
-Non-commutative probability theory is a branch of generalized probability theory that studies the probability of events in non-commutative algebras (e.g. the algebra of observables in quantum mechanics). In the 20th century, non-commutative probability theory has been applied to the study of quantum mechanics as the classical probability theory is not enough to describe quantum mechanics~\cite{kummer1998elements}.
-
-Recently, the concentration of measure phenomenon has been applied to the study of non-commutative probability theory. Basically, the non-trivial observation, citing from Gromov's work~\cite{MGomolovs}, states that an arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does. That is,
-
-$$
-\mu\{x\in S^n: |f(x)-a_0|\geq\epsilon\} < \kappa_n(\epsilon)\leq 2\exp\left(-\frac{(n-1)\epsilon^2}{2}\right)
-$$
-
-is applied to computing the probability that, given a bipartite system $A\otimes B$, assume $\dim(B)\geq \dim(A)\geq 3$, as the dimension of the smaller system $A$ increases, with very high probability, a random pure state $\sigma=|\psi\rangle\langle\psi|$ selected from $A\otimes B$ is almost as good as the maximally entangled state.
-
-Mathematically, that is:
-
-Let $\psi\in \mathcal{P}(A\otimes B)$ be a random pure state on $A\otimes B$.
-
-If we define $\beta=\frac{1}{\ln(2)}\frac{d_A}{d_B}$, then we have
-
-$$
-\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta] \leq \exp\left(-\frac{1}{8\pi^2\ln(2)}\frac{(d_Ad_B-1)\alpha^2}{(\log_2(d_A))^2}\right)
-$$
-
-where $d_B\geq d_A\geq 3$~\cite{Hayden_2006}.
-
-In this report, we will show the process of my exploration of the concentration of measure phenomenon in the context of non-commutative probability theory. We assume the reader is an undergraduate student in mathematics and is familiar with the basic concepts of probability theory, measure theory, linear algebra, and some basic skills of mathematical analysis. To make the report more self-contained, we will add detailed annotated proofs that I understand and references for the original sources.
-
-\section*{How to use the dependency graph}
-
-Since our topic integrates almost everything I've learned during undergraduate study, I will try to make some dependency graph for reader and for me to keep track of what are the necessary knowledge to understand part of the report.
-
-One can imagine the project as a big tree, where the root is in undergrad math and branches out to the topics of the report, including many advanced topics and motivation to study them.
-
-\bigskip
-
-% --- Dependency tree graph (TikZ) ---
-\begin{figure}[ht]
-\centering
-\begin{tikzpicture}[
-  node distance=10mm and 18mm,
-  box/.style={draw, rectangle, fill=white, align=center, inner sep=4pt},
-  arrow/.style={-Latex}
-]
-
-% \node[box] (lin) {Linear Algebra\\(bases, maps, eigenvalues)};
-% \node[box, right=of lin] (real) {Real Analysis\\(limits, continuity, measure-lite)};
-% \node[box, below=of lin] (prob) {Probability\\(expectation, variance, concentration)};
-% \node[box, below=of real] (top) {Topology/Geometry\\(metrics, compactness)};
-
-% \node[box, below=12mm of prob] (func) {Functional Analysis\\($L^p$, Hilbert spaces, operators)};
-% \node[box, below=12mm of top] (quant) {Quantum Formalism\\(states, observables, partial trace)};
-
-% \node[box, below=14mm of func, xshift=25mm] (book) {This Book\\(Chapters 1--n)};
-% % draw arrows behind nodes
-% \begin{scope}[on background layer]
-%   \draw[arrow] (lin) -- (func);
-%   \draw[arrow] (real) -- (func);
-%   \draw[arrow] (prob) -- (func);
-%   \draw[arrow] (func) -- (quant);
-%   \draw[arrow] (lin) -- (quant);
-%   \draw[arrow] (top) -- (quant);
-
-%   \draw[arrow] (func) -- (book);
-%   \draw[arrow] (quant) -- (book);
-%   \draw[arrow] (prob) -- (book);
-% \end{scope}
-
-\end{tikzpicture}
-\caption{Dependency tree: prerequisites and how they feed into the main text.}
-\label{fig:dependency-tree}
-\end{figure}
-
-\ifSubfilesClassLoaded{
-  \printbibliography[title={References}]
-}
-
-\end{document}
+% preface.tex
+\documentclass[main.tex]{subfiles}
+
+\ifSubfilesClassLoaded{
+  \addbibresource{main.bib}
+}
+
+\begin{document}
+
+\chapter*{Preface}
+\addcontentsline{toc}{chapter}{Preface}
+
+Non-commutative probability theory is a branch of generalized probability theory that studies the probability of events in non-commutative algebras (e.g., the algebra of observables in quantum mechanics). In the 20th century, non-commutative probability theory was applied to the study of quantum mechanics, since classical probability theory is not sufficient to describe quantum mechanics~\cite{kummer1998elements}.
+
+Recently, the concentration of measure phenomenon has been applied to the study of non-commutative probability theory. The key non-trivial observation, taken from Gromov's work~\cite{MGomolovs}, is that an arbitrary 1-Lipschitz function $f:S^n\to \mathbb{R}$ concentrates near a single value $a_0\in \mathbb{R}$ as strongly as the distance function does. That is,
+
+$$
+\mu\{x\in S^n: |f(x)-a_0|\geq\epsilon\} < \kappa_n(\epsilon)\leq 2\exp\left(-\frac{(n-1)\epsilon^2}{2}\right)
+$$
+
+is applied to show that, given a bipartite system $A\otimes B$ with $\dim(B)\geq \dim(A)\geq 3$, as the dimension of the smaller system $A$ increases, a random pure state $\sigma=|\psi\rangle\langle\psi|$ selected from $A\otimes B$ is, with very high probability, almost as entangled as the maximally entangled state.
+
+Mathematically, that is:
+
+Let $\psi\in \mathcal{P}(A\otimes B)$ be a random pure state on $A\otimes B$.
+
+If we define $\beta=\frac{1}{\ln(2)}\frac{d_A}{d_B}$, then we have
+
+$$
+\operatorname{Pr}[H(\psi_A) < \log_2(d_A)-\alpha-\beta] \leq \exp\left(-\frac{1}{8\pi^2\ln(2)}\frac{(d_Ad_B-1)\alpha^2}{(\log_2(d_A))^2}\right)
+$$
+
+where $d_B\geq d_A\geq 3$~\cite{Hayden_2006}.
+
+In this report, I will present my exploration of the concentration of measure phenomenon in the context of non-commutative probability theory. I assume the reader is an undergraduate student in mathematics who is familiar with the basic concepts of probability theory, measure theory, and linear algebra, and who has some basic skills in mathematical analysis. To make the report more self-contained, I will include detailed, annotated proofs that I understand, together with references to the original sources.
+
+\section*{How to use the dependency graph}
+
+Since our topic integrates almost everything I have learned during my undergraduate studies, I will provide a dependency graph so that both the reader and I can keep track of the knowledge necessary to understand each part of the report.
+
+One can imagine the project as a big tree whose root is undergraduate mathematics and whose branches reach out to the topics of the report, including many advanced topics and the motivation for studying them.
+
+\bigskip
+
+% --- Dependency tree graph (TikZ) ---
+\begin{figure}[ht]
+\centering
+\begin{tikzpicture}[
+  node distance=10mm and 18mm,
+  box/.style={draw, rectangle, fill=white, align=center, inner sep=4pt},
+  arrow/.style={-Latex}
+]
+
+% \node[box] (lin) {Linear Algebra\\(bases, maps, eigenvalues)};
+% \node[box, right=of lin] (real) {Real Analysis\\(limits, continuity, measure-lite)};
+% \node[box, below=of lin] (prob) {Probability\\(expectation, variance, concentration)};
+% \node[box, below=of real] (top) {Topology/Geometry\\(metrics, compactness)};
+
+% \node[box, below=12mm of prob] (func) {Functional Analysis\\($L^p$, Hilbert spaces, operators)};
+% \node[box, below=12mm of top] (quant) {Quantum Formalism\\(states, observables, partial trace)};
+
+% \node[box, below=14mm of func, xshift=25mm] (book) {This Book\\(Chapters 1--n)};
+% % draw arrows behind nodes
+% \begin{scope}[on background layer]
+%   \draw[arrow] (lin) -- (func);
+%   \draw[arrow] (real) -- (func);
+%   \draw[arrow] (prob) -- (func);
+%   \draw[arrow] (func) -- (quant);
+%   \draw[arrow] (lin) -- (quant);
+%   \draw[arrow] (top) -- (quant);
+
+%   \draw[arrow] (func) -- (book);
+%   \draw[arrow] (quant) -- (book);
+%   \draw[arrow] (prob) -- (book);
+% \end{scope}
+
+\end{tikzpicture}
+\caption{Dependency tree: prerequisites and how they feed into the main text.}
+\label{fig:dependency-tree}
+\end{figure}
+
+\ifSubfilesClassLoaded{
+  \printbibliography[title={References}]
+}
+
+\end{document}
diff --git a/snippets/compile.sh b/latex/snippets/compile.sh
similarity index 95%
rename from snippets/compile.sh
rename to latex/snippets/compile.sh
index 8c4be78..1471f4b 100644
--- a/snippets/compile.sh
+++ b/latex/snippets/compile.sh
@@ -8,7 +8,7 @@ echo "==============================================================="
 total_files=$(find chapters -name "*.tex" -type f | wc -l)
 processed_files=0
 
-if [[ $total_files -eq 0 ]]; then
+if [ $total_files -eq 0 ]; then
     echo "No .tex files found in chapters/ directory"
     exit 0
 fi
@@ -17,7 +17,7 @@ echo "Found $total_files .tex file(s) to process"
 echo ""
 
 for texfile in chapters/*.tex; do
-    if [[ -f "$texfile" ]]; then
+    if [ -f "$texfile" ]; then
         processed_files=$((processed_files + 1))
         base="${texfile%.*}"
         filename=$(basename "$texfile")