From ff84390bb700077bf7b9203ff732b196e2a80722 Mon Sep 17 00:00:00 2001
From: Nicholas Noll <nbnoll@eml.cc>
Date: Tue, 1 Mar 2022 07:28:02 -0800
Subject: feat: updated code

---
 Project.toml              |  1 +
 bin/drosophila.jl         | 51 ++++++++++++++++++++++++++++++++++++++---------
 docs/src/sci/inference.md | 12 +++++------
 docs/src/sci/normalize.md | 46 +++++++++++++++++++++---------------------
 src/SeqSpace.jl           | 25 +++++++++++++++++++++--
 src/infer.jl              |  2 +-
 src/manifold.jl           | 10 ++++++----
 7 files changed, 102 insertions(+), 45 deletions(-)

diff --git a/Project.toml b/Project.toml
index b9e06a6..f49854b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,6 +13,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 GSL = "92c85e6c-cbff-5e0c-80f7-495c94daaecd"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
diff --git a/bin/drosophila.jl b/bin/drosophila.jl
index 64eb81d..cdfaadc 100644
--- a/bin/drosophila.jl
+++ b/bin/drosophila.jl
@@ -103,7 +103,7 @@ function heteroskedastic(raw)
 		fig[1,1],
 		xscale=Makie.log10,
 		xlabel="cell sequencing depth",
-        xticks=([10^4.0, 10^4.5, 10^5.0], [L"10^{4.0}", L"10^{4.5}", L"10^{5.0}"]),
+        # xticks=[10^4.0, 10^4.5, 10^5.0], [L"10^4", L"10^{4.5}", L"10^5"]),
 		ylabel="cumulative",
         yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"])
 	)
@@ -492,6 +492,34 @@ function scaling(D, N; δ=2)
     save("$FIG/pointcloud_scaling.png", fig, px_per_unit=2)
 end
 
+function locality(Dg, Dx)
+    x = PointCloud.upper_tri(Dx)
+	g = PointCloud.upper_tri(Dg)
+	
+	bins = collect(range(50,350,length=10))
+
+	fig = Figure(font="Latin Modern Math", fontsize=26)
+	ax1 = Axis(fig[1,1],
+		xlabel="expression distance (AU)",
+		ylabel="cumulative",
+		yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"])
+	)
+
+	color = cgrad(:matter, length(bins)-1, categorical=true)
+	for i in 1:length(bins)-1
+		lb, ub = bins[i:i+1]
+		x0 = (lb + ub) / 2
+
+		data = g[lb .≤ x .≤ ub]
+		lines!(ax1, sort(data), range(0,1,length(data)),
+            color=color[i],
+            linewidth=2,
+			label=@sprintf "%.2f μm" x0)
+	end
+	axislegend("est. physical distance", position = :rb)
+    save("$FIG/pointcloud_locality.png", fig, px_per_unit=2)
+end
+
 function inverse(mapping, database, residual; len=100)
     cmean   = zeros(len)
     cstd    = zeros(len)
@@ -501,8 +529,8 @@ function inverse(mapping, database, residual; len=100)
     for (i,β) in enumerate(βs)
         ψ = minimum(size(residual))*mapping.invert(β)
         ψ = ψ ./ sum(ψ,dims=1)
-        G = residual*ψ'
-        c = [cor(G[g,:] |> vec, database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing]
+
+        c = [cor( (ψ*(residual[g,:] |> vec)) , database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing]
 
         cmean[i]   = mean(c)
         cstd[i]    = std(c)
@@ -530,7 +558,7 @@ function inverse(mapping, database, residual; len=100)
     δ = 10
     colors = cgrad(:matter,length(entropy[1:δ:end]), categorical=true)
     for (i,S) in enumerate(entropy[1:δ:end])
-        lines!(ax1, sort(S), range(0,1,length(S));
+        lines!(ax1, sort(S)./1.25, range(0,1,length(S));
                linewidth=2,
                color=colors[i],
                label=@sprintf "%.2f" βs[δ*(i-1)+1])
@@ -760,22 +788,22 @@ function main(root::String; subdir=nothing)
     inverse, database = if isfile("$DATA/inverse.jld2")
         load("$DATA/inverse.jld2", "inverse", "database")
     else
-        ν, ω = if isfile("$DATA/bdntp_params.jld2")
-            # load("$DATA/bdntp_params.jld2", "nu", "omega")
+        ν, ω = if isfile("$DATA/bdtnp_params.jld2")
+            # load("$DATA/bdtnp_params.jld2", "ν", "ω")
             nothing, nothing
         else
             nothing, nothing
         end
         db = Inference.virtualembryo(directory="/home/nolln/mnt/data/drosophila/dvex");
-        I  = Inference.inversion(count./sum(count,dims=1), gene; refdb=db, ν=ν, ω=ω);
+        I  = Inference.inversion(norm./sum(norm,dims=1), gene; refdb=db, ν=ν, ω=ω);
         save("$DATA/inverse.jld2", Dict("inverse"=>I, "transform"=>I.invert(150), "database"=>db))
         I, db
     end
 
     Figures.scaling(D, 100)
-    Figures.inverse(inverse, database, Z)
+    Figures.inverse(inverse, database, norm./sum(norm,dims=1))
+    Figures.gene_patterns(inverse, database, gene, norm./sum(norm,dims=1))
     Figures.isomap(inverse, database, μ, Z, D)
-    Figures.gene_patterns(inverse, database, gene, count./sum(count,dims=1))
 
     ξ = PointCloud.isomap(Z,10;k=10)
     if !isfile("$DATA/isomap.jld2")
@@ -784,6 +812,11 @@ function main(root::String; subdir=nothing)
             "distance"=>D,
         ))
     end
+
+    ψ = inverse.invert(150)
+    ψ = ψ ./ sum(ψ,dims=1)
+    D = ψ'*PointCloud.geodesics(database.position',6)*ψ
+    Figures.locality(PointCloud.Distances.euclidean(ξ[:,1:5]'), D)
 end
 
 if abspath(PROGRAM_FILE) == @__FILE__
diff --git a/docs/src/sci/inference.md b/docs/src/sci/inference.md
index 84f1b75..ce166d0 100644
--- a/docs/src/sci/inference.md
+++ b/docs/src/sci/inference.md
@@ -28,7 +28,7 @@ The virtual embryo was constructed by aligning florescent pointcloud data obtain
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/bdtnp.jpg" width="99%" />
+  <img src="/assets/drosophila/bdtnp.jpg" width="99%" />
   <figurecaption>
   Cartoon of the procedure used to generate the virtual embryo by the BDTNP.
   </figurecaption>
@@ -121,8 +121,8 @@ The mapping at the optimal temperature corresponds to a sampling entropy ``\rho_
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/bdtnp_fit.png" width="49%" />
-  <img src="/assets/bdtnp_entropy.png" width="49%" />
+  <img src="/assets/drosophila/bdtnp_fit.png" width="49%" />
+  <img src="/assets/drosophila/bdtnp_entropy.png" width="49%" />
   <figurecaption>
   (Left)Residuals of scRNAseq expression patterns matches to BDTNP database.
   (Right) Entropy of sampling distribution, averaged over cells.
@@ -142,9 +142,9 @@ Below we display a representative sample.
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/gene/eve.png" width="32%" />
-  <img src="/assets/gene/ftz.png" width="32%" />
-  <img src="/assets/gene/twi.png" width="32%" />
+  <img src="/assets/drosophila/gene/eve.png" width="32%" />
+  <img src="/assets/drosophila/gene/ftz.png" width="32%" />
+  <img src="/assets/drosophila/gene/twi.png" width="32%" />
   <figurecaption>
   Genes shown (left-to-right): eve, ftz, twi
   </figurecaption>
diff --git a/docs/src/sci/normalize.md b/docs/src/sci/normalize.md
index e6056d6..3cb5dbd 100644
--- a/docs/src/sci/normalize.md
+++ b/docs/src/sci/normalize.md
@@ -34,7 +34,7 @@ Our goal during the normalization procedure is two-fold:
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/heteroskedastic.png" width="79%" />
+  <img src="/assets/drosophila/heteroskedastic.png" width="79%" />
   <figurecaption>
   Count matrix is heteroskedastic: variation of scales across genes (rows) and cells (columns)
   </figurecaption>
@@ -54,8 +54,8 @@ Owing to dropout and other sources of overdispersion, as shown empircally in the
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/overdispersed_mean_vs_variance.png" width="49%" />
-  <img src="/assets/overdispersed_zeros.png" width="49%" />
+  <img src="/assets/drosophila/overdispersed_mean_vs_variance.png" width="49%" />
+  <img src="/assets/drosophila/overdispersed_zeros.png" width="49%" />
   <figurecaption>
   scRNAseq data for Drosophila is overdispersed.
   </figurecaption>
@@ -80,7 +80,7 @@ As such, singular values would be bounded by ``\bar{\lambda} \equiv 1+\sigma\sqr
 
 ```@raw html
 <p align="center">
-<img src="/assets/marchenko-pastur.png" width="49%" class="center"/>
+<img src="/assets/drosophila/marchenko-pastur.png" width="49%" class="center"/>
 </p>
 <p align="center">
 Marchenko pastur distribution (orange) vs random matrix eigenvalues (blue)
@@ -101,8 +101,8 @@ This is exhibited empirically below.
 
 ```@raw html
 <p align="center">
-<img src="/assets/gaussian_svd.png" width="49%" class="center"/>
-<img src="/assets/gaussian_overlap.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gaussian_svd.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gaussian_overlap.png" width="49%" class="center"/>
 </p>
 ```
 
@@ -127,8 +127,8 @@ An example is shown below:
 
 ```@raw html
 <p align="center">
-<img src="/assets/poisson_svd.png" width="49%" class="center"/>
-<img src="/assets/poisson_overlap.png" width="49%" class="center"/>
+<img src="/assets/drosophila/poisson_svd.png" width="49%" class="center"/>
+<img src="/assets/drosophila/poisson_overlap.png" width="49%" class="center"/>
 </p>
 ```
 
@@ -201,8 +201,8 @@ The result is shown below.
 
 ```@raw html
 <p align="center">
-<img src="/assets/negbinom.png" width="49%" class="center"/>
-<img src="/assets/negbinom_mean.png" width="49%" class="center"/>
+<img src="/assets/drosophila/negbinom.png" width="49%" class="center"/>
+<img src="/assets/drosophila/negbinom_mean.png" width="49%" class="center"/>
 </p>
 ```
 As shown, we underestimate the true rank as a few components have singular values below the Marchenko-Pastur noise floor.
@@ -218,9 +218,9 @@ This can be seen in the below figure, which shows the scatter plot of the parame
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/nb_1_uncertainty_vs_expression.png" width="32%" />
-  <img src="/assets/nb_2_uncertainty_vs_expression.png" width="32%" />
-  <img src="/assets/nb_3_uncertainty_vs_expression.png" width="32%" />
+  <img src="/assets/drosophila/nb_1_uncertainty_vs_expression.png" width="32%" />
+  <img src="/assets/drosophila/nb_2_uncertainty_vs_expression.png" width="32%" />
+  <img src="/assets/drosophila/nb_3_uncertainty_vs_expression.png" width="32%" />
   <figurecaption>
   Each point is a gene (row). Color of point determined by mean expression of gene.
   </figurecaption>
@@ -240,8 +240,8 @@ The right figure displays the cumulative density function for the filtered genes
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/nb_total_uncertainty_vs_expression.png" width="49%" />
-  <img src="/assets/nb_badfits.png" width="49%" />
+  <img src="/assets/drosophila/nb_total_uncertainty_vs_expression.png" width="49%" />
+  <img src="/assets/drosophila/nb_badfits.png" width="49%" />
   <figurecaption>
   Filter genes with bad fits. Genes with high uncertainty are determined to be lowly expressed.
   </figurecaption>
@@ -255,8 +255,8 @@ This phenomenon persists across all gene expression levels.
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/nb_param2.png" width="49%" />
-  <img src="/assets/nb_param3.png" width="49%" />
+  <img src="/assets/drosophila/nb_param2.png" width="49%" />
+  <img src="/assets/drosophila/nb_param3.png" width="49%" />
   <figurecaption>
   Parameter distributions
   </figurecaption>
@@ -274,9 +274,9 @@ As shown below, we find great quantitative agreement between the empirical estim
 ```@raw html
 <p align="center">
 <figure>
-  <img src="/assets/bootstrap_1.png" width="60%" />
-  <img src="/assets/bootstrap_2.png" width="60%" />
-  <img src="/assets/bootstrap_3.png" width="60%" />
+  <img src="/assets/drosophila/bootstrap_1.png" width="60%" />
+  <img src="/assets/drosophila/bootstrap_2.png" width="60%" />
+  <img src="/assets/drosophila/bootstrap_3.png" width="60%" />
 </figure>
 </p>
 ```
@@ -288,8 +288,8 @@ As shown below, we find there are ``\sim 30`` **statistically significant** line
 Interestingly, while not fully delocalized as seen by the participation ratio of the "noise" components, we see that roughly ``\sim 1000`` genes contribute significantly to each component suggesting these are coarse "pathways" discovered.
 ```@raw html
 <p align="center">
-<img src="/assets/rank_estimate.png" width="49%" class="center"/>
-<img src="/assets/participation_ratio.png" width="49%" class="center"/>
+<img src="/assets/drosophila/rank_estimate.png" width="49%" class="center"/>
+<img src="/assets/drosophila/participation_ratio.png" width="49%" class="center"/>
 </p>
 ```
 To ensure element-wise positivity, the estimated ``\tilde{\mu}_{i\alpha}`` is obtained by performing non-negative matrix factorization on the rescaled ``\tilde{n}_{i\alpha}`` with rank ``35``.
@@ -311,7 +311,7 @@ This was determined to be an excellent stochastic model for the computed ``\tild
 
 ```@raw html
 <p align="center">
-<img src="/assets/gamma_qq.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gamma_qq.png" width="49%" class="center"/>
 </p>
 ```
 
diff --git a/src/SeqSpace.jl b/src/SeqSpace.jl
index 298f4c4..ccb92f4 100644
--- a/src/SeqSpace.jl
+++ b/src/SeqSpace.jl
@@ -158,7 +158,7 @@ end
 Deserialize a trained autoencoder from binary format to semantic format.
 Represents model as a collection of functors.
 """
-function unmarshal(r::Result)
+function unmarshal(r)
     autoencoder = model(r.model.size, r.param.dₒ;
           Ws         = r.param.Ws,
           normalizes = r.param.BN,
@@ -187,7 +187,22 @@ function unmarshal(r::Result)
         end
     end
 
-    return Result(r.param, r.loss, autoencoder)
+    param = HyperParams(;
+        dₒ = r.param.dₒ,
+        Ws = r.param.Ws,
+        BN = r.param.BN,
+        DO = r.param.DO,
+        N  = r.param.N,
+        δ  = r.param.δ,
+        η  = r.param.η,
+        B  = r.param.B,
+        V  = r.param.V,
+        k  = r.param.k,
+        γₓ = r.param.γₓ,
+        γᵤ = r.param.γᵤ,
+    )
+
+    return Result(param, r.loss, autoencoder)
 end
 
 # ------------------------------------------------------------------------
@@ -266,6 +281,10 @@ function buildloss(model, D², param)
         Dz² = param.g(z)
         Dx² = D²[i,i]
 
+        dx, dz = PointCloud.upper_tri(Dx²), PointCloud.upper_tri(Dz²)
+        rx, rz = softrank(dx ./ mean(dx)), softrank(dz ./ mean(dz))
+        ϵₓ = 1 - cor(1 .- rx, 1 .- rz)
+        #=
         ϵₓ = mean(
             let
                 dx, dz = Dx²[:,j], Dz²[:,j]
@@ -273,6 +292,7 @@ function buildloss(model, D², param)
                 1 - cor((1 .- rx).^2, (1 .- rz).^2)
             end for j ∈ 1:size(Dx²,2)
         )
+        =#
 
         ϵᵤ = mean(
             let
@@ -280,6 +300,7 @@ function buildloss(model, D², param)
                 mean( ( (2*i/length(zₛ)-1) - s)^2 for (i,s) in enumerate(zₛ) )
             end for d ∈ 1:size(z,1)
         )
+ 
         #=
         ϵᵤ = let
             a = Voronoi.volumes(z)
diff --git a/src/infer.jl b/src/infer.jl
index 551e55e..5aecf0f 100644
--- a/src/infer.jl
+++ b/src/infer.jl
@@ -389,7 +389,7 @@ function make_objective(ref, qry)
     function objective(Θ)
         # β, ν, ω = #0.5, Θ[1:84], Θ[85:end] #ones(84)
         # Σ, ϕ    = cost_scan(ref, qry, ν, ω)
-        β = 10
+        β = 250
         ν, ω = Θ[1:84], Θ[85:end]
         Σ, ϕ = cost_transform(ref, qry; ω=ω, ν=ν)
 
diff --git a/src/manifold.jl b/src/manifold.jl
index 3867539..650cfb9 100644
--- a/src/manifold.jl
+++ b/src/manifold.jl
@@ -200,8 +200,8 @@ Compute the tangent vectors ``\\hat{\\bm{e}}_\\phi, \\hat{\\bm{e}}_x`` associate
 Assumes all points within `r` are distributed over the surface.
 Assumes `r` is sized ``N \\times 3``.
 """
-function basis(s::Surface, r)
-    x  = pullback(s, r)
+function basis(s::Surface, r; surface=false, normalized=true)
+    x  = surface ? r : pullback(s, r)
 
     # function
     x₀ = s.Λ[1].(x[:,1])
@@ -225,8 +225,10 @@ function basis(s::Surface, r)
               +(∂a.*cos.(ϕ).-a.*sin.(ϕ).*∂ϕ).*cos.(x[:,2]) .- (∂b.*sin.(ϕ).+b.*cos.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂x₀,
               +(∂a.*sin.(ϕ).+a.*cos.(ϕ).*∂ϕ).*cos.(x[:,2]) .+ (∂b.*cos.(ϕ).-b.*sin.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂y₀)
 
-    ez = ez ./ sqrt.(ez⋅ez)
-    eθ = eθ ./ sqrt.(eθ⋅eθ)
+    if normalized
+        ez = ez ./ sqrt.(ez⋅ez)
+        eθ = eθ ./ sqrt.(eθ⋅eθ)
+    end
 
     return ez, eθ
 end
-- 
cgit v1.2.1