aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2022-03-01 07:28:02 -0800
committerNicholas Noll <nbnoll@eml.cc>2022-03-01 07:28:02 -0800
commitff84390bb700077bf7b9203ff732b196e2a80722 (patch)
treeb858a06e59173554cfae98196e3f5b3fe4a3f294
parent7f18d975a1b0ab091746021949520e3e294a1e77 (diff)
feat: updated code
-rw-r--r--Project.toml1
-rw-r--r--bin/drosophila.jl51
-rw-r--r--docs/src/sci/inference.md12
-rw-r--r--docs/src/sci/normalize.md46
-rw-r--r--src/SeqSpace.jl25
-rw-r--r--src/infer.jl2
-rw-r--r--src/manifold.jl10
7 files changed, 102 insertions, 45 deletions
diff --git a/Project.toml b/Project.toml
index b9e06a6..f49854b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,6 +13,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
GSL = "92c85e6c-cbff-5e0c-80f7-495c94daaecd"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
diff --git a/bin/drosophila.jl b/bin/drosophila.jl
index 64eb81d..cdfaadc 100644
--- a/bin/drosophila.jl
+++ b/bin/drosophila.jl
@@ -103,7 +103,7 @@ function heteroskedastic(raw)
fig[1,1],
xscale=Makie.log10,
xlabel="cell sequencing depth",
- xticks=([10^4.0, 10^4.5, 10^5.0], [L"10^{4.0}", L"10^{4.5}", L"10^{5.0}"]),
+ # xticks=[10^4.0, 10^4.5, 10^5.0], [L"10^4", L"10^{4.5}", L"10^5"]),
ylabel="cumulative",
yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"])
)
@@ -492,6 +492,34 @@ function scaling(D, N; δ=2)
save("$FIG/pointcloud_scaling.png", fig, px_per_unit=2)
end
+function locality(Dg, Dx)
+ x = PointCloud.upper_tri(Dx)
+ g = PointCloud.upper_tri(Dg)
+
+ bins = collect(range(50,350,length=10))
+
+ fig = Figure(font="Latin Modern Math", fontsize=26)
+ ax1 = Axis(fig[1,1],
+ xlabel="expression distance (AU)",
+ ylabel="cumulative",
+ yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"])
+ )
+
+ color = cgrad(:matter, length(bins)-1, categorical=true)
+ for i in 1:length(bins)-1
+ lb, ub = bins[i:i+1]
+ x0 = (lb + ub) / 2
+
+ data = g[lb .≤ x .≤ ub]
+ lines!(ax1, sort(data), range(0,1,length(data)),
+ color=color[i],
+ linewidth=2,
+ label=@sprintf "%.2f μm" x0)
+ end
+ axislegend("est. physical distance", position = :rb)
+ save("$FIG/pointcloud_locality.png", fig, px_per_unit=2)
+end
+
function inverse(mapping, database, residual; len=100)
cmean = zeros(len)
cstd = zeros(len)
@@ -501,8 +529,8 @@ function inverse(mapping, database, residual; len=100)
for (i,β) in enumerate(βs)
ψ = minimum(size(residual))*mapping.invert(β)
ψ = ψ ./ sum(ψ,dims=1)
- G = residual*ψ'
- c = [cor(G[g,:] |> vec, database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing]
+
+ c = [cor( (ψ*(residual[g,:] |> vec)) , database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing]
cmean[i] = mean(c)
cstd[i] = std(c)
@@ -530,7 +558,7 @@ function inverse(mapping, database, residual; len=100)
δ = 10
colors = cgrad(:matter,length(entropy[1:δ:end]), categorical=true)
for (i,S) in enumerate(entropy[1:δ:end])
- lines!(ax1, sort(S), range(0,1,length(S));
+ lines!(ax1, sort(S)./1.25, range(0,1,length(S));
linewidth=2,
color=colors[i],
label=@sprintf "%.2f" βs[δ*(i-1)+1])
@@ -760,22 +788,22 @@ function main(root::String; subdir=nothing)
inverse, database = if isfile("$DATA/inverse.jld2")
load("$DATA/inverse.jld2", "inverse", "database")
else
- ν, ω = if isfile("$DATA/bdntp_params.jld2")
- # load("$DATA/bdntp_params.jld2", "nu", "omega")
+ ν, ω = if isfile("$DATA/bdtnp_params.jld2")
+ # load("$DATA/bdtnp_params.jld2", "ν", "ω")
nothing, nothing
else
nothing, nothing
end
db = Inference.virtualembryo(directory="/home/nolln/mnt/data/drosophila/dvex");
- I = Inference.inversion(count./sum(count,dims=1), gene; refdb=db, ν=ν, ω=ω);
+ I = Inference.inversion(norm./sum(norm,dims=1), gene; refdb=db, ν=ν, ω=ω);
save("$DATA/inverse.jld2", Dict("inverse"=>I, "transform"=>I.invert(150), "database"=>db))
I, db
end
Figures.scaling(D, 100)
- Figures.inverse(inverse, database, Z)
+ Figures.inverse(inverse, database, norm./sum(norm,dims=1))
+ Figures.gene_patterns(inverse, database, gene, norm./sum(norm,dims=1))
Figures.isomap(inverse, database, μ, Z, D)
- Figures.gene_patterns(inverse, database, gene, count./sum(count,dims=1))
ξ = PointCloud.isomap(Z,10;k=10)
if !isfile("$DATA/isomap.jld2")
@@ -784,6 +812,11 @@ function main(root::String; subdir=nothing)
"distance"=>D,
))
end
+
+ ψ = inverse.invert(150)
+ ψ = ψ ./ sum(ψ,dims=1)
+ D = ψ'*PointCloud.geodesics(database.position',6)*ψ
+ Figures.locality(PointCloud.Distances.euclidean(ξ[:,1:5]'), D)
end
if abspath(PROGRAM_FILE) == @__FILE__
diff --git a/docs/src/sci/inference.md b/docs/src/sci/inference.md
index 84f1b75..ce166d0 100644
--- a/docs/src/sci/inference.md
+++ b/docs/src/sci/inference.md
@@ -28,7 +28,7 @@ The virtual embryo was constructed by aligning florescent pointcloud data obtain
```@raw html
<p align="center">
<figure>
- <img src="/assets/bdtnp.jpg" width="99%" />
+ <img src="/assets/drosophila/bdtnp.jpg" width="99%" />
<figurecaption>
Cartoon of the procedure used to generate the virtual embryo by the BDTNP.
</figurecaption>
@@ -121,8 +121,8 @@ The mapping at the optimal temperature corresponds to a sampling entropy ``\rho_
```@raw html
<p align="center">
<figure>
- <img src="/assets/bdtnp_fit.png" width="49%" />
- <img src="/assets/bdtnp_entropy.png" width="49%" />
+ <img src="/assets/drosophila/bdtnp_fit.png" width="49%" />
+ <img src="/assets/drosophila/bdtnp_entropy.png" width="49%" />
<figurecaption>
(Left)Residuals of scRNAseq expression patterns matches to BDTNP database.
(Right) Entropy of sampling distribution, averaged over cells.
@@ -142,9 +142,9 @@ Below we display a representative sample.
```@raw html
<p align="center">
<figure>
- <img src="/assets/gene/eve.png" width="32%" />
- <img src="/assets/gene/ftz.png" width="32%" />
- <img src="/assets/gene/twi.png" width="32%" />
+ <img src="/assets/drosophila/gene/eve.png" width="32%" />
+ <img src="/assets/drosophila/gene/ftz.png" width="32%" />
+ <img src="/assets/drosophila/gene/twi.png" width="32%" />
<figurecaption>
Genes shown (left-to-right): eve, ftz, twi
</figurecaption>
diff --git a/docs/src/sci/normalize.md b/docs/src/sci/normalize.md
index e6056d6..3cb5dbd 100644
--- a/docs/src/sci/normalize.md
+++ b/docs/src/sci/normalize.md
@@ -34,7 +34,7 @@ Our goal during the normalization procedure is two-fold:
```@raw html
<p align="center">
<figure>
- <img src="/assets/heteroskedastic.png" width="79%" />
+ <img src="/assets/drosophila/heteroskedastic.png" width="79%" />
<figurecaption>
Count matrix is heteroskedastic: variation of scales across genes (rows) and cells (columns)
</figurecaption>
@@ -54,8 +54,8 @@ Owing to dropout and other sources of overdispersion, as shown empircally in the
```@raw html
<p align="center">
<figure>
- <img src="/assets/overdispersed_mean_vs_variance.png" width="49%" />
- <img src="/assets/overdispersed_zeros.png" width="49%" />
+ <img src="/assets/drosophila/overdispersed_mean_vs_variance.png" width="49%" />
+ <img src="/assets/drosophila/overdispersed_zeros.png" width="49%" />
<figurecaption>
scRNAseq data for Drosophila is overdispersed.
</figurecaption>
@@ -80,7 +80,7 @@ As such, singular values would be bounded by ``\bar{\lambda} \equiv 1+\sigma\sqr
```@raw html
<p align="center">
-<img src="/assets/marchenko-pastur.png" width="49%" class="center"/>
+<img src="/assets/drosophila/marchenko-pastur.png" width="49%" class="center"/>
</p>
<p align="center">
Marchenko pastur distribution (orange) vs random matrix eigenvalues (blue)
@@ -101,8 +101,8 @@ This is exhibited empirically below.
```@raw html
<p align="center">
-<img src="/assets/gaussian_svd.png" width="49%" class="center"/>
-<img src="/assets/gaussian_overlap.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gaussian_svd.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gaussian_overlap.png" width="49%" class="center"/>
</p>
```
@@ -127,8 +127,8 @@ An example is shown below:
```@raw html
<p align="center">
-<img src="/assets/poisson_svd.png" width="49%" class="center"/>
-<img src="/assets/poisson_overlap.png" width="49%" class="center"/>
+<img src="/assets/drosophila/poisson_svd.png" width="49%" class="center"/>
+<img src="/assets/drosophila/poisson_overlap.png" width="49%" class="center"/>
</p>
```
@@ -201,8 +201,8 @@ The result is shown below.
```@raw html
<p align="center">
-<img src="/assets/negbinom.png" width="49%" class="center"/>
-<img src="/assets/negbinom_mean.png" width="49%" class="center"/>
+<img src="/assets/drosophila/negbinom.png" width="49%" class="center"/>
+<img src="/assets/drosophila/negbinom_mean.png" width="49%" class="center"/>
</p>
```
As shown, we underestimate the true rank as a few components have singular values below the Marchenko-Pastur noise floor.
@@ -218,9 +218,9 @@ This can be seen in the below figure, which shows the scatter plot of the parame
```@raw html
<p align="center">
<figure>
- <img src="/assets/nb_1_uncertainty_vs_expression.png" width="32%" />
- <img src="/assets/nb_2_uncertainty_vs_expression.png" width="32%" />
- <img src="/assets/nb_3_uncertainty_vs_expression.png" width="32%" />
+ <img src="/assets/drosophila/nb_1_uncertainty_vs_expression.png" width="32%" />
+ <img src="/assets/drosophila/nb_2_uncertainty_vs_expression.png" width="32%" />
+ <img src="/assets/drosophila/nb_3_uncertainty_vs_expression.png" width="32%" />
<figurecaption>
Each point is a gene (row). Color of point determined by mean expression of gene.
</figurecaption>
@@ -240,8 +240,8 @@ The right figure displays the cumulative density function for the filtered genes
```@raw html
<p align="center">
<figure>
- <img src="/assets/nb_total_uncertainty_vs_expression.png" width="49%" />
- <img src="/assets/nb_badfits.png" width="49%" />
+ <img src="/assets/drosophila/nb_total_uncertainty_vs_expression.png" width="49%" />
+ <img src="/assets/drosophila/nb_badfits.png" width="49%" />
<figurecaption>
Filter genes with bad fits. Genes with high uncertainty are determined to be lowly expressed.
</figurecaption>
@@ -255,8 +255,8 @@ This phenomenon persists across all gene expression levels.
```@raw html
<p align="center">
<figure>
- <img src="/assets/nb_param2.png" width="49%" />
- <img src="/assets/nb_param3.png" width="49%" />
+ <img src="/assets/drosophila/nb_param2.png" width="49%" />
+ <img src="/assets/drosophila/nb_param3.png" width="49%" />
<figurecaption>
Parameter distributions
</figurecaption>
@@ -274,9 +274,9 @@ As shown below, we find great quantitative agreement between the empirical estim
```@raw html
<p align="center">
<figure>
- <img src="/assets/bootstrap_1.png" width="60%" />
- <img src="/assets/bootstrap_2.png" width="60%" />
- <img src="/assets/bootstrap_3.png" width="60%" />
+ <img src="/assets/drosophila/bootstrap_1.png" width="60%" />
+ <img src="/assets/drosophila/bootstrap_2.png" width="60%" />
+ <img src="/assets/drosophila/bootstrap_3.png" width="60%" />
</figure>
</p>
```
@@ -288,8 +288,8 @@ As shown below, we find there are ``\sim 30`` **statistically significant** line
Interestingly, while not fully delocalized as seen by the participation ratio of the "noise" components, we see that roughly ``\sim 1000`` genes contribute significantly to each component suggesting these are coarse "pathways" discovered.
```@raw html
<p align="center">
-<img src="/assets/rank_estimate.png" width="49%" class="center"/>
-<img src="/assets/participation_ratio.png" width="49%" class="center"/>
+<img src="/assets/drosophila/rank_estimate.png" width="49%" class="center"/>
+<img src="/assets/drosophila/participation_ratio.png" width="49%" class="center"/>
</p>
```
To ensure element-wise positivity, the estimated ``\tilde{\mu}_{i\alpha}`` is obtained by performing non-negative matrix factorization on the rescaled ``\tilde{n}_{i\alpha}`` with rank ``35``.
@@ -311,7 +311,7 @@ This was determined to be an excellent stochastic model for the computed ``\tild
```@raw html
<p align="center">
-<img src="/assets/gamma_qq.png" width="49%" class="center"/>
+<img src="/assets/drosophila/gamma_qq.png" width="49%" class="center"/>
</p>
```
diff --git a/src/SeqSpace.jl b/src/SeqSpace.jl
index 298f4c4..ccb92f4 100644
--- a/src/SeqSpace.jl
+++ b/src/SeqSpace.jl
@@ -158,7 +158,7 @@ end
Deserialize a trained autoencoder from binary format to semantic format.
Represents model as a collection of functors.
"""
-function unmarshal(r::Result)
+function unmarshal(r)
autoencoder = model(r.model.size, r.param.dₒ;
Ws = r.param.Ws,
normalizes = r.param.BN,
@@ -187,7 +187,22 @@ function unmarshal(r::Result)
end
end
- return Result(r.param, r.loss, autoencoder)
+ param = HyperParams(;
+ dₒ = r.param.dₒ,
+ Ws = r.param.Ws,
+ BN = r.param.BN,
+ DO = r.param.DO,
+ N = r.param.N,
+ δ = r.param.δ,
+ η = r.param.η,
+ B = r.param.B,
+ V = r.param.V,
+ k = r.param.k,
+ γₓ = r.param.γₓ,
+ γᵤ = r.param.γᵤ,
+ )
+
+ return Result(param, r.loss, autoencoder)
end
# ------------------------------------------------------------------------
@@ -266,6 +281,10 @@ function buildloss(model, D², param)
Dz² = param.g(z)
Dx² = D²[i,i]
+ dx, dz = PointCloud.upper_tri(Dx²), PointCloud.upper_tri(Dz²)
+ rx, rz = softrank(dx ./ mean(dx)), softrank(dz ./ mean(dz))
+ ϵₓ = 1 - cor(1 .- rx, 1 .- rz)
+ #=
ϵₓ = mean(
let
dx, dz = Dx²[:,j], Dz²[:,j]
@@ -273,6 +292,7 @@ function buildloss(model, D², param)
1 - cor((1 .- rx).^2, (1 .- rz).^2)
end for j ∈ 1:size(Dx²,2)
)
+ =#
ϵᵤ = mean(
let
@@ -280,6 +300,7 @@ function buildloss(model, D², param)
mean( ( (2*i/length(zₛ)-1) - s)^2 for (i,s) in enumerate(zₛ) )
end for d ∈ 1:size(z,1)
)
+
#=
ϵᵤ = let
a = Voronoi.volumes(z)
diff --git a/src/infer.jl b/src/infer.jl
index 551e55e..5aecf0f 100644
--- a/src/infer.jl
+++ b/src/infer.jl
@@ -389,7 +389,7 @@ function make_objective(ref, qry)
function objective(Θ)
# β, ν, ω = #0.5, Θ[1:84], Θ[85:end] #ones(84)
# Σ, ϕ = cost_scan(ref, qry, ν, ω)
- β = 10
+ β = 250
ν, ω = Θ[1:84], Θ[85:end]
Σ, ϕ = cost_transform(ref, qry; ω=ω, ν=ν)
diff --git a/src/manifold.jl b/src/manifold.jl
index 3867539..650cfb9 100644
--- a/src/manifold.jl
+++ b/src/manifold.jl
@@ -200,8 +200,8 @@ Compute the tangent vectors ``\\hat{\\bm{e}}_\\phi, \\hat{\\bm{e}}_x`` associate
Assumes all points within `r` are distributed over the surface.
Assumes `r` is sized ``N \\times 3``.
"""
-function basis(s::Surface, r)
- x = pullback(s, r)
+function basis(s::Surface, r; surface=false, normalized=true)
+ x = surface ? r : pullback(s, r)
# function
x₀ = s.Λ[1].(x[:,1])
@@ -225,8 +225,10 @@ function basis(s::Surface, r)
+(∂a.*cos.(ϕ).-a.*sin.(ϕ).*∂ϕ).*cos.(x[:,2]) .- (∂b.*sin.(ϕ).+b.*cos.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂x₀,
+(∂a.*sin.(ϕ).+a.*cos.(ϕ).*∂ϕ).*cos.(x[:,2]) .+ (∂b.*cos.(ϕ).-b.*sin.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂y₀)
- ez = ez ./ sqrt.(ez⋅ez)
- eθ = eθ ./ sqrt.(eθ⋅eθ)
+ if normalized
+ ez = ez ./ sqrt.(ez⋅ez)
+ eθ = eθ ./ sqrt.(eθ⋅eθ)
+ end
return ez, eθ
end