From ff84390bb700077bf7b9203ff732b196e2a80722 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 1 Mar 2022 07:28:02 -0800 Subject: feat: updated code --- Project.toml | 1 + bin/drosophila.jl | 51 ++++++++++++++++++++++++++++++++++++++--------- docs/src/sci/inference.md | 12 +++++------ docs/src/sci/normalize.md | 46 +++++++++++++++++++++--------------------- src/SeqSpace.jl | 25 +++++++++++++++++++++-- src/infer.jl | 2 +- src/manifold.jl | 10 ++++++---- 7 files changed, 102 insertions(+), 45 deletions(-) diff --git a/Project.toml b/Project.toml index b9e06a6..f49854b 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" GSL = "92c85e6c-cbff-5e0c-80f7-495c94daaecd" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" diff --git a/bin/drosophila.jl b/bin/drosophila.jl index 64eb81d..cdfaadc 100644 --- a/bin/drosophila.jl +++ b/bin/drosophila.jl @@ -103,7 +103,7 @@ function heteroskedastic(raw) fig[1,1], xscale=Makie.log10, xlabel="cell sequencing depth", - xticks=([10^4.0, 10^4.5, 10^5.0], [L"10^{4.0}", L"10^{4.5}", L"10^{5.0}"]), + # xticks=[10^4.0, 10^4.5, 10^5.0], [L"10^4", L"10^{4.5}", L"10^5"]), ylabel="cumulative", yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"]) ) @@ -492,6 +492,34 @@ function scaling(D, N; δ=2) save("$FIG/pointcloud_scaling.png", fig, px_per_unit=2) end +function locality(Dg, Dx) + x = PointCloud.upper_tri(Dx) + g = PointCloud.upper_tri(Dg) + + bins = collect(range(50,350,length=10)) + + fig = Figure(font="Latin Modern Math", fontsize=26) + ax1 = Axis(fig[1,1], + xlabel="expression distance (AU)", + ylabel="cumulative", + yticks=([0, 0.25, 0.5, 0.75, 1.0], [L"0.0", L"0.25", L"0.50", L"0.75", L"1.00"]) + ) + + color = cgrad(:matter, length(bins)-1, categorical=true) + for i in 1:length(bins)-1 + lb, ub = bins[i:i+1] + x0 = (lb + ub) / 2 + + data = g[lb .≤ x .≤ ub] + lines!(ax1, sort(data), range(0,1,length(data)), + color=color[i], + linewidth=2, + label=@sprintf "%.2f μm" x0) + end + axislegend("est. physical distance", position = :rb) + save("$FIG/pointcloud_locality.png", fig, px_per_unit=2) +end + function inverse(mapping, database, residual; len=100) cmean = zeros(len) cstd = zeros(len) @@ -501,8 +529,8 @@ function inverse(mapping, database, residual; len=100) for (i,β) in enumerate(βs) ψ = minimum(size(residual))*mapping.invert(β) ψ = ψ ./ sum(ψ,dims=1) - G = residual*ψ' - c = [cor(G[g,:] |> vec, database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing] + + c = [cor( (ψ*(residual[g,:] |> vec)) , database.expression.real[:,i] |> vec) for (i,g) in enumerate(mapping.index) if g !== nothing] cmean[i] = mean(c) cstd[i] = std(c) @@ -530,7 +558,7 @@ function inverse(mapping, database, residual; len=100) δ = 10 colors = cgrad(:matter,length(entropy[1:δ:end]), categorical=true) for (i,S) in enumerate(entropy[1:δ:end]) - lines!(ax1, sort(S), range(0,1,length(S)); + lines!(ax1, sort(S)./1.25, range(0,1,length(S)); linewidth=2, color=colors[i], label=@sprintf "%.2f" βs[δ*(i-1)+1]) @@ -760,22 +788,22 @@ function main(root::String; subdir=nothing) inverse, database = if isfile("$DATA/inverse.jld2") load("$DATA/inverse.jld2", "inverse", "database") else - ν, ω = if isfile("$DATA/bdntp_params.jld2") - # load("$DATA/bdntp_params.jld2", "nu", "omega") + ν, ω = if isfile("$DATA/bdtnp_params.jld2") + # load("$DATA/bdtnp_params.jld2", "ν", "ω") nothing, nothing else nothing, nothing end db = Inference.virtualembryo(directory="/home/nolln/mnt/data/drosophila/dvex"); - I = Inference.inversion(count./sum(count,dims=1), gene; refdb=db, ν=ν, ω=ω); + I = Inference.inversion(norm./sum(norm,dims=1), gene; refdb=db, ν=ν, ω=ω); save("$DATA/inverse.jld2", Dict("inverse"=>I, "transform"=>I.invert(150), "database"=>db)) I, db end Figures.scaling(D, 100) - Figures.inverse(inverse, database, Z) + Figures.inverse(inverse, database, norm./sum(norm,dims=1)) + Figures.gene_patterns(inverse, database, gene, norm./sum(norm,dims=1)) Figures.isomap(inverse, database, μ, Z, D) - Figures.gene_patterns(inverse, database, gene, count./sum(count,dims=1)) ξ = PointCloud.isomap(Z,10;k=10) if !isfile("$DATA/isomap.jld2") @@ -784,6 +812,11 @@ function main(root::String; subdir=nothing) "distance"=>D, )) end + + ψ = inverse.invert(150) + ψ = ψ ./ sum(ψ,dims=1) + D = ψ'*PointCloud.geodesics(database.position',6)*ψ + Figures.locality(PointCloud.Distances.euclidean(ξ[:,1:5]'), D) end if abspath(PROGRAM_FILE) == @__FILE__ diff --git a/docs/src/sci/inference.md b/docs/src/sci/inference.md index 84f1b75..ce166d0 100644 --- a/docs/src/sci/inference.md +++ b/docs/src/sci/inference.md @@ -28,7 +28,7 @@ The virtual embryo was constructed by aligning florescent pointcloud data obtain ```@raw html

- + Cartoon of the procedure used to generate the virtual embryo by the BDTNP. @@ -121,8 +121,8 @@ The mapping at the optimal temperature corresponds to a sampling entropy ``\rho_ ```@raw html

- - + + (Left)Residuals of scRNAseq expression patterns matches to BDTNP database. (Right) Entropy of sampling distribution, averaged over cells. @@ -142,9 +142,9 @@ Below we display a representative sample. ```@raw html

- - - + + + Genes shown (left-to-right): eve, ftz, twi diff --git a/docs/src/sci/normalize.md b/docs/src/sci/normalize.md index e6056d6..3cb5dbd 100644 --- a/docs/src/sci/normalize.md +++ b/docs/src/sci/normalize.md @@ -34,7 +34,7 @@ Our goal during the normalization procedure is two-fold: ```@raw html

- + Count matrix is heteroskedastic: variation of scales across genes (rows) and cells (columns) @@ -54,8 +54,8 @@ Owing to dropout and other sources of overdispersion, as shown empircally in the ```@raw html

- - + + scRNAseq data for Drosophila is overdispersed. @@ -80,7 +80,7 @@ As such, singular values would be bounded by ``\bar{\lambda} \equiv 1+\sigma\sqr ```@raw html

- +

Marchenko pastur distribution (orange) vs random matrix eigenvalues (blue) @@ -101,8 +101,8 @@ This is exhibited empirically below. ```@raw html

- - + +

``` @@ -127,8 +127,8 @@ An example is shown below: ```@raw html

- - + +

``` @@ -201,8 +201,8 @@ The result is shown below. ```@raw html

- - + +

``` As shown, we underestimate the true rank as a few components have singular values below the Marchenko-Pastur noise floor. @@ -218,9 +218,9 @@ This can be seen in the below figure, which shows the scatter plot of the parame ```@raw html

- - - + + + Each point is a gene (row). Color of point determined by mean expression of gene. @@ -240,8 +240,8 @@ The right figure displays the cumulative density function for the filtered genes ```@raw html

- - + + Filter genes with bad fits. Genes with high uncertainty are determined to be lowly expressed. @@ -255,8 +255,8 @@ This phenomenon persists across all gene expression levels. ```@raw html

- - + + Parameter distributions @@ -274,9 +274,9 @@ As shown below, we find great quantitative agreement between the empirical estim ```@raw html

- - - + + +

``` @@ -288,8 +288,8 @@ As shown below, we find there are ``\sim 30`` **statistically significant** line Interestingly, while not fully delocalized as seen by the participation ratio of the "noise" components, we see that roughly ``\sim 1000`` genes contribute significantly to each component suggesting these are coarse "pathways" discovered. ```@raw html

- - + +

``` To ensure element-wise positivity, the estimated ``\tilde{\mu}_{i\alpha}`` is obtained by performing non-negative matrix factorization on the rescaled ``\tilde{n}_{i\alpha}`` with rank ``35``. @@ -311,7 +311,7 @@ This was determined to be an excellent stochastic model for the computed ``\tild ```@raw html

- +

``` diff --git a/src/SeqSpace.jl b/src/SeqSpace.jl index 298f4c4..ccb92f4 100644 --- a/src/SeqSpace.jl +++ b/src/SeqSpace.jl @@ -158,7 +158,7 @@ end Deserialize a trained autoencoder from binary format to semantic format. Represents model as a collection of functors. """ -function unmarshal(r::Result) +function unmarshal(r) autoencoder = model(r.model.size, r.param.dₒ; Ws = r.param.Ws, normalizes = r.param.BN, @@ -187,7 +187,22 @@ function unmarshal(r::Result) end end - return Result(r.param, r.loss, autoencoder) + param = HyperParams(; + dₒ = r.param.dₒ, + Ws = r.param.Ws, + BN = r.param.BN, + DO = r.param.DO, + N = r.param.N, + δ = r.param.δ, + η = r.param.η, + B = r.param.B, + V = r.param.V, + k = r.param.k, + γₓ = r.param.γₓ, + γᵤ = r.param.γᵤ, + ) + + return Result(param, r.loss, autoencoder) end # ------------------------------------------------------------------------ @@ -266,6 +281,10 @@ function buildloss(model, D², param) Dz² = param.g(z) Dx² = D²[i,i] + dx, dz = PointCloud.upper_tri(Dx²), PointCloud.upper_tri(Dz²) + rx, rz = softrank(dx ./ mean(dx)), softrank(dz ./ mean(dz)) + ϵₓ = 1 - cor(1 .- rx, 1 .- rz) + #= ϵₓ = mean( let dx, dz = Dx²[:,j], Dz²[:,j] @@ -273,6 +292,7 @@ function buildloss(model, D², param) 1 - cor((1 .- rx).^2, (1 .- rz).^2) end for j ∈ 1:size(Dx²,2) ) + =# ϵᵤ = mean( let @@ -280,6 +300,7 @@ function buildloss(model, D², param) mean( ( (2*i/length(zₛ)-1) - s)^2 for (i,s) in enumerate(zₛ) ) end for d ∈ 1:size(z,1) ) + #= ϵᵤ = let a = Voronoi.volumes(z) diff --git a/src/infer.jl b/src/infer.jl index 551e55e..5aecf0f 100644 --- a/src/infer.jl +++ b/src/infer.jl @@ -389,7 +389,7 @@ function make_objective(ref, qry) function objective(Θ) # β, ν, ω = #0.5, Θ[1:84], Θ[85:end] #ones(84) # Σ, ϕ = cost_scan(ref, qry, ν, ω) - β = 10 + β = 250 ν, ω = Θ[1:84], Θ[85:end] Σ, ϕ = cost_transform(ref, qry; ω=ω, ν=ν) diff --git a/src/manifold.jl b/src/manifold.jl index 3867539..650cfb9 100644 --- a/src/manifold.jl +++ b/src/manifold.jl @@ -200,8 +200,8 @@ Compute the tangent vectors ``\\hat{\\bm{e}}_\\phi, \\hat{\\bm{e}}_x`` associate Assumes all points within `r` are distributed over the surface. Assumes `r` is sized ``N \\times 3``. """ -function basis(s::Surface, r) - x = pullback(s, r) +function basis(s::Surface, r; surface=false, normalized=true) + x = surface ? r : pullback(s, r) # function x₀ = s.Λ[1].(x[:,1]) @@ -225,8 +225,10 @@ function basis(s::Surface, r) +(∂a.*cos.(ϕ).-a.*sin.(ϕ).*∂ϕ).*cos.(x[:,2]) .- (∂b.*sin.(ϕ).+b.*cos.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂x₀, +(∂a.*sin.(ϕ).+a.*cos.(ϕ).*∂ϕ).*cos.(x[:,2]) .+ (∂b.*cos.(ϕ).-b.*sin.(ϕ).*∂ϕ).*sin.(x[:,2]) .+ ∂y₀) - ez = ez ./ sqrt.(ez⋅ez) - eθ = eθ ./ sqrt.(eθ⋅eθ) + if normalized + ez = ez ./ sqrt.(ez⋅ez) + eθ = eθ ./ sqrt.(eθ⋅eθ) + end return ez, eθ end -- cgit v1.2.1