In [1]:
import holoviews as hv
import numpy as np
import scistanpy as ssp

hv.extension('bokeh', inline=True)

Overview¶

This notebook contains an example workflow for modeling deep mutational scanning (DMS) data. If you're familiar with DMS, continue to the next section ("Bayesian Analysis of DMS"). Otherwise, the next section provides an overview of the core concepts:

DMS Background¶

When engineering a protein, how well it performs a desired task (e.g., catalysis or binding) is commonly referred to as its "fitness". Deep mutational scanning is a method for assigning fitness labels to different protein variants at scale. While the exact implementation varies, a typical DMS study follows the workflow below:

  1. We begin with a population of organisms (typically a model organism such as E. coli or S. cerevisiae). Within this population are subpopulations consisting of organisms carrying DNA that encodes a specific protein variant--each subpopulation is defined by the variant DNA it carries.
  2. DNA is harvested from the full population and sequenced by next-generation sequencing. This results in a set number of reads (or "counts" in the context of DMS), with the number of reads pertaining to each subpopulation reflecting the overall frequency of that subpopulation within the full population.
  3. The population is then subjected to a selective pressure. Importantly, the ability of organisms to resist this selective pressure is tied to the fitness of the protein of interest. For example, if the goal were to engineer a protein critical for the production of a core element of cellular metabolism, the population would be grown in an environment absent that element, meaning that only subpopulations whose proteins were active could grow and propagate.
  4. Once selection is complete, next-generation sequencing is performed again. Subpopulations expressing proteins with higher fitness should be enriched in the returned counts relative to their initial abundance, and vice versa.
  5. "Fitness" for a protein is then defined as the ratio of normalized ending counts (where "normalization" is relative to the wild-type (unmutated) protein) to normalized beginning counts. The next section covers this in more detail.

Bayesian Analysis of DMS¶

Notably, DMS does not provide direct readouts of fitness. Instead, fitness values must be inferred. While, again, the exact procedure for doing this will vary with the DMS study, a typical approach is to take the relative ratio of normalized ending counts to normalized starting counts as below:

$$ \begin{align*} f_k = \frac{c_{k, f} / c_{wt, f}}{c_{k, i} / c_{wt, i}} \end{align*}, $$

where $c_{k, f}$ and $c_{k, i}$ are the final and starting counts for variant $k$, respectively, and $c_{wt, f}$ and $c_{wt, i}$ are the final and starting counts for the wild-type protein. Performing the above calculation for all variants gives us their fitnesses.
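As a quick illustration of the calculation, here is a plain-NumPy sketch with made-up counts for three hypothetical variants (the values and names are purely illustrative):

import numpy as np

# Hypothetical counts for three variants, before and after selection
c_i = np.array([120.0, 45.0, 300.0])  # starting counts per variant
c_f = np.array([480.0, 15.0, 310.0])  # final counts per variant
c_wt_i, c_wt_f = 200.0, 210.0         # wild-type counts before/after selection

# f_k = (c_{k,f} / c_{wt,f}) / (c_{k,i} / c_{wt,i})
fitness = (c_f / c_wt_f) / (c_i / c_wt_i)
print(fitness)  # > 1: enriched relative to wild type; < 1: depleted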

The value $f_k$ is also commonly called the "enrichment ratio" of the DMS study. Importantly, it approximates both (i) the abundance of each variant relative to other variants in the population at a given timepoint (hence division by the wild-type counts in both the numerator and denominator) and (ii) the abundance of a given variant relative to itself across timepoints (hence division of the end timepoint by the starting timepoint). The word "approximates" is stressed here, as the counts returned by NGS are only an indirect correlate of true variant abundance.

As an analogy, suppose we had a bag full of marbles (variants) and drew 10 marbles from it, 4 blue and 6 green ("blue" being one variant and "green" being another). We cannot know with certainty the true proportions of marbles in that bag. Absent other information, a guess of 40% blue and 60% green seems most reasonable, but we intuitively know that it could be something else: 45% blue and 55% green might be the actual proportions. It's even possible, however unlikely, that there are only 4 blue marbles in the entire bag and we just so happened to pick them all, in which case, assuming the bag contains >>10 total marbles, the proportion of blue marbles is vanishingly small. The key point is that, because our marble counts are the result of a random process, there will always be some measure of uncertainty in our inference of the true proportions.

NGS can be thought of as drawing counts of specific variants from a population of unknown proportions. Just as with the marble example, absent additional information, the relative ratios of counts are the most probable reflection of the true proportions, and this is, of course, what the enrichment ratio equation above calculates. "Most probable" does not necessarily mean "probable", however, and the goal of Bayesian analysis is, in addition to identifying the most probable solution, to quantify how probable a given solution may be.

With all background and motivation out of the way, we can now build a Bayesian model of the typical DMS scenario described above. One way to think about constructing a Bayesian model of any observable is to reconceptualize the data collection process as a data generation process. The output of this process is our observables, which are a reflection of all underlying contributors of uncertainty (modeled as "parameters" in SciStanPy) and their deterministic transformations (modeled as "transformations" in SciStanPy). For example, we have already established that one contributor of uncertainty is the sequencing process itself, which outputs discrete counts reflecting the true abundance of each variant in the sequenced population. A natural way to model this situation is a Dirichlet distribution followed by a multinomial distribution:

$$ \begin{align*} \mathbf{\theta_i} &\sim \text{Dirichlet}(\mathbf{\alpha}) \\ \mathbf{c}_i &\sim \text{Multinomial}(\mathbf{\theta_i}), \end{align*} $$

where $\alpha \in \mathbb{R}^K$ is a hyperparameter controlling our prior beliefs about the initial distribution of proportions, $\mathbf{c}_i \in \mathbb{W}^{K}$ defines the starting counts for $K$ total proteins, and $\mathbf{\theta_i} \in \{\mathbf{\theta} \in \mathbb{R}^K \mid 0 \leq \theta_{k} \leq 1, \sum_{k=1}^K \theta_{k} = 1\}$ their proportions (i.e., a point on the probability simplex).

The above is the first step of our "data generation" process. The next step is to apply the selective pressure. We will model the selection process as applying a multiplicative correction (i.e., an enrichment ratio) to the input proportions, not the counts:

$$ \begin{align*} \mathbf{f} &\sim \text{Exponential}(\beta) \\ \mathbf{\theta_f} &= \frac{\mathbf{\theta_i} * \mathbf{f}}{\sum_{k=1}^K \theta_{i_k} * f_k}, \end{align*} $$

where $\mathbf{f} \in \mathbb{R}^K$ with $f_k \geq 0$, $*$ indicates elementwise multiplication, and $\mathbf{\theta_f}$ gives the proportions after selection. Two things should be highlighted:

  1. We place an exponential prior on $\mathbf{f}$. This is to reflect the belief that variants are more likely to have low fitness than high fitness. The extent to which this belief is enforced is controlled by the hyperparameter $\beta$.
  2. $\mathbf{\theta_f}$ is not drawn from a distribution. Instead, it is modeled as a deterministic transformation of two parameters, $\mathbf{\theta_i}$ and $\mathbf{f}$. Implicitly, then, our uncertainty in the value of $\mathbf{\theta_f}$ results from our uncertainty in both the initial population proportions and the fitness values. In terms of data generation, $\mathbf{\theta_f}$ is found at a later step in the generation process than the other parameters (it is further along in the dependency graph). The sketch after this list makes the transformation concrete.
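In code, the selection step is a one-liner. Below is a plain-NumPy sketch of the equations above (not SciStanPy syntax; variable names are our own):

import numpy as np

rng = np.random.default_rng(0)

theta_i = rng.dirichlet(np.ones(10))     # initial proportions on the simplex
f = rng.exponential(scale=0.1, size=10)  # fitness values; f_k >= 0

# theta_f = theta_i * f / sum_k(theta_{i,k} * f_k): reweight by fitness,
# then renormalize so the result is again a valid probability simplex.
theta_f = theta_i * f / np.sum(theta_i * f)
assert np.isclose(theta_f.sum(), 1.0)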

From the transformed parameter $\mathbf{\theta_f}$, we can model our output counts using another multinomial distribution:

$$ \mathbf{c}_f \sim \text{Multinomial}(\mathbf{\theta_f}). $$

In all, this gives us the following model:

$$ \begin{align*} \mathbf{\theta_i} &\sim \text{Dirichlet}(\mathbf{\alpha}) \\ \mathbf{f} &\sim \text{Exponential}(\beta) \\ \mathbf{\theta_f} &= \frac{\mathbf{\theta_i} * \mathbf{f}}{\sum_{k=1}^K \theta_{i_k} * f_k} \\ \mathbf{c}_i &\sim \text{Multinomial}(\mathbf{\theta_i}) \\ \mathbf{c}_f &\sim \text{Multinomial}(\mathbf{\theta_f}) \end{align*} $$

SciStanPy is designed to allow models like the above to be encoded in a Pythonic way and, by extension, to allow us to infer values of unobservables ($\mathbf{f}$ in our example here) from observables, propagating uncertainty between different model components. In the case of DMS, then, fitting the above model in SciStanPy gives us a distribution of potential models, and so a distribution of potential fitness values, all informed by the collected data.

Fitting a DMS Model in SciStanPy¶

The above section covered the key concepts of DMS. It also covered the core differences between standard modeling of DMS data and Bayesian modeling of DMS data. This section demonstrates how to fit the above-described Bayesian model using SciStanPy:

To begin, let's just simulate some example data:

In [2]:
def sample_data():
    """
    Generate sample data for deep mutational scanning analysis.
    Returns:
        INPUT_COUNTS: Array of input counts for each variant.
        LOG_INPUT_FREQS: Log frequencies of input variants.
        LOG_OUTPUT_FREQS: Log frequencies of output variants after selection.
    """
    # Sample input counts
    rng = np.random.default_rng(1025)
    input_freqs = rng.dirichlet(np.ones(10))
    log_input_freqs = np.log(input_freqs)
    input_counts = np.stack([rng.multinomial(10000, input_freqs)
                             for _ in range(3)])

    # Sample enrichment factors
    log_enrichment_factors = np.log(rng.exponential(0.1, size=(10,)))

    # Generate output counts after selection
    log_output_freqs = log_input_freqs + log_enrichment_factors
    log_output_freqs -= np.log(np.sum(np.exp(log_output_freqs))) # Normalize
    output_counts = np.stack([rng.multinomial(10000, np.exp(log_output_freqs))
                              for _ in range(3)])

    return {
        "INPUT_COUNTS": input_counts,
        "LOG_INPUT_FREQS": log_input_freqs,
        "OUTPUT_COUNTS": output_counts,
        "LOG_OUTPUT_FREQS": log_output_freqs,
        "LOG_ENRICHMENT_FACTORS": log_enrichment_factors
    }

SAMPLE_DATA = sample_data()

The above simulates a deep mutational scanning experiment of 10 variants with sequencing performed in triplicate at both the beginning and end. Note that our implementation works in log space, unlike the model described in the previous section. For 10 variants this is not likely to be necessary, but as the size of the probability simplex grows, numerical precision becomes a problem on the standard scale. The log scale is used here both (i) to demonstrate a unique feature of SciStanPy (log-simplexes are not currently natively supported in Stan or PyTorch) and (ii) to demonstrate a model that would be more practical for the typical DMS experiment (which tends to have >>10 variants).
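To see concretely why log space helps, here is a minimal sketch (assuming scipy is available for scipy.special.logsumexp) of normalizing log-frequencies without ever exponentiating the raw values:

import numpy as np
from scipy.special import logsumexp

rng = np.random.default_rng(0)

# Log-frequencies on a large simplex: many entries are so small that
# exponentiating them underflows to exactly 0.0 in float64.
log_freqs = rng.normal(loc=-800.0, scale=20.0, size=100_000)
print(np.exp(log_freqs).min())  # 0.0 -- underflow on the standard scale

# Normalizing in log space (subtracting the log-sum-exp) keeps full
# precision; this mirrors the normalization step used in the model below.
log_simplex = log_freqs - logsumexp(log_freqs)
print(logsumexp(log_simplex))   # ~0.0, i.e., the probabilities sum to 1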

We can now model the process using SciStanPy:

In [3]:
# All models inherit from `ssp.Model`
class DMSModel(ssp.Model):

    # Define the structure of the model in the `__init__` method
    def __init__(self, input_counts, output_counts):

        # We're going to register default data for the input and output counts.
        # This isn't necessary, but means you won't need to pass the observables
        # into later methods.
        super().__init__(
            default_data={"input_counts": input_counts, "output_counts": output_counts}
        )

        # We now define our priors. Let's assume that we expect our enrichment
        # ratios to follow an exponential distribution. The log-enrichment factors
        # will then follow an exponential-exponential (Gumbel) distribution.
        # Note: We define 10 independent log-enrichment factors using the "shape"
        # argument.
        # Note: The "beta" parameter here is the inverse of the scale parameter
        # in Numpy/Scipy.
        self.log_enrichment = ssp.parameters.ExpExponential(beta=10.0, shape=(10,))

        # We reason that the input and output counts are multinomially distributed
        # with some unknown frequencies, which are the values we want to infer. To
        # handle potentially small values, we will use an Exp-Dirichlet prior to
        # model the log-frequencies.
        self.log_input_freqs = ssp.parameters.ExpDirichlet(alpha=1.0, shape=(10,))

        # From the log-input frequencies and log-enrichment factors, we can define
        # a transformation that takes us to the output frequencies. We're in log
        # space, so this is just addition followed by normalization. Note that,
        # currently, all reductions and normalizations are performed over the last
        # axis (this cannot be changed yet).
        self.log_output_freqs = ssp.operations.normalize_log(
            self.log_input_freqs + self.log_enrichment
        )

        # Finally, we can model our observed counts at both the beginning and end
        # as multinomially distributed. Note that the name of the observable must
        # match the name we used when registering default data. If not registering
        # default data, you will need to provide the observables as keyword arguments
        # in the relevant functions (again, with matching names).
        # Note: We are using an alternate parametrization of the multinomial distribution
        # here to keep in log space.
        # Note: Numpy broadcasting rules apply, so the below will use the same
        # 10 log-frequencies for all 3 replicates. This is also why we need
        # `keepdims=True` when summing the counts to get `N`: shapes (3, 1) and
        # (10,) broadcast to (3, 10), while (3,) and (10,) do not.
        self.input_counts = ssp.parameters.MultinomialLogTheta(
            log_theta=self.log_input_freqs,
            N=input_counts.sum(axis=-1, keepdims=True),
            shape=(3, 10),
        )
        self.output_counts = ssp.parameters.MultinomialLogTheta(
            log_theta=self.log_output_freqs,
            N=output_counts.sum(axis=-1, keepdims=True),
            shape=(3, 10),
        )

That's it! The model is defined. You'll note that the variable names in the above model correspond neatly to the variables in the model definition from the previous section:

Name in Previous Section     Name in Model
$\ln{\mathbf{\theta_i}}$     log_input_freqs
$\ln{\mathbf{\theta_f}}$     log_output_freqs
$\ln{\mathbf{f}}$            log_enrichment
$\mathbf{c_i}$               input_counts
$\mathbf{c_f}$               output_counts

That is, SciStanPy models are designed to follow syntax that closely reflects standard probabilistic model definitions, with instance variables becoming model names and transformations automatically recorded.

Another note: SciStanPy models are, of course, Python classes, and can be extended in all the usual ways. This can allow for the construction of class hierarchies that greatly reduce the need for duplicated code.
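For instance, a subclass could encode a different prior belief without duplicating any model-building code. Below is a hypothetical sketch that assumes DMSModel had exposed its prior hyperparameter as an __init__ argument (the class above hard-codes beta=10.0, so this exact code will not run against it as written):

# Hypothetical: suppose DMSModel.__init__ accepted a `beta` argument and
# forwarded it to the ExpExponential prior. A subclass could then encode a
# stronger belief that most variants are deleterious (larger beta -> more
# mass on low enrichment ratios) in a single line:
class StrictPriorDMSModel(DMSModel):

    def __init__(self, input_counts, output_counts):
        super().__init__(input_counts, output_counts, beta=100.0)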

Now, what can we do with our model? Let's create an instance of it and test out some SciStanPy operations. First up, let's do a prior predictive check:

In [4]:
# Build an instance
EXAMPLE_MODEL = DMSModel(
    input_counts=SAMPLE_DATA["INPUT_COUNTS"],
    output_counts=SAMPLE_DATA["OUTPUT_COUNTS"]
)

# Run a prior predictive check
EXAMPLE_MODEL.prior_predictive()
Out[4]:
BokehModel(combine_events=True, render_bundle={'docs_json': {'143225d7-76f4-456e-b0bc-57edc7246af7': {'version…

The prior_predictive function brings up an interactive dashboard that lets you test out the effects of different values for hyperparameters on model observables and parameters. By default, updating any parameters with the sliders (and subsequently clicking "update model") will also update their values in the model. This allows you to explore model hyperparameter values interactively before moving on to fitting a model.

Depending on the parameter selected, you may want to increase the value for "Number of Experiments"--this is the number of draws made from the model to build the figure. The default ECDF view flattens the displayed array before calculation, which obscures any relationships within or between variables. You can additionally choose values for "Group By", which will plot separate lines (or violins) for each grouping dimension; any constants with appropriate dimensionality can also be selected as "Independent Variable" to plot relationships between components.

Let's say we are happy with our hyperparameter selection. We're not yet ready to commit to full MCMC sampling, but we want to get an estimate for our parameter values. We can perform a maximum likelihood estimate using PyTorch as the backend:

In [5]:
MLE = EXAMPLE_MODEL.mle(lr=0.01)
Epochs:   7%|▋         | 6823/100000 [01:03<14:21, 108.17it/s, -log pdf/pmf=245.32] 

By default, maximum likelihood estimation will run for 100,000 steps or until the loss (negative log-likelihood) has not decreased for 10 steps, whichever comes first. The output results object exposes a lot of additional functionality, which is covered in greater detail in the documentation. For the purposes of this example notebook, however, we'll look at just two capabilities: extracting maximum likelihood estimates and bootstrapping observations:

In [6]:
MLE_ESTIMATES = {k: v.mle for k, v in MLE.model_varname_to_mle.items()}
MLE_ESTIMATES
Out[6]:
{'log_enrichment': array([-0.88175307, -3.0444229 , -2.52572081, -2.72127213, -2.63158634,
        -3.36346962, -4.35208336, -2.55653866, -0.82543532, -3.89821476]),
 'log_input_freqs': array([-2.31319812, -1.04716232, -2.52949171, -2.33462835, -2.90948524,
        -2.95035977, -2.54973785, -6.09674183, -2.15798691, -2.64728865]),
 'input_counts': None,
 'output_counts': None}

A dictionary of results linking model varnames to MLE estimates can be accessed with the model_varname_to_mle property. Note that observables do not have an MLE--observables are ground truth, so there is nothing to estimate.

Let's compare the estimates above to our known values from our simulated experiment:

In [7]:
hv.Scatter(
    data={
        "x": SAMPLE_DATA["LOG_ENRICHMENT_FACTORS"],
        "y": MLE_ESTIMATES["log_enrichment"],
    }
)
Out[7]:
In [8]:
hv.Scatter(
    data={"x": SAMPLE_DATA["LOG_INPUT_FREQS"], "y": MLE_ESTIMATES["log_input_freqs"]}
)
Out[8]:
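To put a number on the agreement shown in these scatter plots, we can also compute a simple correlation (plain NumPy; this is an optional sanity check, not a SciStanPy method):

# Pearson correlation between true and estimated log-enrichment factors
r = np.corrcoef(
    SAMPLE_DATA["LOG_ENRICHMENT_FACTORS"], MLE_ESTIMATES["log_enrichment"]
)[0, 1]
print(f"Pearson r: {r:.3f}")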

As we'd expect considering we know the exact generative process, there is good alignment between the MLE and true values. Obviously, in practice, we do not know the generative process--our model defines what we believe it to be; we then fit the model and evaluate how well the fit model describes the data. One simple way to evaluate goodness of fit is to bootstrap samples from the MLE and perform a posterior predictive check on the results:

In [9]:
INFERENCE_OBJ = MLE.get_inference_obj() # Bootstrapping
INFERENCE_OBJ.run_ppc() # Checking fit
Out[9]:
BokehModel(combine_events=True, render_bundle={'docs_json': {'315e15f2-3cee-49ae-bc1b-8ab500f48833': {'version…

The above example is contrived, so the plots look a little too pristine compared to what you'd get in a real-world setting. Here's how to interpret them, though:

  1. The first plot shows the true rank (x-axis) of an observation against its values bootstrapped from the model. The x-axis itself carries no meaning--observations are simply ordered by true rank to simplify data presentation when there are thousands or more observed points. The y-axis, however, shows the distribution of bootstrapped values (grey) and associated observed values (gold). A well-fit model will have most gold dots falling within the shaded regions.
  2. The next plot has the same x-axis, but now shows the quantile of true observations relative to the bootstrapped distribution of observations. This figure is also designed to work with thousands or more data points, so it aggregates nearby points into hexagonal bins. It is not the most useful figure for data at the scale of this example; however, for larger datasets, what you would want to see is a uniform distribution of probability across all quantiles at all values, with the median line (grey) running down the center.
  3. The final plot is a quantile-quantile plot. Effectively, it is an ECDF over the quantile values of observables relative to bootstrapped samples. Because, by definition, each quantile should be uniformly represented in a sufficiently large sample from a perfectly calibrated model, a perfectly calibrated (and well-fit) model will have a diagonal ECDF, indicated in the figure by the dashed line. The annotation "Absolute Deviance" gives the absolute area between the observed and ideal ECDF curves and can be used to measure how well calibrated and fit a given model is relative to another.

It should also be noted that the above plot is interactive! Use the dropdown to update the plot for different observables.
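For intuition, the quantity behind the quantile and ECDF plots can be sketched in a few lines of NumPy. The helper below is a simplified stand-in for what the dashboard computes, not SciStanPy's actual implementation:

import numpy as np

def quantile_calibration(bootstrapped, observed):
    """bootstrapped: (n_draws, n_obs) array; observed: (n_obs,) array."""
    # Quantile of each observation within its bootstrapped distribution
    quantiles = (bootstrapped <= observed).mean(axis=0)

    # ECDF of those quantiles on a uniform grid; a calibrated model gives
    # roughly uniform quantiles, i.e., a diagonal ECDF.
    grid = np.linspace(0.0, 1.0, 101)
    ecdf = (quantiles[None, :] <= grid[:, None]).mean(axis=1)

    # Mean absolute deviation from the diagonal (akin to "Absolute Deviance")
    return np.mean(np.abs(ecdf - grid))

# Toy example: observations drawn from the same distribution as the
# bootstrapped samples -> deviation near zero.
rng = np.random.default_rng(0)
print(quantile_calibration(rng.normal(size=(1000, 500)), rng.normal(size=500)))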

Now, one final note on the INFERENCE_OBJ variable: it exposes a special property, inference_obj, which is an ArviZ InferenceData instance holding all bootstrapped data. Use it to plug directly into the ArviZ ecosystem:

In [10]:
INFERENCE_OBJ.inference_obj  # ArviZ InferenceData instance
Out[10]:
arviz.InferenceData
  • <xarray.Dataset> Size: 160kB
    Dimensions:                (chain: 1, draw: 1000, a: 10)
    Coordinates:
        log_input_freqs.alpha  (a) float64 80B 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0
    Dimensions without coordinates: chain, draw, a
    Data variables:
        log_enrichment         (chain, draw, a) float64 80kB -3.052 ... -0.9929
        log_input_freqs        (chain, draw, a) float64 80kB -3.075 ... -2.559

  • <xarray.Dataset> Size: 480kB
    Dimensions:                (chain: 1, draw: 1000, b: 3, a: 10)
    Coordinates:
        input_counts.N         (b) int64 24B 10000 10000 10000
        log_input_freqs.alpha  (a) float64 80B 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0
        output_counts.N        (b) int64 24B 10000 10000 10000
    Dimensions without coordinates: chain, draw, b, a
    Data variables:
        input_counts           (chain, draw, b, a) float64 240kB 979.0 ... 716.0
        output_counts          (chain, draw, b, a) float64 240kB 3.211e+03 ... 122.0

  • <xarray.Dataset> Size: 480B
    Dimensions:        (b: 3, a: 10)
    Dimensions without coordinates: b, a
    Data variables:
        input_counts   (b, a) int64 240B 965 3504 777 963 545 ... 755 21 1125 691
        output_counts  (b, a) int64 240B 3172 1273 481 471 312 ... 86 14 3936 108
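Because this is a standard InferenceData object, vanilla ArviZ functions accept it directly. For example, a summary table of the bootstrapped log-enrichment factors (az.summary is core ArviZ functionality, not SciStanPy-specific):

import arviz as az

# Summary statistics for the bootstrapped log-enrichment factors
az.summary(INFERENCE_OBJ.inference_obj, var_names=["log_enrichment"])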

Alright, we're satisfied with our maximum likelihood estimation and ready to move on to full Markov chain Monte Carlo sampling with Stan. Just call the below and SciStanPy will:

  1. Convert your SciStanPy model into Stan code.
  2. Compile that code.
  3. Run that code.
  4. Organize and return the results.
In [11]:
HMC_RES = EXAMPLE_MODEL.mcmc(iter_warmup=2000, iter_sampling=1000)

13:16:34 - cmdstanpy - INFO - compiling stan file /tmp/tmpmmet_h7o/model.stan to exe file /tmp/tmpmmet_h7o/model
13:17:03 - cmdstanpy - INFO - compiled model executable: /tmp/tmpmmet_h7o/model
13:17:03 - cmdstanpy - WARNING - Stan compiler has produced 2 warnings:
13:17:03 - cmdstanpy - WARNING -

--- Translating Stan model to C++ code ---
bin/stanc --filename-in-msg=model.stan --warn-pedantic --O1 --include-paths=/home/bwittmann/micromamba/envs/ssp_test/lib/python3.12/site-packages/scistanpy/model/stan --o=/tmp/tmpmmet_h7o/model.hpp /tmp/tmpmmet_h7o/model.stan
Warning: The parameter log_input_freqs_raw has no priors. This means either
    no prior is provided, or the prior(s) depend on data variables. In the
    later case, this may be a false positive.
Warning: The parameter log_enrichment has no priors. This means either no
    prior is provided, or the prior(s) depend on data variables. In the later
    case, this may be a false positive.

--- Compiling C++ code ---
g++ -std=c++17 -pthread -D_REENTRANT -Wno-sign-compare -Wno-ignored-attributes -Wno-class-memaccess     -DSTAN_THREADS -I stan/lib/stan_math/lib/tbb_2020.3/include    -O3 -I src -I stan/src -I stan/lib/rapidjson_1.1.0/ -I lib/CLI11-1.9.1/ -I stan/lib/stan_math/ -I stan/lib/stan_math/lib/eigen_3.4.0 -I stan/lib/stan_math/lib/boost_1.87.0 -I stan/lib/stan_math/lib/sundials_6.1.1/include -I stan/lib/stan_math/lib/sundials_6.1.1/src/sundials    -DBOOST_DISABLE_ASSERTS          -c -Wno-ignored-attributes   -x c++ -o /tmp/tmpmmet_h7o/model.o /tmp/tmpmmet_h7o/model.hpp

--- Linking model ---
g++ -std=c++17 -pthread -D_REENTRANT -Wno-sign-compare -Wno-ignored-attributes -Wno-class-memaccess     -DSTAN_THREADS -I stan/lib/stan_math/lib/tbb_2020.3/include    -O3 -I src -I stan/src -I stan/lib/rapidjson_1.1.0/ -I lib/CLI11-1.9.1/ -I stan/lib/stan_math/ -I stan/lib/stan_math/lib/eigen_3.4.0 -I stan/lib/stan_math/lib/boost_1.87.0 -I stan/lib/stan_math/lib/sundials_6.1.1/include -I stan/lib/stan_math/lib/sundials_6.1.1/src/sundials    -DBOOST_DISABLE_ASSERTS               -Wl,-L,"/home/bwittmann/.cmdstan/cmdstan-2.37.0/stan/lib/stan_math/lib/tbb"   -Wl,-rpath,"/home/bwittmann/.cmdstan/cmdstan-2.37.0/stan/lib/stan_math/lib/tbb"      /tmp/tmpmmet_h7o/model.o src/cmdstan/main_threads.o       -ltbb   stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_nvecserial.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_cvodes.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_idas.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_kinsol.a  stan/lib/stan_math/lib/tbb/libtbb.so.2 -o /tmp/tmpmmet_h7o/model
rm /tmp/tmpmmet_h7o/model.hpp /tmp/tmpmmet_h7o/model.o

13:17:03 - cmdstanpy - INFO - CmdStan start processing

chain 1 |          | 00:00 Status
chain 2 |          | 00:00 Status
chain 3 |          | 00:00 Status
chain 4 |          | 00:00 Status

13:17:05 - cmdstanpy - INFO - CmdStan done processing.

Converting CSV to NetCDF: 100%|██████████| 4/4 [00:27<00:00,  6.75s/it]

You can ignore the warnings about parameters not having priors. They are a result of how data is passed to the model: the priors depend on data variables, making these warnings false positives.

Now that we have the results, let's run some diagnostics:

In [12]:
_ = HMC_RES.diagnose()

Sample diagnostic tests results' summaries:
-------------------------------------------
0 of 4000 (0.00%) samples had a low energy.
0 of 4000 (0.00%) samples reached the maximum tree depth.
0 of 4000 (0.00%) samples diverged.

R_hat diagnostic tests results' summaries:
------------------------------------------
0 of 10 (0.00%) r_hats tests failed for log_enrichment.
0 of 10 (0.00%) r_hats tests failed for log_input_freqs.
0 of 10 (0.00%) r_hats tests failed for log_output_freqs.

Ess_bulk diagnostic tests results' summaries:
---------------------------------------------
0 of 10 (0.00%) ess_bulks tests failed for log_enrichment.
0 of 10 (0.00%) ess_bulks tests failed for log_input_freqs.
0 of 10 (0.00%) ess_bulks tests failed for log_output_freqs.

Ess_tail diagnostic tests results' summaries:
---------------------------------------------
0 of 10 (0.00%) ess_tails tests failed for log_enrichment.
0 of 10 (0.00%) ess_tails tests failed for log_input_freqs.
0 of 10 (0.00%) ess_tails tests failed for log_output_freqs.

The diagnostic tests and their meanings are described in greater detail in the full documentation. For our purposes here, what matters is that they all passed!

We stored the output of the function in a throwaway variable, as we don't need it; however, it contains the indices of failed samples and variables, where relevant. Also note that the SampleResults object has additional functionality for helping to diagnose failed samples, when they're present, via the plot_sample_failure_quantile_traces and plot_variable_failure_quantile_traces methods. See the full documentation for details on these methods.

Finally, we're going to want to do a posterior predictive check on our Stan samples. Note that, unlike in the MLE example, these samples should be representative of the full posterior, not just the MLE. Otherwise, the same workflow as for evaluating the MLE applies here:

In [13]:
HMC_RES.run_ppc()

Out[13]:
BokehModel(combine_events=True, render_bundle={'docs_json': {'1dffcae8-2c6d-4ab4-b57c-21e451fb8dc8': {'version…

As before, the fit looks reasonable. Also as before, note that we can access an underlying ArviZ InferenceData object for further analysis. It will have some additional fields compared to the MLE-associated one, reflecting the richer information content of HMC samples:

          In [14]:
          HMC_RES.inference_obj
          
          Out[14]:
          arviz.InferenceData
            • <xarray.Dataset> Size: 480kB
              Dimensions:           (chain: 4, draw: 1000, a: 10)
              Dimensions without coordinates: chain, draw, a
              Data variables:
                  log_enrichment    (chain, draw, a) float32 160kB -1.069 -3.236 ... -3.867
                  log_input_freqs   (chain, draw, a) float32 160kB -2.328 -1.053 ... -2.638
                  log_output_freqs  (chain, draw, a) float32 160kB -1.142 -2.034 ... -4.514
              xarray.Dataset
                • chain: 4
                • draw: 1000
                • a: 10
                  • log_enrichment
                    (chain, draw, a)
                    float32
                    -1.069 -3.236 ... -0.735 -3.867
                    array([[[-1.068688, -3.236369, ..., -1.05469 , -4.08567 ],
                            [-1.262433, -3.427649, ..., -1.232311, -4.283817],
                            ...,
                            [-0.932845, -3.147924, ..., -0.896691, -4.035187],
                            [-0.77195 , -2.941099, ..., -0.727211, -3.724041]],
                    
                           [[-1.046526, -3.162458, ..., -0.94579 , -4.045295],
                            [-0.956545, -3.104215, ..., -0.877397, -4.097172],
                            ...,
                            [-0.64909 , -2.812824, ..., -0.626214, -3.604606],
                            [-0.882263, -3.056537, ..., -0.805593, -3.905529]],
                    
                           [[-1.333376, -3.492797, ..., -1.334005, -4.496884],
                            [-1.137693, -3.253041, ..., -1.087074, -4.189535],
                            ...,
                            [-1.194294, -3.381477, ..., -1.101621, -4.325435],
                            [-1.203106, -3.426734, ..., -1.159328, -4.311948]],
                    
                           [[-1.623296, -3.79377 , ..., -1.538482, -4.637006],
                            [-1.534369, -3.703756, ..., -1.514616, -4.493636],
                            ...,
                            [-0.645705, -2.858608, ..., -0.678251, -3.628458],
                            [-0.844686, -2.973438, ..., -0.735004, -3.867251]]],
                          shape=(4, 1000, 10), dtype=float32)
                  • log_input_freqs
                    (chain, draw, a)
                    float32
                    -2.328 -1.053 ... -2.189 -2.638
                    array([[[-2.328192, -1.052654, ..., -2.146743, -2.650253],
                            [-2.30359 , -1.053144, ..., -2.144945, -2.63379 ],
                            ...,
                            [-2.328521, -1.047004, ..., -2.156443, -2.659002],
                            [-2.331172, -1.044762, ..., -2.165393, -2.639492]],
                    
                           [[-2.285582, -1.0619  , ..., -2.161284, -2.598546],
                            [-2.297102, -1.060655, ..., -2.149428, -2.589555],
                            ...,
                            [-2.320424, -1.054399, ..., -2.120126, -2.644726],
                            [-2.313679, -1.037961, ..., -2.181865, -2.686515]],
                    
                           [[-2.337826, -1.047284, ..., -2.141611, -2.589049],
                            [-2.306165, -1.057289, ..., -2.148264, -2.649077],
                            ...,
                            [-2.322836, -1.0529  , ..., -2.184392, -2.63638 ],
                            [-2.337645, -1.03459 , ..., -2.165001, -2.632983]],
                    
                           [[-2.320177, -1.038026, ..., -2.196003, -2.647448],
                            [-2.314257, -1.045154, ..., -2.130461, -2.659276],
                            ...,
                            [-2.336945, -1.042501, ..., -2.12371 , -2.692032],
                            [-2.302911, -1.04821 , ..., -2.189054, -2.637959]]],
                          shape=(4, 1000, 10), dtype=float32)
                  • log_output_freqs
                    (chain, draw, a)
                    float32
                    -1.142 -2.034 ... -0.9327 -4.514
                    array([[[-1.141578, -2.033722, ..., -0.946132, -4.480623],
                            [-1.138317, -2.053086, ..., -0.94955 , -4.489902],
                            ...,
                            [-1.140021, -2.073582, ..., -0.931789, -4.572843],
                            [-1.154388, -2.037127, ..., -0.94387 , -4.414799]],
                    
                           [[-1.152483, -2.044734, ..., -0.92745 , -4.464217],
                            [-1.157013, -2.068236, ..., -0.93019 , -4.590092],
                            ...,
                            [-1.152963, -2.050672, ..., -0.92979 , -4.43278 ],
                            [-1.150977, -2.049534, ..., -0.942494, -4.547079]],
                    
                           [[-1.144643, -2.013521, ..., -0.949056, -4.559373],
                            [-1.156536, -2.023007, ..., -0.948016, -4.55129 ],
                            ...,
                            [-1.154477, -2.071724, ..., -0.923361, -4.599162],
                            [-1.152518, -2.073091, ..., -0.936096, -4.556698]],
                    
                           [[-1.150744, -2.039067, ..., -0.941757, -4.491725],
                            [-1.148419, -2.048703, ..., -0.944871, -4.452706],
                            ...,
                            [-1.132241, -2.0507  , ..., -0.951551, -4.47008 ],
                            [-1.156215, -2.030266, ..., -0.932676, -4.513828]]],
                          shape=(4, 1000, 10), dtype=float32)

                • <xarray.Dataset> Size: 960kB
                  Dimensions:        (chain: 4, draw: 1000, b: 3, a: 10)
                  Dimensions without coordinates: chain, draw, b, a
                  Data variables:
                      input_counts   (chain, draw, b, a) int32 480kB 1020 3470 792 ... 24 1168 681
                      output_counts  (chain, draw, b, a) int32 480kB 3149 1251 454 ... 8 3920 119
                  xarray.Dataset
                    • chain: 4
                    • draw: 1000
                    • b: 3
                    • a: 10
                      • input_counts
                        (chain, draw, b, a)
                        int32
                        1020 3470 792 1027 ... 24 1168 681
                        array([[[[1020, ...,  712],
                                 ...,
                                 [ 992, ...,  679]],
                        
                                ...,
                        
                                [[ 924, ...,  705],
                                 ...,
                                 [ 963, ...,  751]]],
                        
                        
                               ...,
                        
                        
                               [[[1024, ...,  743],
                                 ...,
                                 [1018, ...,  738]],
                        
                                ...,
                        
                                [[ 966, ...,  704],
                                 ...,
                                 [ 954, ...,  681]]]], shape=(4, 1000, 3, 10), dtype=int32)
                      • output_counts
                        (chain, draw, b, a)
                        int32
                        3149 1251 454 502 ... 71 8 3920 119
                        array([[[[3149, ...,   97],
                                 ...,
                                 [3232, ...,  116]],
                        
                                ...,
                        
                                [[3141, ...,  102],
                                 ...,
                                 [3077, ...,  129]]],
                        
                        
                               ...,
                        
                        
                               [[[3123, ...,  117],
                                 ...,
                                 [3154, ...,  118]],
                        
                                ...,
                        
                                [[3184, ...,  109],
                                 ...,
                                 [3151, ...,  119]]]], shape=(4, 1000, 3, 10), dtype=int32)

                    • <xarray.Dataset> Size: 112kB
                      Dimensions:        (chain: 4, draw: 1000)
                      Dimensions without coordinates: chain, draw
                      Data variables:
                          lp__           (chain, draw) float32 16kB ...
                          accept_stat__  (chain, draw) float32 16kB ...
                          stepsize__     (chain, draw) float32 16kB ...
                          treedepth__    (chain, draw) int32 16kB 7 7 8 7 7 7 7 7 ... 7 7 7 7 7 7 8 7
                          n_leapfrog__   (chain, draw) int32 16kB ...
                          divergent__    (chain, draw) int32 16kB 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0
                          energy__       (chain, draw) float32 16kB 270.7 269.4 275.2 ... 284.3 285.1
                      xarray.Dataset
                        • chain: 4
                        • draw: 1000
                          • lp__
                            (chain, draw)
                            float32
                            ...
                            [4000 values with dtype=float32]
                          • accept_stat__
                            (chain, draw)
                            float32
                            ...
                            [4000 values with dtype=float32]
                          • stepsize__
                            (chain, draw)
                            float32
                            ...
                            [4000 values with dtype=float32]
                          • treedepth__
                            (chain, draw)
                            int32
                            7 7 8 7 7 7 7 7 ... 7 7 7 7 7 7 8 7
                            array([[7, 7, 8, ..., 6, 7, 7],
                                   [7, 7, 7, ..., 7, 7, 7],
                                   [6, 7, 7, ..., 7, 6, 7],
                                   [7, 7, 8, ..., 7, 8, 7]], shape=(4, 1000), dtype=int32)
                          • n_leapfrog__
                            (chain, draw)
                            int32
                            ...
                            [4000 values with dtype=int32]
                          • divergent__
                            (chain, draw)
                            int32
                            0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0
                            array([[0, 0, 0, ..., 0, 0, 0],
                                   [0, 0, 0, ..., 0, 0, 0],
                                   [0, 0, 0, ..., 0, 0, 0],
                                   [0, 0, 0, ..., 0, 0, 0]], shape=(4, 1000), dtype=int32)
                          • energy__
                            (chain, draw)
                            float32
                            270.7 269.4 275.2 ... 284.3 285.1
                            array([[270.7353 , 269.35608, 275.17273, ..., 277.98468, 278.795  , 276.4616 ],
                                   [283.55234, 284.7645 , 282.1037 , ..., 281.1396 , 280.9188 , 280.6225 ],
                                   [288.111  , 288.29886, 281.03134, ..., 280.14832, 286.42044, 286.70654],
                                   [279.701  , 278.34268, 272.34003, ..., 281.4624 , 284.348  , 285.05832]],
                                  shape=(4, 1000), dtype=float32)

                        • <xarray.Dataset> Size: 240B
                          Dimensions:        (b: 3, a: 10)
                          Dimensions without coordinates: b, a
                          Data variables:
                              input_counts   (b, a) int32 120B 965 3504 777 963 545 ... 755 21 1125 691
                              output_counts  (b, a) int32 120B 3172 1273 481 471 312 ... 86 14 3936 108
                          xarray.Dataset
                            • b: 3
                            • a: 10
                              • input_counts
                                (b, a)
                                int32
                                965 3504 777 963 ... 21 1125 691
                                array([[ 965, 3504,  777,  963,  545,  538,  799,   21, 1173,  715],
                                       [ 994, 3476,  831,  970,  535,  492,  790,   25, 1166,  721],
                                       [1006, 3551,  783,  973,  555,  540,  755,   21, 1125,  691]],
                                      dtype=int32)
                              • output_counts
                                (b, a)
                                int32
                                3172 1273 481 471 ... 14 3936 108
                                array([[3172, 1273,  481,  471,  312,  151,   72,   10, 3953,  105],
                                       [3213, 1309,  483,  511,  297,  128,   74,   16, 3850,  119],
                                       [3115, 1292,  514,  495,  300,  140,   86,   14, 3936,  108]],
                                      dtype=int32)

                            • <xarray.Dataset> Size: 1kB
                              Dimensions:           (metric: 5, a: 10)
                              Coordinates:
                                * metric            (metric) <U9 180B 'mcse_mean' 'mcse_sd' ... 'r_hat'
                              Dimensions without coordinates: a
                              Data variables:
                                  log_enrichment    (metric, a) float64 400B 0.01 0.01 0.01 ... 1.0 1.0 1.0
                                  log_input_freqs   (metric, a) float64 400B 0.0 0.0 0.0 0.0 ... 1.0 1.0 1.0
                                  log_output_freqs  (metric, a) float64 400B 0.0 0.0 0.0 0.0 ... 1.0 1.0 1.0
                              xarray.Dataset
                                • metric: 5
                                • a: 10
                                • metric
                                  (metric)
                                  <U9
                                  'mcse_mean' 'mcse_sd' ... 'r_hat'
                                  array(['mcse_mean', 'mcse_sd', 'ess_bulk', 'ess_tail', 'r_hat'], dtype='<U9')
                                • log_enrichment
                                  (metric, a)
                                  float64
                                  0.01 0.01 0.01 0.01 ... 1.0 1.0 1.0
                                  array([[1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02,
                                          1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02],
                                         [1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02,
                                          1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02, 1.00000e-02],
                                         [5.99790e+02, 6.00140e+02, 5.97170e+02, 6.02250e+02, 6.03920e+02,
                                          6.04780e+02, 6.15870e+02, 7.26230e+02, 5.99890e+02, 6.12910e+02],
                                         [7.49200e+02, 7.85320e+02, 7.46800e+02, 8.37950e+02, 7.42870e+02,
                                          7.85930e+02, 8.38430e+02, 1.04279e+03, 7.93680e+02, 8.03140e+02],
                                         [1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00,
                                          1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00]])
                                • log_input_freqs
                                  (metric, a)
                                  float64
                                  0.0 0.0 0.0 0.0 ... 1.0 1.0 1.0 1.0
                                  array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
                                          0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00],
                                         [0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
                                          0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00],
                                         [4.14929e+03, 5.99540e+03, 4.78792e+03, 5.00634e+03, 4.49178e+03,
                                          4.28000e+03, 5.19919e+03, 2.24705e+03, 5.62300e+03, 5.54843e+03],
                                         [2.81655e+03, 3.46402e+03, 3.07469e+03, 2.76553e+03, 3.24866e+03,
                                          2.85078e+03, 2.66078e+03, 2.28510e+03, 3.36446e+03, 3.00511e+03],
                                         [1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00,
                                          1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00]])
                                • log_output_freqs
                                  (metric, a)
                                  float64
                                  0.0 0.0 0.0 0.0 ... 1.0 1.0 1.0 1.0
                                  array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
                                          0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00],
                                         [0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
                                          0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00],
                                         [3.94222e+03, 3.90681e+03, 4.39845e+03, 4.19513e+03, 3.91825e+03,
                                          3.79242e+03, 4.10996e+03, 4.43777e+03, 4.16450e+03, 3.82122e+03],
                                         [3.39500e+03, 2.56051e+03, 3.32536e+03, 3.47363e+03, 2.86068e+03,
                                          3.11030e+03, 2.46409e+03, 2.67234e+03, 3.00552e+03, 2.65292e+03],
                                         [1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00,
                                          1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00]])
                                • metric
                                  PandasIndex
                                  PandasIndex(Index(['mcse_mean', 'mcse_sd', 'ess_bulk', 'ess_tail', 'r_hat'], dtype='object', name='metric'))

                            • <xarray.Dataset> Size: 1kB
                              Dimensions:           (metric: 5, a: 10)
                              Coordinates:
                                * metric            (metric) <U9 180B 'ess_bulk' 'ess_tail' ... 'r_hat'
                              Dimensions without coordinates: a
                              Data variables:
                                  log_enrichment    (metric, a) float64 400B 599.8 600.1 597.2 ... 1.0 1.0 1.0
                                  log_input_freqs   (metric, a) float64 400B 4.149e+03 5.995e+03 ... 1.0 1.0
                                  log_output_freqs  (metric, a) float64 400B 3.942e+03 3.907e+03 ... 1.0 1.0

                            • <xarray.Dataset> Size: 12kB
                              Dimensions:                 (chain: 4, draw: 1000)
                              Dimensions without coordinates: chain, draw
                              Data variables:
                                  low_ebfmi               (chain, draw) bool 4kB False False ... False False
                                  max_tree_depth_reached  (chain, draw) bool 4kB False False ... False False
                                  diverged                (chain, draw) bool 4kB False False ... False False

                                • <xarray.Dataset> Size: 198B
                                  Dimensions:           (metric: 3, a: 10)
                                  Coordinates:
                                    * metric            (metric) <U9 108B 'r_hat' 'ess_bulk' 'ess_tail'
                                  Dimensions without coordinates: a
                                  Data variables:
                                      log_enrichment    (metric, a) bool 30B False False False ... False False
                                      log_input_freqs   (metric, a) bool 30B False False False ... False False
                                      log_output_freqs  (metric, a) bool 30B False False False ... False False
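The last Dataset above is just the convergence summary thresholded into pass/fail form: an entry is True only where a variable fails one of the checks, and here every entry is False. The cell below gives a minimal, self-contained sketch of how such a table could be derived with xarray. The cutoffs (R_HAT_MAX, ESS_MIN) and the toy summary array are illustrative assumptions, not necessarily the defaults SciStanPy applies.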

                              In [ ]:
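import numpy as np
import pandas as pd
import xarray as xr

# Rule-of-thumb cutoffs (assumptions for illustration; SciStanPy's own
# defaults may differ).
R_HAT_MAX = 1.01  # split-chain R-hat should sit near 1.0 when chains agree
ESS_MIN = 400     # common minimum for bulk/tail effective sample size

# Small stand-in for the per-variable summary printed above, built here so
# the sketch runs on its own (values loosely mimic log_enrichment).
metrics = ["ess_bulk", "ess_tail", "mcse_mean", "mcse_sd", "r_hat"]
summary = xr.DataArray(
    np.vstack([
        np.full(10, 600.0),  # ess_bulk
        np.full(10, 800.0),  # ess_tail
        np.full(10, 0.01),   # mcse_mean
        np.full(10, 0.01),   # mcse_sd
        np.full(10, 1.0),    # r_hat
    ]),
    dims=("metric", "a"),
    coords={"metric": metrics},
    name="log_enrichment",
)

# Rebuild the boolean "failed tests" table: True wherever a check fails.
failed = xr.concat(
    [
        summary.sel(metric="r_hat", drop=True) > R_HAT_MAX,
        summary.sel(metric="ess_bulk", drop=True) < ESS_MIN,
        summary.sel(metric="ess_tail", drop=True) < ESS_MIN,
    ],
    dim=pd.Index(["r_hat", "ess_bulk", "ess_tail"], name="metric"),
)
print(bool(failed.any()))  # False -> every check passed for this variable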