## _____ ____ _ _ _____ _____ ____ _____
## / ____|/ __ \| | | | __ \ / ____|/ __ \| __ \
## | (___ | | | | | | | |__) | (___ | | | | |__) |
## \___ \| | | | | | | _ / \___ \| | | | ___/
## ____) | |__| | |__| | | \ \ ____) | |__| | |
## |_____/ \____/ \____/|_| \_\_____/ \____/|_|
## Alex Holehouse (Pappu Lab and Holehouse Lab) and Jared Lalmansing (Pappu lab)
## Simulation analysis package
## Copyright 2014 - 2026
##
"""
sshdx - HDX protection factors via the Best-Vendruscolo model.
Predicts per-residue ln(protection factor) (ln P) for backbone amide
hydrogen-deuterium exchange (HDX) from a structural ensemble, using the
empirical Best-Vendruscolo relation
ln P_i = beta_c * N_c(i) + beta_h * N_h(i) + beta_0
where, for each residue ``i`` with a backbone amide N-H:
* ``N_c(i)`` is the number of *heavy atoms* in the protein within
``contact_cutoff`` (default 6.5 A) of the amide N of residue ``i``,
ignoring residues whose sequence separation from ``i`` is at most
``exclude_neighbours`` (default 2);
* ``N_h(i)`` is the number of *backbone* H-bonds the amide H of residue
``i`` forms as donor to a backbone carbonyl O at sequence separation
``> exclude_neighbours`` (per-frame H-bonds are detected with
``mdtraj.wernet_nilsson``).
Both counts are returned per frame per residue, so the resulting
``(n_frames, n_residues)`` arrays plug directly into the SOURSOP BME /
COPER reweighters. The optional ``weights`` argument collapses the frame
axis to a single per-residue ensemble mean (per the package-wide
:func:`soursop.ssutils.validate_weights` contract).
Proline (no backbone H) and any residue lacking a recognisable backbone
amide H are dropped from the residue list; the function returns the
residue indices it covered alongside the data array.
Public entry points
-------------------
* :func:`compute_Nc` - per-residue heavy-atom contacts (per frame).
* :func:`compute_Nh` - per-residue backbone H-bonds (per frame).
* :func:`compute_protection_factors` - per-residue ln(P) (per frame, or
optionally collapsed via ``weights``).
References
----------
* Best, R. B. & Vendruscolo, M. *Structural Interpretation of Hydrogen
Exchange Protection Factors in Proteins.* Structure **14**, 97-106
(2006). doi:`10.1016/j.str.2005.09.012
<https://doi.org/10.1016/j.str.2005.09.012>`_.
* Vendruscolo, M., Paci, E., Dobson, C. M. & Karplus, M. *Rare
Fluctuations of Native Proteins Sampled by Equilibrium Hydrogen
Exchange.* J. Am. Chem. Soc. **125**, 15686-15687 (2003).
**Author(s):** Alex Holehouse
"""
import mdtraj as md
import numpy as np
from .ssexceptions import SSException
from .ssutils import validate_weights, weighted_mean
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# Defaults (Best-Vendruscolo 2006)
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
#: Heavy-atom contact weight (Best & Vendruscolo, Structure 14, 2006).
DEFAULT_BETA_C = 0.35
#: H-bond weight (Best & Vendruscolo, Structure 14, 2006).
DEFAULT_BETA_H = 2.0
#: Intercept; defaults to 0 in Best-Vendruscolo's published form.
DEFAULT_BETA_0 = 0.0
#: Heavy-atom contact cutoff, in **nanometres** (6.5 A).
DEFAULT_CONTACT_CUTOFF_NM = 0.65
#: Sequence-separation exclusion: residues with ``|i - j| <= this`` are
#: excluded from both N_c and N_h.
DEFAULT_EXCLUDE_NEIGHBOURS = 2
#: Atom-name fallbacks for the backbone amide hydrogen across common
#: force fields (CHARMM/AMBER use ``H`` or ``HN``; some N-terminal
#: parameterisations use ``H1``).
_BACKBONE_H_NAMES = ("H", "HN", "H1")
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
def _backbone_nh_map(topology):
"""Resolve backbone amide N+H atom indices per residue.
Skips residues with no backbone N (rare hetero residues), no
recognisable backbone H name (e.g. proline), so the returned arrays
are aligned 1:1.
Returns
-------
residue_indices : numpy.ndarray (n_res,)
n_atom_indices : numpy.ndarray (n_res,)
h_atom_indices : numpy.ndarray (n_res,)
"""
res_idx, n_idx, h_idx = [], [], []
for residue in topology.residues:
if residue.name == "PRO":
continue
try:
n_atom = next(a for a in residue.atoms if a.name == "N")
except StopIteration:
continue
h_atom = None
for cand in _BACKBONE_H_NAMES:
for a in residue.atoms:
if a.name == cand:
h_atom = a
break
if h_atom is not None:
break
if h_atom is None:
continue
res_idx.append(residue.index)
n_idx.append(n_atom.index)
h_idx.append(h_atom.index)
return (
np.asarray(res_idx, dtype=int),
np.asarray(n_idx, dtype=int),
np.asarray(h_idx, dtype=int),
)
def _backbone_carbonyl_o_map(topology):
"""Atom indices of backbone carbonyl O per residue (acceptor)."""
res_idx, o_idx = [], []
for residue in topology.residues:
try:
o_atom = next(a for a in residue.atoms if a.name == "O")
except StopIteration:
continue
res_idx.append(residue.index)
o_idx.append(o_atom.index)
return np.asarray(res_idx, dtype=int), np.asarray(o_idx, dtype=int)
def _heavy_atom_residue_map(topology):
"""Atom indices and residue indices of all non-hydrogen atoms."""
atom_idx, res_idx = [], []
for a in topology.atoms:
if a.element is None or a.element.symbol == "H":
continue
atom_idx.append(a.index)
res_idx.append(a.residue.index)
return np.asarray(atom_idx, dtype=int), np.asarray(res_idx, dtype=int)
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
[docs]
def compute_Nc(
protein,
contact_cutoff=DEFAULT_CONTACT_CUTOFF_NM,
exclude_neighbours=DEFAULT_EXCLUDE_NEIGHBOURS,
stride=1,
):
"""Per-residue per-frame heavy-atom contact count ``N_c(i)``.
``N_c(i)`` is the number of protein heavy atoms within
``contact_cutoff`` of the backbone amide N of residue ``i``, excluding
atoms in residues at sequence separation ``|i - j| <=
exclude_neighbours``. The cutoff is in **nanometres** (matching
mdtraj's convention).
Parameters
----------
protein : soursop.ssprotein.SSProtein
contact_cutoff : float, optional
Distance cutoff in nm. Default ``0.65`` (= 6.5 A,
Best-Vendruscolo).
exclude_neighbours : int, optional
Sequence-separation exclusion radius. Default ``2``.
stride : int, optional
Frame subsample. Default ``1``.
Returns
-------
residue_indices : numpy.ndarray, shape (n_res,)
Zero-based residue indices for which N_c is defined (the
backbone-amide residues; proline and any residue lacking a
recognisable backbone H are dropped).
Nc : numpy.ndarray, shape (n_frames, n_res), int
Per-frame, per-residue heavy-atom contact counts.
Raises
------
SSException
If the protein has no recognisable backbone amide N+H residues.
"""
top = protein.traj.topology
res_NH, n_atom_idx, _ = _backbone_nh_map(top)
heavy_atom_idx, heavy_res_idx = _heavy_atom_residue_map(top)
if len(res_NH) == 0:
raise SSException("compute_Nc: no residues with a backbone amide N+H found")
traj = protein.traj[::stride] if stride != 1 else protein.traj
n_frames = traj.n_frames
Nc = np.zeros((n_frames, len(res_NH)), dtype=int)
for k, (i_res, n_idx) in enumerate(zip(res_NH, n_atom_idx)):
mask = np.abs(heavy_res_idx - i_res) > exclude_neighbours
eligible = heavy_atom_idx[mask]
if len(eligible) == 0:
continue
pairs = np.column_stack([np.full(len(eligible), n_idx, dtype=int), eligible])
d = md.compute_distances(traj, pairs) # nm, shape (n_frames, n_pairs)
Nc[:, k] = np.sum(d < contact_cutoff, axis=1)
return res_NH, Nc
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
[docs]
def compute_Nh(
protein,
exclude_neighbours=DEFAULT_EXCLUDE_NEIGHBOURS,
stride=1,
):
"""Per-residue per-frame backbone H-bond count ``N_h(i)``.
Counts backbone H-bonds in which the **amide H of residue i** is the
donor and a backbone carbonyl O is the acceptor, at sequence
separation ``|i - j| > exclude_neighbours``. Per-frame H-bonds are
detected by :func:`mdtraj.wernet_nilsson` (cone criterion on the
Donor-H...Acceptor geometry).
Parameters
----------
protein : soursop.ssprotein.SSProtein
exclude_neighbours : int, optional
Sequence-separation exclusion radius. Default ``2``.
stride : int, optional
Frame subsample. Default ``1``.
Returns
-------
residue_indices : numpy.ndarray, shape (n_res,)
Same residue list as :func:`compute_Nc`.
Nh : numpy.ndarray, shape (n_frames, n_res), int
"""
top = protein.traj.topology
res_NH, _, h_atom_idx = _backbone_nh_map(top)
res_O, o_atom_idx = _backbone_carbonyl_o_map(top)
h_to_res = dict(zip(h_atom_idx.tolist(), res_NH.tolist()))
o_to_res = dict(zip(o_atom_idx.tolist(), res_O.tolist()))
res_to_k = {int(r): k for k, r in enumerate(res_NH)}
traj = protein.traj[::stride] if stride != 1 else protein.traj
n_frames = traj.n_frames
Nh = np.zeros((n_frames, len(res_NH)), dtype=int)
hbonds_per_frame = md.wernet_nilsson(traj)
# mdtraj returns a list of length n_frames; each entry is an
# ``(n_hbonds, 3)`` int array: (donor_heavy_idx, h_idx, acceptor_heavy_idx).
for f, hb in enumerate(hbonds_per_frame):
if hb.shape[0] == 0:
continue
for h_idx, a_idx in zip(hb[:, 1], hb[:, 2]):
h_idx_i = int(h_idx)
a_idx_i = int(a_idx)
i_res = h_to_res.get(h_idx_i)
j_res = o_to_res.get(a_idx_i)
if i_res is None or j_res is None:
continue
if abs(i_res - j_res) > exclude_neighbours:
Nh[f, res_to_k[i_res]] += 1
return res_NH, Nh
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
[docs]
def compute_protection_factors(
protein,
beta_c=DEFAULT_BETA_C,
beta_h=DEFAULT_BETA_H,
beta_0=DEFAULT_BETA_0,
contact_cutoff=DEFAULT_CONTACT_CUTOFF_NM,
exclude_neighbours=DEFAULT_EXCLUDE_NEIGHBOURS,
stride=1,
weights=False,
etol=1e-7,
):
"""Per-residue ln(protection factor) via the Best-Vendruscolo formula.
``ln P_i = beta_c * N_c(i) + beta_h * N_h(i) + beta_0`` evaluated per
frame, then optionally collapsed to a single per-residue ensemble
mean by a per-frame weight vector. The output shape
``(n_frames, n_res)`` is the natural input for
:class:`soursop.ssbme.BME` / :class:`soursop.sscoper.COPER`
reweighting against experimental HDX protection factors.
Parameters
----------
protein : soursop.ssprotein.SSProtein
beta_c, beta_h, beta_0 : float, optional
Best-Vendruscolo coefficients. Defaults
``(0.35, 2.0, 0.0)``.
contact_cutoff : float, optional
Heavy-atom contact cutoff (nm). Default ``0.65``.
exclude_neighbours : int, optional
Sequence-separation exclusion radius. Default ``2``.
stride : int, optional
Frame subsample. Default ``1``.
weights : numpy.ndarray or False, optional
Optional per-frame weights collapsing the frame axis to a
per-residue mean (validated by
:func:`soursop.ssutils.validate_weights`). Default ``False``
(per-frame array returned).
etol : float, optional
Tolerance on ``sum(weights) == 1``. Default ``1e-7``.
Returns
-------
residue_indices : numpy.ndarray, shape (n_res,)
Residue indices for which ln(P) is defined.
lnP : numpy.ndarray
``(n_frames, n_res)`` per-frame ln(P) by default;
``(n_res,)`` ensemble mean when ``weights`` is supplied.
Raises
------
SSException
If the protein has no backbone amide N+H residues, or if
``weights`` fails validation.
"""
res_NH, Nc = compute_Nc(
protein,
contact_cutoff=contact_cutoff,
exclude_neighbours=exclude_neighbours,
stride=stride,
)
res_NH_check, Nh = compute_Nh(
protein,
exclude_neighbours=exclude_neighbours,
stride=stride,
)
if not np.array_equal(res_NH, res_NH_check):
raise SSException(
"Internal inconsistency: compute_Nc and compute_Nh returned "
"different residue lists"
)
lnP = beta_c * Nc.astype(np.float64) + beta_h * Nh.astype(np.float64) + beta_0
n_frames_total = protein.traj.n_frames
validated_weights = validate_weights(
weights, n_frames_total, stride=stride, etol=etol
)
if validated_weights is not False:
lnP = weighted_mean(lnP, validated_weights, axis=0)
return res_NH, lnP