Source code for soursop.ssnmr

##     _____  ____  _    _ _____   _____  ____  _____  
##   / ____|/ __ \| |  | |  __ \ / ____|/ __ \|  __ \ 
##  | (___ | |  | | |  | | |__) | (___ | |  | | |__) |
##   \___ \| |  | | |  | |  _  / \___ \| |  | |  ___/ 
##   ____) | |__| | |__| | | \ \ ____) | |__| | |     
##  |_____/ \____/ \____/|_|  \_\_____/ \____/|_|     

## Alex Holehouse (Pappu Lab and Holehouse Lab) and Jared Lalmansing (Pappu lab)
## ssnmr was largely written by Alex Keeley
## Simulation analysis package
## Copyright 2014 - 2026
##


"""
ssnmr - sequence-based random-coil chemical shift prediction.

This module predicts random-coil backbone chemical shifts (CA, CB, CO,
N, HN, HA) for an arbitrary amino-acid sequence, corrected for
temperature, pH and (optionally) perdeuteration, including support for
phosphorylated serine/threonine/tyrosine. The implementation ports the
Kjaergaard & Poulsen / Schwarzinger reference-shift and neighbour-
correction tables and is the basis for comparing simulated ensembles to
experimental NMR data.

The public entry point is ``compute_random_coil_chemical_shifts``; the
remaining functions are private helpers.

**Author(s):** Alex Keeley (with Alex Holehouse)
"""
from .ssexceptions import SSException
import re

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------



# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
[docs] def compute_random_coil_chemical_shifts(protein_sequence, temperature=25, pH=7.4, use_ggxgg=True, use_perdeuteration=False, asFloat=True): """Predict sequence-corrected random-coil chemical shifts. For a user-provided amino-acid sequence, predicts the random-coil backbone chemical shifts (CA, CB, CO, N, HN, HA) and applies sequence-context (nearest-neighbour), temperature and pH corrections. Reference shifts and general sequence-correction factors are from Kjaergaard & Poulsen (J. Biomol. NMR 2011, 50:157-165); temperature and glycine corrections are from Kjaergaard, Brander & Poulsen (J. Biomol. NMR 2011, 49:139-149); the correction-factor methodology follows Schwarzinger et al. (JACS 2001, 123:2970-2978); and the perdeuteration corrections are from Cavanagh, Fairbrother, Palmer, Rance & Skelton, *Protein NMR Spectroscopy*, 2nd ed. (Academic Press, 2007). The implementation is a port of the JavaScript tool by Alex Maltsev (NIH); see https://www1.bio.ku.dk/english/research/bms/research/sbinlab/randomchemicalshifts/ The input may be a standard one-letter sequence; phospho-residues can additionally be supplied using parenthesised three-letter codes (e.g. ``"AS(SEP)GA"`` for a phospho-serine). Glycine and proline produce masked placeholder values for atoms they lack (CB for glycine; N/HN for proline). Parameters ---------- protein_sequence : str Amino-acid sequence to predict shifts for. One-letter codes, with optional parenthesised multi-letter codes for phospho-residues (``SEP``/``PS``, ``TPO``/``PT``, ``PTR``/``PY``). temperature : float or int, optional Sample temperature in degrees Celsius, used for the temperature correction. Must be between 0 and 100. Default ``25``. pH : float or int, optional Sample pH, used for the pH (titratable-residue) correction. Must be between 0 and 14. Default ``7.4``. use_ggxgg : bool, optional Whether to apply the GGXGG-based neighbour correction for glycines. Default ``True``. use_perdeuteration : bool, optional Whether to apply perdeuterated correction factors. Cannot be combined with phospho-residues. Default ``False``. asFloat : bool, optional If ``True`` the output chemical shifts are floats; if ``False`` they are formatted strings. Default ``True``. Returns ------- list of dict One dictionary per residue in the input sequence, each containing the residue abbreviation (``'Res'``), its index (``'Index'``) and the six predicted shifts (``'CA'``, ``'CB'``, ``'CO'``, ``'N'``, ``'HN'``, ``'HA'``). Atoms absent for a residue type (glycine CB, proline N/HN, HA under perdeuteration) carry a masked placeholder. Raises ------ soursop.ssexceptions.SSException If ``temperature`` is outside 0-100 C, if ``pH`` is outside 0-14, or if ``use_perdeuteration`` is requested for a sequence containing phosphorylated residues. Example ------- >>> shifts = compute_random_coil_chemical_shifts('ASGAS', temperature=25, pH=7.4) >>> sorted(shifts[0].keys()) ['CA', 'CB', 'CO', 'HA', 'HN', 'Index', 'N', 'Res'] """ # sanity check temperature if temperature > 100 or temperature < 0: raise SSException('Temperature provided (%i) was non-physiological. Remember temperature should be in *celcius*.' %(temperature)) # pH sanity check if pH < 0 or pH > 14: raise SSException('pH provided (%i) was non-physiological. Remember pH should be in between 0 and 14.' %(pH)) # SETUP # The array 'key' is used to translate amino acid letter code into # numerical index. Value is -1 when there is no such amino acid letter key_aa1 = [0, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11, -1, 12, 13, 14, 15, 16, -1, 17, 18, -1, 19, -1] key_aa3 = { "ALA": 0, "CYS": 1, "ASP": 2, "GLU": 3, "PHE": 4, "GLY": 5, "HIS": 6, "ILE": 7, "LYS": 8, "LEU": 9, "MET": 10, "ASN": 11, "PRO": 12, "GLN": 13, "ARG": 14, "SER": 15, "THR": 16, "VAL": 17, "TRP": 18, "TYR": 19, "PSER": 20, "SEP": 20, "PS": 20, "PTHR": 21, "TPO": 21, "PT": 21, "PTYR": 22, "PTR": 22, "PY": 22} # The array 'sequence' keeps the set of amino acid indices for the given # protein. The array size is not fixed and will accommodate a given # sequence. sequence = [] # The XX_av arrays contain uncorrected random coil values for atom type XX at 5C and pH 6.5 ca_av = [ 52.747, 58.639, 54.586, 56.805, 57.804, 45.317, 55.933, 61.279, 56.542, 55.359, 55.565, 53.422, 63.236, 56.041, 56.287, 58.604, 62.151, 62.574, 57.464, 57.968, 58.235, 62.985, 57.977] cb_av = [ 19.048, 29.693, 40.942, 30.181, 39.482, 0, 29.974, 38.640, 33.024, 42.262, 32.677, 38.643, 32.155, 29.322, 30.786, 63.706, 69.877, 32.784, 29.262, 38.654, 65.725, 72.449, 38.698] co_av = [ 177.967, 174.736, 176.527, 176.707, 175.721, 174.363, 175.044, 176.451, 176.738, 177.580, 176.423, 175.436, 177.023, 176.118, 176.442, 174.798, 174.647, 176.330, 176.276, 175.773, 174.769, 174.566, 175.740] n_av = [125.922, 121.101, 121.778, 122.87, 121.691, 110.613, 120.581, 123.509, 123.571, 124.07, 122.428, 119.966, 0, 122.304, 123.182, 117.629, 116.375, 122.879, 122.099, 121.787, 118.870, 119.095, 121.935] hn_av = [8.575, 8.627, 8.572, 8.692, 8.438, 8.662, 8.653, 8.441, 8.592, 8.485, 8.63, 8.672, 0, 8.67, 8.61, 8.585, 8.413, 8.44, 8.267, 8.396, 9.134, 9.091, 8.336] ha_av = [4.293, 4.498, 4.588, 4.266, 4.644, 3.977, 4.669, 4.134, 4.287, 4.338, 4.478, 4.696, 4.435, 4.321, 4.319, 4.437, 4.343, 4.076, 4.672, 4.588, 4.435, 4.253, 4.596] # The XX_t arrays contain temperature corrections for atom type XX ca_t = [-2.2, -0.9, 2.78, 0.94, -4.74, 3.28, 7.76, -1.98, -0.76, 1.73, 4.09, 2.78, 1.12, 2.26, -1.37, -1.7, -0.03, -2.79, -2.69, -4.99, 1.38, -5.27, -3.68] cb_t = [4.73, 1.27, 6.53, 4.6, 2.42, 0, 15.54, 4.6, 2.41, 4.92, 9.37, 5.08, -0.19, 3.62, 3.54, 4.4, 2.15, 2.49, 3.07, 2.92, 4.82, 3.66, 0.01] co_t = [-7.09, -2.55, -4.81, -4.9, -6.9, -3.21, -8.3, -8.73, -7.12, -8.18, -8.17, - 6.11, -4.01, -5.7, -6.9, -4.67, -5.24, -8.09, -7.88, -7.73, -1.71, -3.00, -3.00] n_t = [-5.25, -8.2, -3.91, -3.7, -11.15, -6.15, 3.3, -12.73, -7.6, -2.85, -6.2, - 3.25, 0, -6.45, -5.3, -3.8, -6.7, -14.16, -10.1, -12, -22.69, -31.26, -25.64] hn_t = [-8.95, -7, -6.2, -6.46, -7.5, -9.1, -8.3, -7.78, -7.5, -7.45, -7.05, - 6.95, 0, -7.2, -7.05, -7.6, -7.25, -7.64, -7.8, -7.73, -6.47, -8.34, -8.26] ha_t = [0.69, 0, -0.06, 0.31, 0.4, -0.02, -0.93, 0.37, 0.38, 0.05, -0.48, - 2.92, -0.02, 0.26, 0.4, 0.05, -0.04, 0.47, 0.38, 0.53, 0.48, 1.36, 0.42] # Neighbor correction factors for Ca. Notice extra zero for 'no Residue' (last element). ca_a = [-0.007, -0.043, -0.019, -0.025, 0.168, -0.059, -0.036, -0.076, -0.003, 0.084, 0.004, 0.01, -0.206, 0, 0, 0.003, -0.007, -0.095, -0.003, 0.14, -0.304, -0.303, 0.264, 0] ca_b = [-0.156, -0.011, 0.055, -0.013, -0.018, 0.132, -0.041, -0.27, -0.148, -0.205, -0.096, 0.047, -2.249, 0, -0.128, -0.024, -0.087, -0.242, 0.24, -0.004, 0.180, -0.041, 0.040, 0] ca_c = [-0.076, 0.095, 0.086, -0.036, -0.301, 0.007, -0.066, -0.217, -0.082, -0.139, -0.039, 0.157, -0.072, 0, -0.062, 0.025, -0.053, -0.212, -0.226, -0.344, -0.160, -0.100, -0.403, 0] ca_d = [0.007, 0.044, 0.131, 0.007, 0.105, 0.061, 0.09, -0.003, 0.048, -0.031, 0.003, 0.1, -0.024, 0, 0.074, 0.073, 0.05, -0.01, 0.112, 0.123, -0.009, 0.058, 0.131, 0] # Neighbor correction factors for Cb. Notice extra zero for 'no Residue' (last element). cb_a = [0.017, 0.07, -0.011, -0.002, -0.086, -0.011, -0.012, 0.097, 0.033, 0.058, 0.036, - 0.006, 0.149, 0, 0.11, -0.243, 0.039, 0.066, -0.159, -0.102, 0.013, 0.085, -0.154, 0] cb_b = [-0.049, -0.117, -0.03, -0.096, -0.151, -0.128, -0.08, -0.087, -0.077, -0.231, -0.153, - 0.138, -0.701, 0, -0.098, -0.043, -0.044, -0.039, -0.343, -0.147, 0.076, 0.210, -0.511, 0] cb_c = [0.078, -0.025, -0.104, 0.054, 0.113, 0.031, 0.052, 0, 0.021, -0.08, -0.031, - 0.069, 0.102, 0, -0.005, -0.028, 0.259, 0.034, 0.102, 0.14, -0.175, 0.002, 0.081, 0] cb_d = [0.058, 0.049, -0.111, -0.004, -0.022, -0.067, -0.011, 0.099, 0.083, 0.119, 0.08, - 0.074, 0.107, 0, 0.031, 0.006, 0.027, 0.078, -0.145, -0.055, -0.065, -0.031, -0.016, 0] # Neighbor correction factors for Co. Notice extra zero for 'no Residue' (last element). co_a = [-0.043, -0.046, 0.043, -0.086, -0.144, -0.008, -0.07, -0.249, -0.055, -0.163, -0.138, 0.021, -0.228, 0, -0.066, 0.033, -0.025, -0.21, 0.05, -0.135, -0.136, -0.367, -0.031, 0] co_b = [-0.181, -0.075, -0.251, 0.065, -0.357, 0.546, -0.195, -0.113, -0.042, -0.083, 0.02, - 0.23, -2.01, 0, -0.047, 0.148, 0.193, -0.06, -0.183, -0.338, 0.245, 0.074, -0.251, 0] co_c = [0.062, -0.082, 0.143, 0, -0.546, 0.212, -0.194, -0.201, -0.068, -0.109, -0.123, 0.022, 0.096, 0, -0.046, -0.017, -0.112, -0.146, -0.665, -0.569, 0.002, -0.129, -0.605, 0] co_d = [0.005, -0.044, 0.177, 0.068, -0.007, 0.069, 0.01, -0.037, -0.04, -0.018, -0.015, 0.075, -0.069, 0, -0.051, 0.024, 0.01, -0.017, -0.002, 0.012, 0.123, 0.096, 0.052, 0] # Neighbor correction factors for N. Notice extra zero for 'no Residue' (last element). n_a = [0.003, -0.05, 0.041, 0.018, 0.212, -0.024, -0.066, -0.069, -0.036, 0.039, 0.06, - 0.035, -0.091, 0, -0.042, 0.009, -0.02, -0.079, -0.073, 0.203, -0.018, -0.053, 0.247, 0] n_b = [0.141, 0.147, -0.033, -0.035, -0.445, -0.043, -0.316, 0.249, 0.068, -0.158, -0.044, - 0.256, 1.433, 0, 0.17, 0.167, 0.229, 0.318, -0.231, -0.401, 0.721, 0.485, -0.532, 0] n_c = [-2.25, 1.102, -1.407, -0.432, 0.187, -2.086, -0.033, 3.166, 0.152, -0.642, -0.053, - 1.25, -1.027, 0, 0.157, 0.212, 0.881, 2.83, 0.051, 0.395, -0.758, 1.619, 0.232, 0] n_d = [-0.209, -0.072, -1.003, -0.139, -0.089, -0.445, -0.049, 0.649, 0.17, -0.12, 0.015, - 0.757, -0.082, 0, 0.183, -0.62, 0.014, 0.634, -0.764, -0.127, -0.821, -0.193, 0.081, 0] # Neighbor correction factors for Hn. Notice extra zero for 'no Residue' (last element). hn_a = [-0.002, -0.007, 0.001, 0, -0.011, 0.007, -0.009, -0.031, -0.005, -0.004, -0.009, 0.004, -0.024, 0, -0.004, 0.006, -0.001, -0.029, -0.044, -0.013, -0.010, -0.019, 0.043, 0] hn_b = [-0.036, 0.032, 0.01, -0.012, -0.034, 0.032, -0.005, -0.027, -0.024, -0.004, -0.156, 0.031, -0.007, 0, -0.013, -0.046, 0.031, -0.028, -0.023, -0.02, -0.013, -0.044, 0.109, 0] hn_c = [-0.101, 0.052, -0.148, -0.021, -0.265, -0.206, -0.115, 0.028, -0.078, -0.093, -0.041, 0.002, 0.079, 0, 0.017, -0.023, -0.016, 0.042, -0.53, -0.305, -0.123, 0.091, -0.344, 0] hn_d = [-0.072, -0.05, -0.104, -0.034, -0.121, 0.001, 0.017, 0.008, -0.003, -0.069, -0.021, - 0.095, -0.03, 0, 0.009, -0.132, -0.035, 0.021, -0.358, -0.151, -0.173, -0.056, -0.137, 0] # Neighbor correction factors for Ha. Notice extra zero for 'no Residue' (last element). ha_a = [-0.002, 0.006, 0.015, 0.01, -0.068, 0.014, -0.025, -0.007, -0.012, -0.019, -0.015, 0.001, 0.014, 0, -0.013, 0.006, 0.006, -0.003, -0.075, -0.063, 0.030, 0.027, -0.080, 0] ha_b = [-0.002, 0.042, 0.005, 0.009, -0.055, 0.032, -0.029, 0.035, 0.002, 0.016, 0.008, 0.016, 0.305, 0, 0.008, 0.069, 0.106, 0.047, -0.067, -0.05, 0.033, 0.073, -0.059, 0] ha_c = [-0.013, 0.022, -0.007, 0, -0.027, 0.014, 0.006, 0.025, 0.002, 0.012, 0.013, 0.003, -0.027, 0, 0.008, 0.043, 0.036, 0.02, -0.149, -0.037, 0.033, 0.008, -0.055, 0] ha_d = [-0.002, 0.006, -0.022, -0.005, -0.057, 0.005, -0.012, -0.011, -0.008, 0.001, -0.002, - 0.008, 0.005, 0, -0.007, -0.007, -0.012, -0.012, -0.166, -0.063, -0.028, -0.039, -0.069, 0] # Neighbor correction factors for Ca for Gly. gly_ca_a = [-0.011, -0.001, -0.085, -0.001, -0.048, 0, -0.05, -0.054, 0.016, -0.057, 0.019, -0.05, -0.221, 0, 0, -0.006, -0.003, -0.042, -0.082, -0.034, 0, 0, 0, 0] gly_ca_b = [-0.149, -0.046, -0.045, -0.14, -0.244, 0, -0.09, -0.19, -0.01, -0.188, - 0.03, -0.015, -0.8, -0.002, -0.07, -0.051, -0.041, -0.174, -0.193, -0.245, 0, 0, 0, 0] gly_ca_c = [0.076, 0.117, 0.329, 0.141, 0.072, 0, 0.02, 0.029, -0.061, 0.052, 0.114, 0.238, 0.033, 0.028, -0.01, 0.129, 0.134, 0.024, 0.11, 0.068, 0, 0, 0, 0] gly_ca_d = [0.017, 0.007, 0.066, 0.053, 0.006, 0, 0.01, 0.024, 0.013, -0.053, 0.011, 0.018, 0.048, 0.017, 0.02, -0.003, 0.001, 0.016, -0.08, -0.004, 0, 0, 0, 0] # Neighbor correction factors for Co for Gly. gly_co_a = [-0.113, -0.076, -0.112, -0.113, -0.297, 0, -0.137, -0.226, -0.075, -0.153, - 0.092, -0.085, -0.558, -0.039, -0.038, -0.073, -0.084, -0.232, -0.296, -0.303, 0, 0, 0, 0] gly_co_b = [-0.754, -0.46, -0.753, -0.44, -0.832, 0, -0.706, -0.558, -0.494, -0.491, - 0.424, -0.714, -2.799, -0.477, -0.444, -0.4, -0.203, -0.526, -0.565, -0.863, 0, 0, 0, 0] gly_co_c = [-0.042, -0.24, 0.165, -0.024, -0.23, 0, -0.174, -0.152, -0.187, -0.104, - 0.165, -0.079, -0.058, -0.148, -0.186, -0.13, -0.127, -0.154, -0.31, -0.23, 0, 0, 0, 0] gly_co_d = [-0.018, -0.056, 0.049, 0.013, -0.097, 0, -0.039, -0.029, -0.034, -0.006, - 0.024, -0.025, -0.024, -0.027, -0.021, -0.054, -0.047, -0.024, -0.185, -0.156, 0, 0, 0, 0] # Neighbor correction factors for N for Gly. gly_n_a = [-0.044, -0.081, -0.104, 0.094, -0.142, 0, -0.087, -0.176, -0.056, -0.094, - 0.084, -0.104, -0.115, -0.074, -0.039, -0.045, -0.055, -0.15, -0.234, -0.163, 0, 0, 0, 0] gly_n_b = [-0.03, -0.078, 0.045, -0.023, -0.415, 0, -0.427, -0.111, -0.143, -0.145, - 0.161, -0.149, -0.134, -0.097, -0.117, 0.032, 0, -0.054, -0.317, -0.381, 0, 0, 0, 0] gly_n_c = [-0.535, 2.6, 0.764, 1.235, 2.41, 0, 0.913, 4.157, 1.248, 0.809, 1.288, 0.692, 0.756, 1.304, 1.296, 2.114, 2.444, 3.754, 2.703, 2.709, 0, 0, 0, 0] gly_n_d = [-0.158, -0.003, -0.027, -0.094, -0.408, 0, 0.729, -0.03, -0.061, -0.143, - 0.03, -0.16, -0.157, -0.056, -0.031, -0.154, -0.024, -0.057, -0.821, -0.495, 0, 0, 0, 0] # Set pSER, pTHR, and pTYR Gly corrections to their SER, THR and TYR equivalents gly_ca_a[key_aa3["PSER"]] = gly_ca_a[key_aa3["SER"]] gly_ca_a[key_aa3["PTHR"]] = gly_ca_a[key_aa3["THR"]] gly_ca_a[key_aa3["PTYR"]] = gly_ca_a[key_aa3["TYR"]] gly_ca_b[key_aa3["PSER"]] = gly_ca_b[key_aa3["SER"]] gly_ca_b[key_aa3["PTHR"]] = gly_ca_b[key_aa3["THR"]] gly_ca_b[key_aa3["PTYR"]] = gly_ca_b[key_aa3["TYR"]] gly_ca_c[key_aa3["PSER"]] = gly_ca_c[key_aa3["SER"]] gly_ca_c[key_aa3["PTHR"]] = gly_ca_c[key_aa3["THR"]] gly_ca_c[key_aa3["PTYR"]] = gly_ca_c[key_aa3["TYR"]] gly_ca_d[key_aa3["PSER"]] = gly_ca_d[key_aa3["SER"]] gly_ca_d[key_aa3["PTHR"]] = gly_ca_d[key_aa3["THR"]] gly_ca_d[key_aa3["PTYR"]] = gly_ca_d[key_aa3["TYR"]] gly_co_a[key_aa3["PSER"]] = gly_co_a[key_aa3["SER"]] gly_co_a[key_aa3["PTHR"]] = gly_co_a[key_aa3["THR"]] gly_co_a[key_aa3["PTYR"]] = gly_co_a[key_aa3["TYR"]] gly_co_b[key_aa3["PSER"]] = gly_co_b[key_aa3["SER"]] gly_co_b[key_aa3["PTHR"]] = gly_co_b[key_aa3["THR"]] gly_co_b[key_aa3["PTYR"]] = gly_co_b[key_aa3["TYR"]] gly_co_c[key_aa3["PSER"]] = gly_co_c[key_aa3["SER"]] gly_co_c[key_aa3["PTHR"]] = gly_co_c[key_aa3["THR"]] gly_co_c[key_aa3["PTYR"]] = gly_co_c[key_aa3["TYR"]] gly_co_d[key_aa3["PSER"]] = gly_co_d[key_aa3["SER"]] gly_co_d[key_aa3["PTHR"]] = gly_co_d[key_aa3["THR"]] gly_co_d[key_aa3["PTYR"]] = gly_co_d[key_aa3["TYR"]] gly_n_a[key_aa3["PSER"]] = gly_n_a[key_aa3["SER"]] gly_n_a[key_aa3["PTHR"]] = gly_n_a[key_aa3["THR"]] gly_n_a[key_aa3["PTYR"]] = gly_n_a[key_aa3["TYR"]] gly_n_b[key_aa3["PSER"]] = gly_n_b[key_aa3["SER"]] gly_n_b[key_aa3["PTHR"]] = gly_n_b[key_aa3["THR"]] gly_n_b[key_aa3["PTYR"]] = gly_n_b[key_aa3["TYR"]] gly_n_c[key_aa3["PSER"]] = gly_n_c[key_aa3["SER"]] gly_n_c[key_aa3["PTHR"]] = gly_n_c[key_aa3["THR"]] gly_n_c[key_aa3["PTYR"]] = gly_n_c[key_aa3["TYR"]] gly_n_d[key_aa3["PSER"]] = gly_n_d[key_aa3["SER"]] gly_n_d[key_aa3["PTHR"]] = gly_n_d[key_aa3["THR"]] gly_n_d[key_aa3["PTYR"]] = gly_n_d[key_aa3["TYR"]] # Arrays for calculation of pH corrected shifts asp_ph_0 = [[53.05, 37.81, 175.25, 120.10, 8.68, 4.70], [54.59, 40.95, 176.53, 121.78, 8.57, 4.58]] glu_ph_0 = [[55.90, 28.65, 176.19, 122.04, 8.58, 4.36], [56.81, 30.19, 176.71, 122.86, 8.69, 4.26]] his_ph_0 = [[55.30, 28.89, 174.37, 120.00, 8.78, 4.69], [56.69, 31.27, 175.85, 121.46, 8.44, 4.63]] sep_ph_0 = [[57.613, 66.527, 174.062, 116.940, 8.805, 4.525], [58.433, 65.463, 174.977, 119.488, 9.242, 4.409]] tpo_ph_0 = [[61.827, 73.986, 174.063, 115.936, 8.615, 4.407], [63.524, 71.718, 174.793, 121.101, 9.397, 4.167]] ptr_ph_0 = [[57.905, 38.760, 175.605, 121.797, 8.403, 4.604], [58.002, 38.685, 175.788, 121.953, 8.320, 4.595]] # Arrays for keeping the chemical shifts calculated for the entered pH value asp_ph_corr = [0, 0, 0, 0, 0, 0] glu_ph_corr = [0, 0, 0, 0, 0, 0] his_ph_corr = [0, 0, 0, 0, 0, 0] sep_ph_corr = [0, 0, 0, 0, 0, 0] tpo_ph_corr = [0, 0, 0, 0, 0, 0] ptr_ph_corr = [0, 0, 0, 0, 0, 0] # Arrays for CS corrections for deuterated proteins ca_deut = [-0.68, -0.55, -0.55, -0.69, -0.55, -0.39, -0.55, -0.77, -0.69, - 0.62, -0.69, -0.55, -0.69, -0.69, -0.69, -0.55, -0.55, -0.84, -0.55, -0.55] cb_deut = [-1.00, -0.71, -0.71, -0.97, -0.71, 0.00, -0.71, -1.28, -1.11, - 1.26, -0.97, -0.71, -1.11, -0.97, -1.11, -0.71, -0.71, -1.20, -0.71, -0.71] # RUN cur = 0 delta_T = temperature - 5 # difference between the given temperature and 5 degrees C ca_pred = 0 cb_pred = 0 co_pred = 0 n_pred = 0 hn_pred = 0 ha_pred = 0 output = [] # Calculate deprotonated fractions of Asp, Glu and His at the given pH asp_deprot_frac = 7.78 * (10**-5) / (7.78 * (10**-5) + (10**(-pH))) glu_deprot_frac = 3.43 * (10**-5) / (3.43 * (10**-5) + (10**(-pH))) his_deprot_frac = 1.67 * (10**-7) / (1.67 * (10**-7) + (10**(-pH))) sep_deprot_frac = 9.76 * (10**-7) / (9.76 * (10**-7) + (10**(-pH))) tpo_deprot_frac = 5.00 * (10**-7) / (5.00 * (10**-7) + (10**(-pH))) ptr_deprot_frac = 1.47 * (10**-6) / (1.47 * (10**-6) + (10**(-pH))) # Calculate pH corrected chemical shifts for i in range(6): asp_ph_corr[i] = asp_deprot_frac * asp_ph_0[1][i] + (1 - asp_deprot_frac) * asp_ph_0[0][i] glu_ph_corr[i] = glu_deprot_frac * glu_ph_0[1][i] + (1 - glu_deprot_frac) * glu_ph_0[0][i] his_ph_corr[i] = his_deprot_frac * his_ph_0[1][i] + (1 - his_deprot_frac) * his_ph_0[0][i] sep_ph_corr[i] = sep_deprot_frac * sep_ph_0[1][i] + (1 - sep_deprot_frac) * sep_ph_0[0][i] tpo_ph_corr[i] = tpo_deprot_frac * tpo_ph_0[1][i] + (1 - tpo_deprot_frac) * tpo_ph_0[0][i] ptr_ph_corr[i] = ptr_deprot_frac * ptr_ph_0[1][i] + (1 - ptr_deprot_frac) * ptr_ph_0[0][i] sequences = __set_sequence(protein_sequence, key_aa1, key_aa3) sequence = sequences[0] aminos = sequences[1] for j in range(len(sequence) - 4): output.append({"Res": aminos[j], "Index": j}) # deuterated parameters not available for phosphorylated Residues if ((22 in sequence) or (25 in sequence) or (28 in sequence)) and use_perdeuteration: raise SSException('Phosphorylated amino acids not supported in deuterated proteins') while cur < (len(sequence) - 4): if sequence[cur + 2] == 2: # Aspartate ca_pred = asp_ph_corr[0] cb_pred = asp_ph_corr[1] co_pred = asp_ph_corr[2] n_pred = asp_ph_corr[3] hn_pred = asp_ph_corr[4] ha_pred = asp_ph_corr[5] elif sequence[cur + 2] == 3: # Glutamate ca_pred = glu_ph_corr[0] cb_pred = glu_ph_corr[1] co_pred = glu_ph_corr[2] n_pred = glu_ph_corr[3] hn_pred = glu_ph_corr[4] ha_pred = glu_ph_corr[5] elif sequence[cur + 2] == 6: # Histidine ca_pred = his_ph_corr[0] cb_pred = his_ph_corr[1] co_pred = his_ph_corr[2] n_pred = his_ph_corr[3] hn_pred = his_ph_corr[4] ha_pred = his_ph_corr[5] elif sequence[cur + 2] == key_aa3["SEP"]: # phospho-SER ca_pred = sep_ph_corr[0] cb_pred = sep_ph_corr[1] co_pred = sep_ph_corr[2] n_pred = sep_ph_corr[3] hn_pred = sep_ph_corr[4] ha_pred = sep_ph_corr[5] elif sequence[cur + 2] == key_aa3["TPO"]: # phospho-THR ca_pred = tpo_ph_corr[0] cb_pred = tpo_ph_corr[1] co_pred = tpo_ph_corr[2] n_pred = tpo_ph_corr[3] hn_pred = tpo_ph_corr[4] ha_pred = tpo_ph_corr[5] elif sequence[cur + 2] == key_aa3["PTR"]: # phospho-TYR ca_pred = ptr_ph_corr[0] cb_pred = ptr_ph_corr[1] co_pred = ptr_ph_corr[2] n_pred = ptr_ph_corr[3] hn_pred = ptr_ph_corr[4] ha_pred = ptr_ph_corr[5] else: # any other amino acid ca_pred = ca_av[sequence[cur + 2]] cb_pred = cb_av[sequence[cur + 2]] co_pred = co_av[sequence[cur + 2]] n_pred = n_av[sequence[cur + 2]] hn_pred = hn_av[sequence[cur + 2]] ha_pred = ha_av[sequence[cur + 2]] # Apply the neighbor and temperature corrections if sequence[cur + 2] == 5 and use_ggxgg: # special case of glycine ca_pred += gly_ca_a[sequence[cur + 4]] + gly_ca_b[sequence[cur + 3]] + \ gly_ca_c[sequence[cur + 1]] + gly_ca_d[sequence[cur]] + (delta_T * ca_t[sequence[cur + 2]] / 1000) co_pred += gly_co_a[sequence[cur + 4]] + gly_co_b[sequence[cur + 3]] + \ gly_co_c[sequence[cur + 1]] + gly_co_d[sequence[cur]] + (delta_T * co_t[sequence[cur + 2]] / 1000) n_pred += gly_n_a[sequence[cur + 4]] + gly_n_b[sequence[cur + 3]] + \ gly_n_c[sequence[cur + 1]] + gly_n_d[sequence[cur]] + (delta_T * n_t[sequence[cur + 2]] / 1000) else: # all other Residues ca_pred += ca_a[sequence[cur + 4]] + ca_b[sequence[cur + 3]] + ca_c[sequence[cur + 1]] + \ ca_d[sequence[cur]] + (delta_T * ca_t[sequence[cur + 2]] / 1000) co_pred += co_a[sequence[cur + 4]] + co_b[sequence[cur + 3]] + co_c[sequence[cur + 1]] + \ co_d[sequence[cur]] + (delta_T * co_t[sequence[cur + 2]] / 1000) n_pred += n_a[sequence[cur + 4]] + n_b[sequence[cur + 3]] + n_c[sequence[cur + 1]] + \ n_d[sequence[cur]] + (delta_T * n_t[sequence[cur + 2]] / 1000) cb_pred += cb_a[sequence[cur + 4]] + cb_b[sequence[cur + 3]] + cb_c[sequence[cur + 1]] + \ cb_d[sequence[cur]] + (delta_T * cb_t[sequence[cur + 2]] / 1000) hn_pred += hn_a[sequence[cur + 4]] + hn_b[sequence[cur + 3]] + hn_c[sequence[cur + 1]] + \ hn_d[sequence[cur]] + (delta_T * hn_t[sequence[cur + 2]] / 1000) ha_pred += ha_a[sequence[cur + 4]] + ha_b[sequence[cur + 3]] + ha_c[sequence[cur + 1]] + \ ha_d[sequence[cur]] + (delta_T * ha_t[sequence[cur + 2]] / 1000) if use_perdeuteration: ca_pred += ca_deut[sequence[cur + 2]] cb_pred += cb_deut[sequence[cur + 2]] # write to output if sequence[cur + 2] == 5: # special output for gly output[cur].update({"CA": __round3(ca_pred, asFloat)}) output[cur].update({"CB": "**.***"}) output[cur].update({"CO": __round3(co_pred, asFloat)}) else: output[cur].update({"CA": __round3(ca_pred, asFloat)}) output[cur].update({"CB": __round3(cb_pred, asFloat)}) output[cur].update({"CO": __round3(co_pred, asFloat)}) if sequence[cur + 2] == 12: # special output for pro output[cur].update({"N": "***.***"}) output[cur].update({"HN": "*.***"}) else: output[cur].update({"N": __round3(n_pred, asFloat)}) output[cur].update({"HN": __round3(hn_pred, asFloat)}) if use_perdeuteration: output[cur].update({"HA": "*.***"}) else: output[cur].update({"HA": __round3(ha_pred, asFloat)}) cur += 1 return output
def __set_sequence(sequence, key1, key3): """Translate an amino-acid sequence string into numeric indices. Parses the input sequence (single-letter codes plus optional parenthesised multi-letter phospho-residue codes) into the integer encoding used by the chemical-shift tables. The numeric list is padded with two sentinel residues (code ``23``) at each end so that the nearest-neighbour correction can be applied uniformly at the chain termini. Unrecognised characters are skipped. Parameters ---------- sequence : str The amino-acid abbreviation string supplied by the user. key1 : list Lookup list mapping single-letter codes (by ``ord``-offset) to numeric residue indices; ``-1`` marks an invalid letter. key3 : dict Lookup dict mapping two/three-letter codes (including phospho aliases) to numeric residue indices. Returns ------- tuple of (list of int, list of str) A 2-tuple ``(sequence, aminos)`` where ``sequence`` is the sentinel-padded list of numeric residue codes and ``aminos`` is the list of the parsed residue abbreviations (unpadded). Example ------- >>> seq, aminos = __set_sequence('AG', key_aa1, key_aa3) >>> seq[:2] [23, 23] """ key_aa1 = key1 key_aa3 = key3 i = 0 code = 0 inp = sequence sequence = [] aminos = [] sequence.append(23) sequence.append(23) # Strip white space at beginning and end inp = inp.strip() regex = re.findall(r"\(([^)]+)\)|(.)", inp) for i in range(len(regex)): set = regex[i] if set[0] == '': aa1 = set[1] aminos.append(aa1) code = ord(aa1[0]) - 65 if (code < 0 or code > 35) or (key_aa1[code] == -1): continue sequence.append(key_aa1[code]) else: aa3 = set[0].upper() aminos.append(aa3) if aa3 in key_aa3: sequence.append(key_aa3[aa3]) else: continue sequence.append(23) sequence.append(23) return (sequence, aminos) # ------------------------------------------------------------------------------------------------------------------------------------------------------------------ def __round3(num, asFloat=False): """Round a number to exactly three decimal places. Performs statistics-consistent rounding of a float to precisely three decimal places, zero-padding the fractional part so the result always has three digits after the decimal point. The value can be returned as either a fixed-width string or a float. Parameters ---------- num : float The number to be rounded. asFloat : bool, optional If ``True`` the rounded value is returned as a float; if ``False`` it is returned as a zero-padded string. Default ``False``. Returns ------- str or float The value rounded to three decimal places - a string when ``asFloat`` is ``False``, otherwise a float. Example ------- >>> __round3(1.5, asFloat=True) 1.5 >>> __round3(1.5) '1.500' """ strng = "" + str(round(num * 1000 + 10**(-len(str(num * 1000)) - 1)) / 1000) strng2 = "" + str(round(num + 10**(-len(str(num)) - 1))) delta = len(strng) - len(strng2) if delta == 0: strng += ".000" if delta == 2: strng += "00" if delta == 3: strng += "0" if asFloat: return float(strng) else: return strng