Source code for ssbio.protein.structure.properties.residues

"""
Structure Residues
==================
"""

import logging
from collections import defaultdict
from copy import deepcopy

from Bio.Alphabet import IUPAC
from Bio.PDB import Polypeptide
from Bio.PDB.HSExposure import ExposureCN, HSExposureCA, HSExposureCB
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import ssbio.protein.sequence.utils
from ssbio.protein.structure.utils.structureio import StructureIO

log = logging.getLogger(__name__)


[docs]def search_ss_bonds(model, threshold=3.0):
    """ Searches S-S bonds based on distances
        between atoms in the structure (first model only).
        Average distance is 2.05A. Threshold is 3A default.
        Returns iterator with tuples of residues.

        ADAPTED FROM JOAO RODRIGUES' BIOPYTHON GSOC PROJECT (http://biopython.org/wiki/GSOC2010_Joao)
    """

    # Taken from http://docs.python.org/library/itertools.html
    # Python 2.4 does not include itertools.combinations

    def combinations(iterable, r):
        # combinations('ABCD', 2) --> AB AC AD BC BD CD
        # combinations(range(4), 3) --> 012 013 023 123
        pool = tuple(iterable)
        n = len(pool)
        if r > n:
            return
        indices = list(range(r))
        yield tuple(pool[i] for i in indices)
        while True:
            for i in reversed(range(r)):
                if indices[i] != i + n - r:
                    break
            else:
                return
            indices[i] += 1
            for j in range(i + 1, r):
                indices[j] = indices[j - 1] + 1
            yield tuple(pool[i] for i in indices)

    cysteines = [r for r in model.get_residues() if r.get_resname() == 'CYS']

    pairs = combinations(cysteines, 2)  # Iterator with pairs

    bridges = []
    for cys_pair in pairs:
        try:
            if cys_pair[0]['SG'] - cys_pair[1]['SG'] < threshold:
                bridges.append(cys_pair)
        except KeyError:  # This will occur when a CYS residue is missing a SG atom for some reason
            log.error('{}: no SG atom found for one or both of the cysteine residues {}'.format(model, cys_pair))
            continue

    infodict = {}
    if bridges:
        infodict = defaultdict(list)

        for disulfide_bridge in bridges:
            residue1 = disulfide_bridge[0]
            residue2 = disulfide_bridge[1]
            chain = residue1.get_parent().id
            infodict[chain].append((residue1.get_full_id()[3], residue2.get_full_id()[3]))

    return infodict


[docs]def resname_in_proximity(resname, model, chains, resnums, threshold=5):
    """Search within the proximity of a defined list of residue numbers and their chains for any specifed residue name.

    Args:
        resname (str): Residue name to search for in proximity of specified chains + resnums
        model: Biopython Model object
        chains (str, list): Chain ID or IDs to check
        resnums (int, list): Residue numbers within the chain to check
        threshold (float): Cutoff in Angstroms for returning True if a RESNAME is near

    Returns:
        bool: True if a RESNAME is within the threshold cutoff

    """
    residues = [r for r in model.get_residues() if r.get_resname() == resname]

    chains = ssbio.utils.force_list(chains)
    resnums = ssbio.utils.force_list(resnums)

    for chain in chains:
        for resnum in resnums:
            my_residue_last_atom = model[chain][resnum].child_list[-1]
            for rz in residues:
                distance = rz.child_list[-1] - my_residue_last_atom
                if distance < threshold:
                    # print(resnum, rz, distance)
                    return True

    return False


[docs]def get_structure_seqrecords(model):
    """Get a dictionary of a PDB file's sequences.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords

    """

    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:
        tracker = 0
        chain_seq = ''
        chain_resnums = []

        # Loop over the residues
        for res in chain.get_residues():
            # NOTE: you can get the residue number too
            res_id = res.id
            res_num = res_id[1]
            res_icode = res_id[2]

            # Double check if the residue name is a standard residue
            # If it is not a standard residue (ie. selenomethionine),
            # it will be filled in with an X on the next iteration)
            if Polypeptide.is_aa(res, standard=True):
                end_tracker = res_num
                res_aa_one = Polypeptide.three_to_one(res.get_resname())

                # Tracker to fill in X's
                if end_tracker != (tracker + 1):
                    if res_icode != ' ':
                        chain_seq += res_aa_one
                        chain_resnums.append(res_num)
                        tracker = end_tracker + 1
                        continue
                    else:
                        multiplier = (end_tracker - tracker - 1)
                        chain_seq += 'X' * multiplier
                        # Residue numbers for unresolved or nonstandard residues are Infinite
                        chain_resnums.extend([float("Inf")] * multiplier)

                chain_seq += res_aa_one
                chain_resnums.append(res_num)
                tracker = end_tracker

            else:
                continue

        chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein), id=chain.get_id())
        chain_seq_record.letter_annotations['structure_resnums'] = chain_resnums
        structure_seq_records.append(chain_seq_record)

    return structure_seq_records


[docs]def get_structure_seqs(pdb_file, file_type):
    """Get a dictionary of a PDB file's sequences.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Args:
        pdb_file: Path to PDB file

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """

    # TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
    # it seems like biopython capitalizes it to chain L

    # Get the first model
    my_structure = StructureIO(pdb_file)
    model = my_structure.first_model

    structure_seqs = {}

    # Loop over each chain of the PDB
    for chain in model:
        chain_seq = ''
        tracker = 0

        # Loop over the residues
        for res in chain.get_residues():
            # NOTE: you can get the residue number too
            # res_num = res.id[1]

            # Double check if the residue name is a standard residue
            # If it is not a standard residue (ie. selenomethionine),
            # it will be filled in with an X on the next iteration)
            if Polypeptide.is_aa(res, standard=True):
                full_id = res.get_full_id()
                end_tracker = full_id[3][1]
                i_code = full_id[3][2]
                aa = Polypeptide.three_to_one(res.get_resname())

                # Tracker to fill in X's
                if end_tracker != (tracker + 1):
                    if i_code != ' ':
                        chain_seq += aa
                        tracker = end_tracker + 1
                        continue
                    else:
                        chain_seq += 'X' * (end_tracker - tracker - 1)

                chain_seq += aa
                tracker = end_tracker

            else:
                continue

        structure_seqs[chain.get_id()] = chain_seq

    return structure_seqs


[docs]def match_structure_sequence(orig_seq, new_seq, match='X', fill_with='X', ignore_excess=False):
    """Correct a sequence to match inserted X's in a structure sequence

    This is useful for mapping a sequence obtained from structural tools like MSMS or DSSP
        to the sequence obtained by the get_structure_seqs method.

    Examples:
        >>> structure_seq = 'XXXABCDEF'
        >>> prop_list = [4, 5, 6, 7, 8, 9]
        >>> match_structure_sequence(structure_seq, prop_list)
        ['X', 'X', 'X', 4, 5, 6, 7, 8, 9]

        >>> match_structure_sequence(structure_seq, prop_list, fill_with=float('Inf'))
        [inf, inf, inf, 4, 5, 6, 7, 8, 9]

        >>> structure_seq = '---ABCDEF---'
        >>> prop_list = ('H','H','H','C','C','C')
        >>> match_structure_sequence(structure_seq, prop_list, match='-', fill_with='-')
        ('-', '-', '-', 'H', 'H', 'H', 'C', 'C', 'C', '-', '-', '-')

        >>> structure_seq = 'ABCDEF---'
        >>> prop_list = 'HHHCCC'
        >>> match_structure_sequence(structure_seq, prop_list, match='-', fill_with='-')
        'HHHCCC---'

        >>> structure_seq = 'AXBXCXDXEXF'
        >>> prop_list = ['H', 'H', 'H', 'C', 'C', 'C']
        >>> match_structure_sequence(structure_seq, prop_list, match='X', fill_with='X')
        ['H', 'X', 'H', 'X', 'H', 'X', 'C', 'X', 'C', 'X', 'C']

    Args:
        orig_seq (str, Seq, SeqRecord): Sequence to match to
        new_seq (str, tuple, list): Sequence to fill in
        match (str): What to match
        fill_with: What to fill in when matches are found
        ignore_excess (bool): If excess sequence on the tail end of new_seq should be ignored

    Returns:
        str, tuple, list: new_seq which will match the length of orig_seq

    """
    if len(orig_seq) == len(new_seq):
        log.debug('Lengths already equal, nothing to fill in')
        return new_seq

    if not ignore_excess:
        if len(orig_seq) < len(new_seq):
            raise ValueError('Original sequence has a length less than the sequence provided to match to')
    else:
        log.debug('New sequence will be truncated to length of original sequence - information may be lost!')

    if not isinstance(new_seq, str) and not isinstance(new_seq, tuple) and not isinstance(new_seq, list):
        raise ValueError('Invalid sequence provided, must be string, tuple, or list')

    orig_seq = ssbio.protein.sequence.utils.cast_to_str(orig_seq)
    new_thing = deepcopy(new_seq)
    if isinstance(new_seq, tuple):
        new_thing = list(new_thing)

    for i, s in enumerate(orig_seq):
        if s == match:
            if isinstance(new_thing, str):
                new_thing = new_thing[:i] + fill_with + new_thing[i:]
            if isinstance(new_thing, list):
                new_thing.insert(i, fill_with)

    new_thing = new_thing[:len(orig_seq)]

    if isinstance(new_seq, tuple):
        new_thing = tuple(new_thing)

    return new_thing


[docs]def site_centroid(residues, model):
    """Get the XYZ coordinate of the center of a list of residues.

    Args:
        residues: List of residue numbers
        pdb_file: Path to PDB file

    Returns:
        tuple: (X, Y, Z) coordinate of centroid

    """
    pass


[docs]def distance_to_site(residue_of_interest, residues, model):
    """Calculate the distance between an amino acid and a group of amino acids.

    Args:
        residue_of_interest: Residue number you are interested in (ie. a mutation)
        residues: List of residue numbers

    Returns:
        float: Distance (in Angstroms) to the group of residues

    """
    centroid = site_centroid(residues, residue_of_interest)
    pass


# TODO: half sphere exposure
[docs]def hse_output(pdb_file, file_type):
    """
    The solvent exposure of an amino acid residue is important for analyzing,
    understanding and predicting aspects of protein structure and function [73].
    A residue's solvent exposure can be classified as four categories: exposed, partly exposed,
    buried and deeply buried residues. Hamelryck  et al. [73] established a new 2D measure that provides a
    different view of solvent exposure, i.e. half-sphere exposure (HSE). By conceptually dividing the sphere
    of a residue into two halves- HSE-up and HSE-down, HSE provides a more detailed description of an amino
    acid residue's spatial neighborhood. HSE is calculated by the hsexpo module implemented in the BioPython
    package [74] from a PDB file.

    http://onlinelibrary.wiley.com/doi/10.1002/prot.20379/abstract

    Args:
        pdb_file:

    Returns:

    """
    # Get the first model
    my_structure = StructureIO(pdb_file)
    model = my_structure.first_model

    # Calculate HSEalpha
    exp_ca = HSExposureCA(model)
    # Calculate HSEbeta
    exp_cb = HSExposureCB(model)
    # Calculate classical coordination number
    exp_fs = ExposureCN(model)

    return
# def magni(a, b, c):
#     """Calculate the magnitude of distance vector
#     """
#     return pow((pow(a, 2) + pow(b, 2) + pow(c, 2)), 1.0 / 2.0)


# @cachetools.func.ttl_cache(maxsize=256)
# def calculate_res_distance(res_1, res_2, pdb_file):
#     """Calculate distance of one residue number to another in a PDB file
#
#     Args:
#         res_1: Residue number 1
#         res_2: Residue number 2
#         pdb_file: Path to PDB file
#
#     Returns:
#
#     """
#
#     my_structure = StructureIO(pdb_file)
#     model = my_structure.first_model
#
#     res_list = PDB.Selection.unfold_entities(model, 'R')
#
#     ires_list = []
#     res_chk_1 = ''
#     res_chk_2 = ''
#     for j in res_list:
#         if j.id[1] in [res_1, res_2] and j.resname != 'HOH':
#             ires_list.append(j)
#             if res_chk_1 == '' and res_chk_2 == '':
#                 res_chk_1 = j.id[1]
#             else:
#                 res_chk_2 = j.id[1]
#
#     paired = ssbio.utils.combinations(ires_list, 2)
#     try:
#         for k in paired:
#             chainA = PDB.Selection.unfold_entities(k[0], 'C')[0]
#             chainB = PDB.Selection.unfold_entities(k[1], 'C')[0]
#             vec = list(
#                 np.array([x.get_coord() for x in k[0]]).mean(axis=0) - np.array([x.get_coord() for x in k[1]]).mean(
#                     axis=0))
#             distance = magni(vec[0], vec[1], vec[2])
#
#         return distance
#     except UnboundLocalError:
#         log.error("Unknown interaction")
#         return None