"""
Structure Residues
==================
"""
import logging
from collections import defaultdict
from copy import deepcopy
from Bio.Alphabet import IUPAC
from Bio.PDB import Polypeptide
from Bio.PDB.HSExposure import ExposureCN, HSExposureCA, HSExposureCB
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import ssbio.protein.sequence.utils
from ssbio.protein.structure.utils.structureio import StructureIO
log = logging.getLogger(__name__)
[docs]def search_ss_bonds(model, threshold=3.0):
""" Searches S-S bonds based on distances
between atoms in the structure (first model only).
Average distance is 2.05A. Threshold is 3A default.
Returns iterator with tuples of residues.
ADAPTED FROM JOAO RODRIGUES' BIOPYTHON GSOC PROJECT (http://biopython.org/wiki/GSOC2010_Joao)
"""
# Taken from http://docs.python.org/library/itertools.html
# Python 2.4 does not include itertools.combinations
def combinations(iterable, r):
# combinations('ABCD', 2) --> AB AC AD BC BD CD
# combinations(range(4), 3) --> 012 013 023 123
pool = tuple(iterable)
n = len(pool)
if r > n:
return
indices = list(range(r))
yield tuple(pool[i] for i in indices)
while True:
for i in reversed(range(r)):
if indices[i] != i + n - r:
break
else:
return
indices[i] += 1
for j in range(i + 1, r):
indices[j] = indices[j - 1] + 1
yield tuple(pool[i] for i in indices)
cysteines = [r for r in model.get_residues() if r.get_resname() == 'CYS']
pairs = combinations(cysteines, 2) # Iterator with pairs
bridges = []
for cys_pair in pairs:
try:
if cys_pair[0]['SG'] - cys_pair[1]['SG'] < threshold:
bridges.append(cys_pair)
except KeyError: # This will occur when a CYS residue is missing a SG atom for some reason
log.error('{}: no SG atom found for one or both of the cysteine residues {}'.format(model, cys_pair))
continue
infodict = {}
if bridges:
infodict = defaultdict(list)
for disulfide_bridge in bridges:
residue1 = disulfide_bridge[0]
residue2 = disulfide_bridge[1]
chain = residue1.get_parent().id
infodict[chain].append((residue1.get_full_id()[3], residue2.get_full_id()[3]))
return infodict
[docs]def resname_in_proximity(resname, model, chains, resnums, threshold=5):
"""Search within the proximity of a defined list of residue numbers and their chains for any specifed residue name.
Args:
resname (str): Residue name to search for in proximity of specified chains + resnums
model: Biopython Model object
chains (str, list): Chain ID or IDs to check
resnums (int, list): Residue numbers within the chain to check
threshold (float): Cutoff in Angstroms for returning True if a RESNAME is near
Returns:
bool: True if a RESNAME is within the threshold cutoff
"""
residues = [r for r in model.get_residues() if r.get_resname() == resname]
chains = ssbio.utils.force_list(chains)
resnums = ssbio.utils.force_list(resnums)
for chain in chains:
for resnum in resnums:
my_residue_last_atom = model[chain][resnum].child_list[-1]
for rz in residues:
distance = rz.child_list[-1] - my_residue_last_atom
if distance < threshold:
# print(resnum, rz, distance)
return True
return False
[docs]def get_structure_seqrecords(model):
"""Get a dictionary of a PDB file's sequences.
Special cases include:
- Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
- HETATMs. Currently written as an "X", or unknown amino acid.
Args:
model: Biopython Model object of a Structure
Returns:
list: List of SeqRecords
"""
structure_seq_records = []
# Loop over each chain of the PDB
for chain in model:
tracker = 0
chain_seq = ''
chain_resnums = []
# Loop over the residues
for res in chain.get_residues():
# NOTE: you can get the residue number too
res_id = res.id
res_num = res_id[1]
res_icode = res_id[2]
# Double check if the residue name is a standard residue
# If it is not a standard residue (ie. selenomethionine),
# it will be filled in with an X on the next iteration)
if Polypeptide.is_aa(res, standard=True):
end_tracker = res_num
res_aa_one = Polypeptide.three_to_one(res.get_resname())
# Tracker to fill in X's
if end_tracker != (tracker + 1):
if res_icode != ' ':
chain_seq += res_aa_one
chain_resnums.append(res_num)
tracker = end_tracker + 1
continue
else:
multiplier = (end_tracker - tracker - 1)
chain_seq += 'X' * multiplier
# Residue numbers for unresolved or nonstandard residues are Infinite
chain_resnums.extend([float("Inf")] * multiplier)
chain_seq += res_aa_one
chain_resnums.append(res_num)
tracker = end_tracker
else:
continue
chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein), id=chain.get_id())
chain_seq_record.letter_annotations['structure_resnums'] = chain_resnums
structure_seq_records.append(chain_seq_record)
return structure_seq_records
[docs]def get_structure_seqs(pdb_file, file_type):
"""Get a dictionary of a PDB file's sequences.
Special cases include:
- Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
- HETATMs. Currently written as an "X", or unknown amino acid.
Args:
pdb_file: Path to PDB file
Returns:
dict: Dictionary of:
{chain_id: sequence}
"""
# TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
# it seems like biopython capitalizes it to chain L
# Get the first model
my_structure = StructureIO(pdb_file)
model = my_structure.first_model
structure_seqs = {}
# Loop over each chain of the PDB
for chain in model:
chain_seq = ''
tracker = 0
# Loop over the residues
for res in chain.get_residues():
# NOTE: you can get the residue number too
# res_num = res.id[1]
# Double check if the residue name is a standard residue
# If it is not a standard residue (ie. selenomethionine),
# it will be filled in with an X on the next iteration)
if Polypeptide.is_aa(res, standard=True):
full_id = res.get_full_id()
end_tracker = full_id[3][1]
i_code = full_id[3][2]
aa = Polypeptide.three_to_one(res.get_resname())
# Tracker to fill in X's
if end_tracker != (tracker + 1):
if i_code != ' ':
chain_seq += aa
tracker = end_tracker + 1
continue
else:
chain_seq += 'X' * (end_tracker - tracker - 1)
chain_seq += aa
tracker = end_tracker
else:
continue
structure_seqs[chain.get_id()] = chain_seq
return structure_seqs
[docs]def match_structure_sequence(orig_seq, new_seq, match='X', fill_with='X', ignore_excess=False):
"""Correct a sequence to match inserted X's in a structure sequence
This is useful for mapping a sequence obtained from structural tools like MSMS or DSSP
to the sequence obtained by the get_structure_seqs method.
Examples:
>>> structure_seq = 'XXXABCDEF'
>>> prop_list = [4, 5, 6, 7, 8, 9]
>>> match_structure_sequence(structure_seq, prop_list)
['X', 'X', 'X', 4, 5, 6, 7, 8, 9]
>>> match_structure_sequence(structure_seq, prop_list, fill_with=float('Inf'))
[inf, inf, inf, 4, 5, 6, 7, 8, 9]
>>> structure_seq = '---ABCDEF---'
>>> prop_list = ('H','H','H','C','C','C')
>>> match_structure_sequence(structure_seq, prop_list, match='-', fill_with='-')
('-', '-', '-', 'H', 'H', 'H', 'C', 'C', 'C', '-', '-', '-')
>>> structure_seq = 'ABCDEF---'
>>> prop_list = 'HHHCCC'
>>> match_structure_sequence(structure_seq, prop_list, match='-', fill_with='-')
'HHHCCC---'
>>> structure_seq = 'AXBXCXDXEXF'
>>> prop_list = ['H', 'H', 'H', 'C', 'C', 'C']
>>> match_structure_sequence(structure_seq, prop_list, match='X', fill_with='X')
['H', 'X', 'H', 'X', 'H', 'X', 'C', 'X', 'C', 'X', 'C']
Args:
orig_seq (str, Seq, SeqRecord): Sequence to match to
new_seq (str, tuple, list): Sequence to fill in
match (str): What to match
fill_with: What to fill in when matches are found
ignore_excess (bool): If excess sequence on the tail end of new_seq should be ignored
Returns:
str, tuple, list: new_seq which will match the length of orig_seq
"""
if len(orig_seq) == len(new_seq):
log.debug('Lengths already equal, nothing to fill in')
return new_seq
if not ignore_excess:
if len(orig_seq) < len(new_seq):
raise ValueError('Original sequence has a length less than the sequence provided to match to')
else:
log.debug('New sequence will be truncated to length of original sequence - information may be lost!')
if not isinstance(new_seq, str) and not isinstance(new_seq, tuple) and not isinstance(new_seq, list):
raise ValueError('Invalid sequence provided, must be string, tuple, or list')
orig_seq = ssbio.protein.sequence.utils.cast_to_str(orig_seq)
new_thing = deepcopy(new_seq)
if isinstance(new_seq, tuple):
new_thing = list(new_thing)
for i, s in enumerate(orig_seq):
if s == match:
if isinstance(new_thing, str):
new_thing = new_thing[:i] + fill_with + new_thing[i:]
if isinstance(new_thing, list):
new_thing.insert(i, fill_with)
new_thing = new_thing[:len(orig_seq)]
if isinstance(new_seq, tuple):
new_thing = tuple(new_thing)
return new_thing
[docs]def site_centroid(residues, model):
"""Get the XYZ coordinate of the center of a list of residues.
Args:
residues: List of residue numbers
pdb_file: Path to PDB file
Returns:
tuple: (X, Y, Z) coordinate of centroid
"""
pass
[docs]def distance_to_site(residue_of_interest, residues, model):
"""Calculate the distance between an amino acid and a group of amino acids.
Args:
residue_of_interest: Residue number you are interested in (ie. a mutation)
residues: List of residue numbers
Returns:
float: Distance (in Angstroms) to the group of residues
"""
centroid = site_centroid(residues, residue_of_interest)
pass
# TODO: half sphere exposure
[docs]def hse_output(pdb_file, file_type):
"""
The solvent exposure of an amino acid residue is important for analyzing,
understanding and predicting aspects of protein structure and function [73].
A residue's solvent exposure can be classified as four categories: exposed, partly exposed,
buried and deeply buried residues. Hamelryck et al. [73] established a new 2D measure that provides a
different view of solvent exposure, i.e. half-sphere exposure (HSE). By conceptually dividing the sphere
of a residue into two halves- HSE-up and HSE-down, HSE provides a more detailed description of an amino
acid residue's spatial neighborhood. HSE is calculated by the hsexpo module implemented in the BioPython
package [74] from a PDB file.
http://onlinelibrary.wiley.com/doi/10.1002/prot.20379/abstract
Args:
pdb_file:
Returns:
"""
# Get the first model
my_structure = StructureIO(pdb_file)
model = my_structure.first_model
# Calculate HSEalpha
exp_ca = HSExposureCA(model)
# Calculate HSEbeta
exp_cb = HSExposureCB(model)
# Calculate classical coordination number
exp_fs = ExposureCN(model)
return
# def magni(a, b, c):
# """Calculate the magnitude of distance vector
# """
# return pow((pow(a, 2) + pow(b, 2) + pow(c, 2)), 1.0 / 2.0)
# @cachetools.func.ttl_cache(maxsize=256)
# def calculate_res_distance(res_1, res_2, pdb_file):
# """Calculate distance of one residue number to another in a PDB file
#
# Args:
# res_1: Residue number 1
# res_2: Residue number 2
# pdb_file: Path to PDB file
#
# Returns:
#
# """
#
# my_structure = StructureIO(pdb_file)
# model = my_structure.first_model
#
# res_list = PDB.Selection.unfold_entities(model, 'R')
#
# ires_list = []
# res_chk_1 = ''
# res_chk_2 = ''
# for j in res_list:
# if j.id[1] in [res_1, res_2] and j.resname != 'HOH':
# ires_list.append(j)
# if res_chk_1 == '' and res_chk_2 == '':
# res_chk_1 = j.id[1]
# else:
# res_chk_2 = j.id[1]
#
# paired = ssbio.utils.combinations(ires_list, 2)
# try:
# for k in paired:
# chainA = PDB.Selection.unfold_entities(k[0], 'C')[0]
# chainB = PDB.Selection.unfold_entities(k[1], 'C')[0]
# vec = list(
# np.array([x.get_coord() for x in k[0]]).mean(axis=0) - np.array([x.get_coord() for x in k[1]]).mean(
# axis=0))
# distance = magni(vec[0], vec[1], vec[2])
#
# return distance
# except UnboundLocalError:
# log.error("Unknown interaction")
# return None