"""
SeqProp
=======
"""
import os.path as op
import pandas as pd
import requests
import logging
from copy import copy, deepcopy
from slugify import Slugify
from Bio import SeqIO
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
from ssbio.core.object import Object
import ssbio.utils
import ssbio.databases.pdb
import ssbio.protein.sequence.utils
import ssbio.protein.sequence.utils.fasta
import ssbio.protein.sequence.properties.residues
custom_slugify = Slugify(safe_chars='-_.')
log = logging.getLogger(__name__)
[docs]class SeqProp(SeqRecord):
"""Generic class to represent information for a protein sequence.
Extends the Biopython SeqRecord class. The main functionality added is the ability to set and load directly from
sequence, metadata, and feature files. Additionally, methods are provided to calculate and store sequence
properties in the ``annotations`` and ``letter_annotations`` field of a SeqProp. These can then be accessed
for a range of residue numbers.
Attributes:
id (str): Unique identifier for this protein sequence
seq (Seq): Protein sequence as a Biopython Seq object
name (str): Optional name for this sequence
description (str): Optional description for this sequence
bigg (str, list): BiGG IDs mapped to this sequence
kegg (str, list): KEGG IDs mapped to this sequence
refseq (str, list): RefSeq IDs mapped to this sequence
uniprot (str, list): UniProt IDs mapped to this sequence
gene_name (str, list): Gene names mapped to this sequence
pdbs (list): PDB IDs mapped to this sequence
go (str, list): GO terms mapped to this sequence
pfam (str, list): PFAMs mapped to this sequence
ec_number (str, list): EC numbers mapped to this sequence
sequence_file (str): FASTA file for this sequence
metadata_file (str): Metadata file (any format) for this sequence
feature_file (str): GFF file for this sequence
features (list): List of protein sequence features, which define regions of the protein
annotations (dict): Annotations of this protein sequence, which summarize global properties
letter_annotations (RestrictedDict): Residue-level annotations, which describe single residue properties
Todo:
- Properly inherit methods from the Object class...
"""
def __init__(self, seq, id, name='<unknown name>', description='<unknown description>',
sequence_path=None, metadata_path=None, feature_path=None):
"""Initialize a SeqProp object with the standard SeqRecord inputs, along with optional paths to files.
Args:
seq (str, Seq, SeqRecord): Protein sequence to load
id (str): Unique identifier for this protein sequence
name (str): Optional name for this sequence
description (str): Optional description for this sequence
sequence_path (str): Absolute or relative path to sequence (FASTA) file
metadata_path (str): Absolute or relative path to metadata file
feature_path (str): Absolute or relative path to feature (GFF) file
"""
# Top level database identifiers
self.id = id
self.description = description
self.notes = {}
self.bigg = None
self.kegg = None
self.refseq = None
self.uniprot = None
self.gene_name = None
self.pdbs = None
self.go = None
self.pfam = None
self.ec_number = None
# Files, sequences, features
self._sequence_dir = None
self.sequence_file = None
self._metadata_dir = None
self.metadata_file = None
self._feature_dir = None
self.feature_file = None
self._features = None
# Stuff from SeqRecord
self.name = None
self.description = None
self.dbxrefs = None
self.features = None
self.annotations = None
self._per_letter_annotations = None
self._seq = None
self.seq = seq
super(SeqProp, self).__init__(seq=self.seq, id=id, name=name, description=description)
if sequence_path:
self.sequence_path = sequence_path
if metadata_path:
self.metadata_path = metadata_path
if feature_path:
self.feature_path = feature_path
@property
def seq(self):
"""Seq: Dynamically loaded Seq object from the sequence file"""
if self.sequence_file:
file_to_load = copy(self.sequence_path)
log.debug('{}: reading sequence from sequence file {}'.format(self.id, file_to_load))
tmp_sr = SeqIO.read(file_to_load, 'fasta')
return tmp_sr.seq
else:
if not self._seq:
log.debug('{}: no sequence stored in memory'.format(self.id))
else:
log.debug('{}: reading sequence from memory'.format(self.id))
return self._seq
@seq.setter
def seq(self, s):
if not s:
self._seq = None
elif self.sequence_file:
raise ValueError('{}: unable to set sequence, sequence file is associated with this object'.format(self.id))
elif type(s) == str or type(s) == Seq:
self._seq = ssbio.protein.sequence.utils.cast_to_seq(obj=s)
# If a SeqRecord, copy all attributes
elif type(s) == SeqRecord:
self._seq = s.seq
if self.name == '<unknown name>':
self.name = s.name
if self.description == '<unknown description>':
self.description = s.description
if not self.dbxrefs:
self.dbxrefs = s.dbxrefs
if not self.features:
self.features = s.features
if not self.annotations:
self.annotations = s.annotations
if not self.letter_annotations:
self.letter_annotations = s.letter_annotations
@property
def features(self):
"""list: Get the features stored in memory or in the GFF file"""
if self.feature_file:
log.debug('{}: reading features from feature file {}'.format(self.id, self.feature_path))
with open(self.feature_path) as handle:
feats = list(GFF.parse(handle))
if len(feats) > 1:
log.warning('Too many sequences in GFF')
else:
return feats[0].features
else:
return self._features
@features.setter
def features(self, feats):
if self.feature_file:
raise ValueError('{}: unable to set features, feature file is associated with this object'.format(self.id))
elif feats:
self._features = feats
else:
self._features = []
@property
def seq_str(self):
"""str: Get the sequence formatted as a string"""
if not self.seq:
return None
return ssbio.protein.sequence.utils.cast_to_str(self.seq)
@property
def seq_len(self):
"""int: Get the sequence length"""
if not self.seq:
return 0
return len(self.seq)
@property
def num_pdbs(self):
"""int: Report the number of PDB IDs stored in the ``pdbs`` attribute"""
if not self.pdbs:
return 0
else:
return len(self.pdbs)
@property
def sequence_dir(self):
if not self._sequence_dir:
raise OSError('No sequence folder set')
return self._sequence_dir
@sequence_dir.setter
def sequence_dir(self, path):
if path and not op.exists(path):
raise OSError('{}: folder does not exist'.format(path))
self._sequence_dir = path
@property
def sequence_path(self):
if not self.sequence_file:
raise OSError('Sequence file not loaded')
path = op.join(self.sequence_dir, self.sequence_file)
if not op.exists(path):
raise OSError('{}: file does not exist'.format(path))
return path
@sequence_path.setter
def sequence_path(self, fasta_path):
"""Provide pointers to the paths of the FASTA file
Args:
fasta_path: Path to FASTA file
"""
if not fasta_path:
self.sequence_dir = None
self.sequence_file = None
else:
if not op.exists(fasta_path):
raise OSError('{}: file does not exist'.format(fasta_path))
if not op.dirname(fasta_path):
self.sequence_dir = '.'
else:
self.sequence_dir = op.dirname(fasta_path)
self.sequence_file = op.basename(fasta_path)
tmp_sr = SeqIO.read(fasta_path, 'fasta')
if self.name == '<unknown name>':
self.name = tmp_sr.name
if self.description == '<unknown description>':
self.description = tmp_sr.description
if not self.dbxrefs:
self.dbxrefs = tmp_sr.dbxrefs
if not self.features:
self.features = tmp_sr.features
if not self.annotations:
self.annotations = tmp_sr.annotations
if not self.letter_annotations:
self.letter_annotations = tmp_sr.letter_annotations
@property
def metadata_dir(self):
if not self._metadata_dir:
raise OSError('No metadata folder set')
return self._metadata_dir
@metadata_dir.setter
def metadata_dir(self, path):
if path and not op.exists(path):
raise OSError('{}: folder does not exist'.format(path))
self._metadata_dir = path
@property
def metadata_path(self):
if not self.metadata_file:
raise OSError('Metadata file not loaded')
path = op.join(self.metadata_dir, self.metadata_file)
if not op.exists(path):
raise OSError('{}: file does not exist'.format(path))
return path
@metadata_path.setter
def metadata_path(self, m_path):
"""Provide pointers to the paths of the metadata file
Args:
m_path: Path to metadata file
"""
if not m_path:
self.metadata_dir = None
self.metadata_file = None
else:
if not op.exists(m_path):
raise OSError('{}: file does not exist!'.format(m_path))
if not op.dirname(m_path):
self.metadata_dir = '.'
else:
self.metadata_dir = op.dirname(m_path)
self.metadata_file = op.basename(m_path)
@property
def feature_dir(self):
if not self._feature_dir:
raise OSError('No feature folder set')
return self._feature_dir
@feature_dir.setter
def feature_dir(self, path):
if path and not op.exists(path):
raise OSError('{}: folder does not exist'.format(path))
self._feature_dir = path
@property
def feature_path(self):
path = op.join(self.feature_dir, self.feature_file)
if not op.exists(path):
raise OSError('{}: file does not exist'.format(path))
return path
@feature_path.setter
def feature_path(self, gff_path):
"""Load a GFF file with information on a single sequence and store features in the ``features`` attribute
Args:
gff_path: Path to GFF file.
"""
if not gff_path:
self.feature_dir = None
self.feature_file = None
else:
if not op.exists(gff_path):
raise OSError('{}: file does not exist!'.format(gff_path))
if not op.dirname(gff_path):
self.feature_dir = '.'
else:
self.feature_dir = op.dirname(gff_path)
self.feature_file = op.basename(gff_path)
[docs] def feature_path_unset(self):
"""Copy features to memory and remove the association of the feature file."""
if not self.feature_file:
raise IOError('No feature file to unset')
with open(self.feature_path) as handle:
feats = list(GFF.parse(handle))
if len(feats) > 1:
log.warning('Too many sequences in GFF')
else:
tmp = feats[0].features
self.feature_dir = None
self.feature_file = None
self.features = tmp
# def __str__(self):
# return Object.__str__(self)
#
# def __repr__(self):
# return Object.__repr__(self)
def update(self, newdata, overwrite=False, only_keys=None):
# Filter for list of keys in only_keys
if only_keys:
only_keys = ssbio.utils.force_list(only_keys)
newdata = {k: v for k, v in newdata.items() if k in only_keys}
# Update attributes
for key, value in newdata.items():
# Overwrite flag overwrites all attributes
if overwrite:
setattr(self, key, value)
else:
# Otherwise check if attribute is None and set it if so
if hasattr(self, key):
if not getattr(self, key):
setattr(self, key, value)
else:
continue
# Or just add a new attribute
else:
setattr(self, key, value)
[docs] def get_dict(self, only_attributes=None, exclude_attributes=None, df_format=False):
"""Get a dictionary of this object's attributes. Optional format for storage in a Pandas DataFrame.
Args:
only_attributes (str, list): Attributes that should be returned. If not provided, all are returned.
exclude_attributes (str, list): Attributes that should be excluded.
df_format (bool): If dictionary values should be formatted for a dataframe
(everything possible is transformed into strings, int, or float -
if something can't be transformed it is excluded)
Returns:
dict: Dictionary of attributes
"""
# Choose attributes to return, return everything in the object if a list is not specified
if not only_attributes:
keys = list(self.__dict__.keys())
else:
keys = ssbio.utils.force_list(only_attributes)
# Remove keys you don't want returned
if exclude_attributes:
exclude_attributes = ssbio.utils.force_list(exclude_attributes)
for x in exclude_attributes:
if x in keys:
keys.remove(x)
# Copy attributes into a new dictionary
df_dict = {}
for k, orig_v in self.__dict__.items():
if k in keys:
v = deepcopy(orig_v)
if df_format:
if v and not isinstance(v, str) and not isinstance(v, int) and not isinstance(v,
float) and not isinstance(
v, bool):
try:
df_dict[k] = ssbio.utils.force_string(deepcopy(v))
except TypeError:
log.warning('{}: excluding attribute from dict, cannot transform into string'.format(k))
elif not v and not isinstance(v, int) and not isinstance(v, float):
df_dict[k] = None
else:
df_dict[k] = deepcopy(v)
else:
df_dict[k] = deepcopy(v)
return df_dict
def save_pickle(self, outfile, protocol=2):
import ssbio.io
ssbio.io.save_pickle(self, outfile, protocol)
def __json_encode__(self):
to_return = {}
# Don't save properties, methods in the JSON
for x in [a for a in dir(self) if
not a.startswith('__') and not a.startswith('_{}__'.format(type(self).__name__)) and not isinstance(
getattr(type(self), a, None), property) and not callable(getattr(self, a))]:
to_return.update({x: getattr(self, x)})
return to_return
def save_json(self, outfile, compression=False):
import ssbio.io
ssbio.io.save_json(self, outfile, compression=compression)
[docs] def equal_to(self, seq_prop):
"""Test if the sequence is equal to another SeqProp's sequence
Args:
seq_prop: SeqProp object
Returns:
bool: If the sequences are the same
"""
if not self.seq or not seq_prop or not seq_prop.seq:
return False
return self.seq == seq_prop.seq
[docs] def write_fasta_file(self, outfile, force_rerun=False):
"""Write a FASTA file for the protein sequence, ``seq`` will now load directly from this file.
Args:
outfile (str): Path to new FASTA file to be written to
force_rerun (bool): If an existing file should be overwritten
"""
if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
SeqIO.write(self, outfile, "fasta")
# The Seq as it will now be dynamically loaded from the file
self.sequence_path = outfile
[docs] def write_gff_file(self, outfile, force_rerun=False):
"""Write a GFF file for the protein features, ``features`` will now load directly from this file.
Args:
outfile (str): Path to new FASTA file to be written to
force_rerun (bool): If an existing file should be overwritten
"""
if ssbio.utils.force_rerun(outfile=outfile, flag=force_rerun):
with open(outfile, "w") as out_handle:
GFF.write([self], out_handle)
self.feature_path = outfile
[docs] def add_point_feature(self, resnum, feat_type=None, feat_id=None):
"""Add a feature to the features list describing a single residue.
Args:
resnum (int): Protein sequence residue number
feat_type (str, optional): Optional description of the feature type (ie. 'catalytic residue')
feat_id (str, optional): Optional ID of the feature type (ie. 'TM1')
"""
if self.feature_file:
raise ValueError('Feature file associated with sequence, please remove file association to append '
'additional features.')
if not feat_type:
feat_type = 'Manually added protein sequence single residue feature'
newfeat = SeqFeature(location=FeatureLocation(ExactPosition(resnum-1), ExactPosition(resnum)),
type=feat_type,
id=feat_id)
self.features.append(newfeat)
[docs] def add_region_feature(self, start_resnum, end_resnum, feat_type=None, feat_id=None):
"""Add a feature to the features list describing a region of the protein sequence.
Args:
start_resnum (int): Start residue number of the protein sequence feature
end_resnum (int): End residue number of the protein sequence feature
feat_type (str, optional): Optional description of the feature type (ie. 'binding domain')
feat_id (str, optional): Optional ID of the feature type (ie. 'TM1')
"""
if self.feature_file:
raise ValueError('Feature file associated with sequence, please remove file association to append '
'additional features.')
if not feat_type:
feat_type = 'Manually added protein sequence region feature'
newfeat = SeqFeature(location=FeatureLocation(start_resnum-1, end_resnum),
type=feat_type,
id=feat_id)
self.features.append(newfeat)
[docs] def get_biopython_pepstats(self):
"""Run Biopython's built in ProteinAnalysis module and store statistics in the ``annotations`` attribute."""
if self.seq:
try:
pepstats = ssbio.protein.sequence.properties.residues.biopython_protein_analysis(self.seq)
except KeyError as e:
log.error('{}: unable to run ProteinAnalysis module, unknown amino acid {}'.format(self.id, e))
return
except ValueError as e:
log.error('{}: unable to run ProteinAnalysis module, {}'.format(self.id, e))
return
self.annotations.update(pepstats)
else:
raise ValueError('{}: no sequence available, unable to run ProteinAnalysis'.format(self.id))
[docs] def get_emboss_pepstats(self):
"""Run the EMBOSS pepstats program on the protein sequence.
Stores statistics in the ``annotations`` attribute.
Saves a ``.pepstats`` file of the results where the sequence file is located.
"""
if not self.sequence_file:
raise IOError('FASTA file needs to be written for EMBOSS pepstats to be run')
outfile = ssbio.protein.sequence.properties.residues.emboss_pepstats_on_fasta(infile=self.sequence_path)
pepstats = ssbio.protein.sequence.properties.residues.emboss_pepstats_parser(outfile)
self.annotations.update(pepstats)
[docs] def blast_pdb(self, seq_ident_cutoff=0, evalue=0.0001, display_link=False,
outdir=None, force_rerun=False):
"""BLAST this sequence to the PDB"""
if not outdir:
outdir = self.sequence_dir
if not outdir:
raise ValueError('Output directory must be specified')
if not self.seq_str:
log.error('{}: no sequence loaded'.format(self.id))
return None
try:
blast_results = ssbio.databases.pdb.blast_pdb(self.seq_str,
outfile='{}_blast_pdb.xml'.format(custom_slugify(self.id)),
outdir=outdir,
force_rerun=force_rerun,
evalue=evalue,
seq_ident_cutoff=seq_ident_cutoff,
link=display_link)
except requests.ConnectionError as e:
log.error('{}: BLAST request timed out'.format(self.id))
print(e)
return None
return blast_results
[docs] def get_residue_annotations(self, start_resnum, end_resnum=None):
"""Retrieve letter annotations for a residue or a range of residues
Args:
start_resnum (int): Residue number
end_resnum (int): Optional residue number, specify if a range is desired
Returns:
dict: Letter annotations for this residue or residues
"""
if not end_resnum:
end_resnum = start_resnum
# Create a new SeqFeature
f = SeqFeature(FeatureLocation(start_resnum - 1, end_resnum))
# Get sequence properties
return f.extract(self).letter_annotations
[docs] def get_aggregation_propensity(self, email, password, cutoff_v=5, cutoff_n=5, run_amylmuts=False, outdir=None):
"""Run the AMYLPRED2 web server to calculate the aggregation propensity of this protein sequence, which is
the number of aggregation-prone segments on the unfolded protein sequence.
Stores statistics in the ``annotations`` attribute, under the key `aggprop-amylpred`.
See :mod:`ssbio.protein.sequence.properties.aggregation_propensity` for instructions and details.
"""
if not outdir:
outdir = self.sequence_dir
if not outdir:
raise ValueError('Output directory must be specified')
import ssbio.protein.sequence.properties.aggregation_propensity as agg
agg_predictions = agg.AMYLPRED(email=email, password=password)
result = agg_predictions.get_aggregation_propensity(seq=self, outdir=outdir,
cutoff_v=cutoff_v, cutoff_n=cutoff_n,
run_amylmuts=run_amylmuts)
self.annotations['aggprop-amylpred'] = result
[docs] def get_kinetic_folding_rate(self, secstruct, at_temp=None):
"""Run the FOLD-RATE web server to calculate the kinetic folding rate given an amino acid sequence and its
structural classficiation (alpha/beta/mixed)
Stores statistics in the ``annotations`` attribute, under the key `kinetic_folding_rate_<TEMP>-foldrate`.
See :func:`ssbio.protein.sequence.properties.kinetic_folding_rate.get_foldrate` for instructions and details.
"""
import ssbio.protein.sequence.properties.kinetic_folding_rate as kfr
rate = kfr.get_foldrate(seq=self, secstruct=secstruct)
self.annotations['kinetic_folding_rate_37.0_C-foldrate'] = rate
if at_temp:
new_rate = kfr.get_foldrate_at_temp(ref_rate=rate, new_temp=at_temp)
self.annotations['kinetic_folding_rate_{}_C-foldrate'.format(at_temp)] = new_rate
[docs] def get_thermostability(self, at_temp):
"""Run the thermostability calculator using either the Dill or Oobatake methods.
Stores calculated (dG, Keq) tuple in the ``annotations`` attribute, under the key
`thermostability_<TEMP>-<METHOD_USED>`.
See :func:`ssbio.protein.sequence.properties.thermostability.get_dG_at_T` for instructions and details.
"""
import ssbio.protein.sequence.properties.thermostability as ts
dG = ts.get_dG_at_T(seq=self, temp=at_temp)
self.annotations['thermostability_{}_C-{}'.format(at_temp, dG[2].lower())] = (dG[0], dG[1])