Source code for ssbio.protein.sequence.seqprop

"""
SeqProp
=======
"""

import os.path as op
import pandas as pd
import requests
import logging
from copy import copy, deepcopy
from slugify import Slugify

from Bio import SeqIO
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition

from ssbio.core.object import Object
import ssbio.utils
import ssbio.databases.pdb
import ssbio.protein.sequence.utils
import ssbio.protein.sequence.utils.fasta
import ssbio.protein.sequence.properties.residues

custom_slugify = Slugify(safe_chars='-_.')
log = logging.getLogger(__name__)


[docs]class SeqProp(SeqRecord):

    """Generic class to represent information for a protein sequence.

    Extends the Biopython SeqRecord class. The main functionality added is the ability to set and load directly from
    sequence, metadata, and feature files. Additionally, methods are provided to calculate and store sequence
    properties in the ``annotations`` and ``letter_annotations`` field of a SeqProp. These can then be accessed
    for a range of residue numbers.

    Attributes:
        id (str): Unique identifier for this protein sequence
        seq (Seq): Protein sequence as a Biopython Seq object
        name (str): Optional name for this sequence
        description (str): Optional description for this sequence
        bigg (str, list): BiGG IDs mapped to this sequence
        kegg (str, list): KEGG IDs mapped to this sequence
        refseq (str, list): RefSeq IDs mapped to this sequence
        uniprot (str, list): UniProt IDs mapped to this sequence
        gene_name (str, list): Gene names mapped to this sequence
        pdbs (list): PDB IDs mapped to this sequence
        go (str, list): GO terms mapped to this sequence
        pfam (str, list): PFAMs mapped to this sequence
        ec_number (str, list): EC numbers mapped to this sequence
        sequence_file (str): FASTA file for this sequence
        metadata_file (str): Metadata file (any format) for this sequence
        feature_file (str): GFF file for this sequence
        features (list): List of protein sequence features, which define regions of the protein
        annotations (dict): Annotations of this protein sequence, which summarize global properties
        letter_annotations (RestrictedDict): Residue-level annotations, which describe single residue properties

    Todo:
        - Properly inherit methods from the Object class...

    """

    def __init__(self, seq, id, name='<unknown name>', description='<unknown description>',
                 sequence_path=None, metadata_path=None, feature_path=None):
        """Initialize a SeqProp object with the standard SeqRecord inputs, along with optional paths to files.

        Args:
            seq (str, Seq, SeqRecord): Protein sequence to load
            id (str): Unique identifier for this protein sequence
            name (str): Optional name for this sequence
            description (str): Optional description for this sequence
            sequence_path (str): Absolute or relative path to sequence (FASTA) file
            metadata_path (str): Absolute or relative path to metadata file
            feature_path (str): Absolute or relative path to feature (GFF) file

        """

        # Top level database identifiers
        self.id = id
        self.description = description
        self.notes = {}

        self.bigg = None
        self.kegg = None
        self.refseq = None
        self.uniprot = None
        self.gene_name = None
        self.pdbs = None
        self.go = None
        self.pfam = None
        self.ec_number = None

        # Files, sequences, features
        self._sequence_dir = None
        self.sequence_file = None
        self._metadata_dir = None
        self.metadata_file = None
        self._feature_dir = None
        self.feature_file = None
        self._features = None

        # Stuff from SeqRecord
        self.name = None
        self.description = None
        self.dbxrefs = None
        self.features = None
        self.annotations = None
        self._per_letter_annotations = None

        self._seq = None
        self.seq = seq

        super(SeqProp, self).__init__(seq=self.seq, id=id, name=name, description=description)

        if sequence_path:
            self.sequence_path = sequence_path
        if metadata_path:
            self.metadata_path = metadata_path
        if feature_path:
            self.feature_path = feature_path

    @property
    def seq(self):
        """Seq: Dynamically loaded Seq object from the sequence file"""

        if self.sequence_file:
            file_to_load = copy(self.sequence_path)
            log.debug('{}: reading sequence from sequence file {}'.format(self.id, file_to_load))
            tmp_sr = SeqIO.read(file_to_load, 'fasta')
            return tmp_sr.seq

        else:
            if not self._seq:
                log.debug('{}: no sequence stored in memory'.format(self.id))
            else:
                log.debug('{}: reading sequence from memory'.format(self.id))

            return self._seq

    @seq.setter
    def seq(self, s):
        if not s:
            self._seq = None

        elif self.sequence_file:
            raise ValueError('{}: unable to set sequence, sequence file is associated with this object'.format(self.id))

        elif type(s) == str or type(s) == Seq:
            self._seq = ssbio.protein.sequence.utils.cast_to_seq(obj=s)

        # If a SeqRecord, copy all attributes
        elif type(s) == SeqRecord:
            self._seq = s.seq
            if self.name == '<unknown name>':
                self.name = s.name
            if self.description == '<unknown description>':
                self.description = s.description
            if not self.dbxrefs:
                self.dbxrefs = s.dbxrefs
            if not self.features:
                self.features = s.features
            if not self.annotations:
                self.annotations = s.annotations
            if not self.letter_annotations:
                self.letter_annotations = s.letter_annotations

    @property
    def features(self):
        """list: Get the features stored in memory or in the GFF file"""

        if self.feature_file:
            log.debug('{}: reading features from feature file {}'.format(self.id, self.feature_path))
            with open(self.feature_path) as handle:
                feats = list(GFF.parse(handle))
                if len(feats) > 1:
                    log.warning('Too many sequences in GFF')
                else:
                    return feats[0].features

        else:
            return self._features

    @features.setter
    def features(self, feats):
        if self.feature_file:
            raise ValueError('{}: unable to set features, feature file is associated with this object'.format(self.id))
        elif feats:
            self._features = feats
        else:
            self._features = []

    @property
    def seq_str(self):
        """str: Get the sequence formatted as a string"""

        if not self.seq:
            return None

        return ssbio.protein.sequence.utils.cast_to_str(self.seq)

    @property
    def seq_len(self):
        """int: Get the sequence length"""

        if not self.seq:
            return 0

        return len(self.seq)

    @property
    def num_pdbs(self):
        """int: Report the number of PDB IDs stored in the ``pdbs`` attribute"""

        if not self.pdbs:
            return 0

        else:
            return len(self.pdbs)

    @property
    def sequence_dir(self):
        if not self._sequence_dir:
            raise OSError('No sequence folder set')
        return self._sequence_dir

    @sequence_dir.setter
    def sequence_dir(self, path):
        if path and not op.exists(path):
            raise OSError('{}: folder does not exist'.format(path))

        self._sequence_dir = path

    @property
    def sequence_path(self):
        if not self.sequence_file:
            raise OSError('Sequence file not loaded')

        path = op.join(self.sequence_dir, self.sequence_file)
        if not op.exists(path):
            raise OSError('{}: file does not exist'.format(path))
        return path

    @sequence_path.setter
    def sequence_path(self, fasta_path):
        """Provide pointers to the paths of the FASTA file

        Args:
            fasta_path: Path to FASTA file

        """
        if not fasta_path:
            self.sequence_dir = None
            self.sequence_file = None

        else:
            if not op.exists(fasta_path):
                raise OSError('{}: file does not exist'.format(fasta_path))

            if not op.dirname(fasta_path):
                self.sequence_dir = '.'
            else:
                self.sequence_dir = op.dirname(fasta_path)
            self.sequence_file = op.basename(fasta_path)

            tmp_sr = SeqIO.read(fasta_path, 'fasta')
            if self.name == '<unknown name>':
                self.name = tmp_sr.name
            if self.description == '<unknown description>':
                self.description = tmp_sr.description
            if not self.dbxrefs:
                self.dbxrefs = tmp_sr.dbxrefs
            if not self.features:
                self.features = tmp_sr.features
            if not self.annotations:
                self.annotations = tmp_sr.annotations
            if not self.letter_annotations:
                self.letter_annotations = tmp_sr.letter_annotations

    @property
    def metadata_dir(self):
        if not self._metadata_dir:
            raise OSError('No metadata folder set')
        return self._metadata_dir

    @metadata_dir.setter
    def metadata_dir(self, path):
        if path and not op.exists(path):
            raise OSError('{}: folder does not exist'.format(path))

        self._metadata_dir = path

    @property
    def metadata_path(self):
        if not self.metadata_file:
            raise OSError('Metadata file not loaded')

        path = op.join(self.metadata_dir, self.metadata_file)
        if not op.exists(path):
            raise OSError('{}: file does not exist'.format(path))
        return path

    @metadata_path.setter
    def metadata_path(self, m_path):
        """Provide pointers to the paths of the metadata file

        Args:
            m_path: Path to metadata file

        """
        if not m_path:
            self.metadata_dir = None
            self.metadata_file = None

        else:
            if not op.exists(m_path):
                raise OSError('{}: file does not exist!'.format(m_path))

            if not op.dirname(m_path):
                self.metadata_dir = '.'
            else:
                self.metadata_dir = op.dirname(m_path)
            self.metadata_file = op.basename(m_path)

    @property
    def feature_dir(self):
        if not self._feature_dir:
            raise OSError('No feature folder set')
        return self._feature_dir

    @feature_dir.setter
    def feature_dir(self, path):
        if path and not op.exists(path):
            raise OSError('{}: folder does not exist'.format(path))

        self._feature_dir = path

    @property
    def feature_path(self):
        path = op.join(self.feature_dir, self.feature_file)
        if not op.exists(path):
            raise OSError('{}: file does not exist'.format(path))
        return path

    @feature_path.setter
    def feature_path(self, gff_path):
        """Load a GFF file with information on a single sequence and store features in the ``features`` attribute

        Args:
            gff_path: Path to GFF file.

        """
        if not gff_path:
            self.feature_dir = None
            self.feature_file = None

        else:
            if not op.exists(gff_path):
                raise OSError('{}: file does not exist!'.format(gff_path))

            if not op.dirname(gff_path):
                self.feature_dir = '.'
            else:
                self.feature_dir = op.dirname(gff_path)
            self.feature_file = op.basename(gff_path)

[docs]    def feature_path_unset(self):
        """Copy features to memory and remove the association of the feature file."""
        if not self.feature_file:
            raise IOError('No feature file to unset')

        with open(self.feature_path) as handle:
            feats = list(GFF.parse(handle))
            if len(feats) > 1:
                log.warning('Too many sequences in GFF')
            else:
                tmp = feats[0].features

        self.feature_dir = None
        self.feature_file = None
        self.features = tmp

    # def __str__(self):
    #     return Object.__str__(self)
    #
    # def __repr__(self):
    #     return Object.__repr__(self)

    def update(self, newdata, overwrite=False, only_keys=None):
        # Filter for list of keys in only_keys
        if only_keys:
            only_keys = ssbio.utils.force_list(only_keys)
            newdata = {k: v for k, v in newdata.items() if k in only_keys}

        # Update attributes
        for key, value in newdata.items():
            # Overwrite flag overwrites all attributes
            if overwrite:
                setattr(self, key, value)
            else:
                # Otherwise check if attribute is None and set it if so
                if hasattr(self, key):
                    if not getattr(self, key):
                        setattr(self, key, value)
                    else:
                        continue
                # Or just add a new attribute
                else:
                    setattr(self, key, value)

[docs]    def get_dict(self, only_attributes=None, exclude_attributes=None, df_format=False):
        """Get a dictionary of this object's attributes. Optional format for storage in a Pandas DataFrame.

        Args:
            only_attributes (str, list): Attributes that should be returned. If not provided, all are returned.
            exclude_attributes (str, list): Attributes that should be excluded.
            df_format (bool): If dictionary values should be formatted for a dataframe
                (everything possible is transformed into strings, int, or float -
                if something can't be transformed it is excluded)

        Returns:
            dict: Dictionary of attributes

        """

        # Choose attributes to return, return everything in the object if a list is not specified
        if not only_attributes:
            keys = list(self.__dict__.keys())
        else:
            keys = ssbio.utils.force_list(only_attributes)

        # Remove keys you don't want returned
        if exclude_attributes:
            exclude_attributes = ssbio.utils.force_list(exclude_attributes)
            for x in exclude_attributes:
                if x in keys:
                    keys.remove(x)

        # Copy attributes into a new dictionary
        df_dict = {}
        for k, orig_v in self.__dict__.items():
            if k in keys:
                v = deepcopy(orig_v)
                if df_format:
                    if v and not isinstance(v, str) and not isinstance(v, int) and not isinstance(v,
                                                                                                  float) and not isinstance(
                            v, bool):
                        try:
                            df_dict[k] = ssbio.utils.force_string(deepcopy(v))
                        except TypeError:
                            log.warning('{}: excluding attribute from dict, cannot transform into string'.format(k))
                    elif not v and not isinstance(v, int) and not isinstance(v, float):
                        df_dict[k] = None
                    else:
                        df_dict[k] = deepcopy(v)
                else:
                    df_dict[k] = deepcopy(v)
        return df_dict

    def save_pickle(self, outfile, protocol=2):
        import ssbio.io
        ssbio.io.save_pickle(self, outfile, protocol)

    def __json_encode__(self):
        to_return = {}
        # Don't save properties, methods in the JSON
        for x in [a for a in dir(self) if
                  not a.startswith('__') and not a.startswith('_{}__'.format(type(self).__name__)) and not isinstance(
                          getattr(type(self), a, None), property) and not callable(getattr(self, a))]:
            to_return.update({x: getattr(self, x)})
        return to_return

    def save_json(self, outfile, compression=False):
        import ssbio.io
        ssbio.io.save_json(self, outfile, compression=compression)

[docs]    def equal_to(self, seq_prop):
        """Test if the sequence is equal to another SeqProp's sequence

        Args:
            seq_prop: SeqProp object

        Returns:
            bool: If the sequences are the same

        """
        if not self.seq or not seq_prop or not seq_prop.seq:
            return False

        return self.seq == seq_prop.seq

[docs]    def write_fasta_file(self, outfile, force_rerun=False):
        """Write a FASTA file for the protein sequence, ``seq`` will now load directly from this file.

        Args:
            outfile (str): Path to new FASTA file to be written to
            force_rerun (bool): If an existing file should be overwritten

        """
        if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
            SeqIO.write(self, outfile, "fasta")

        # The Seq as it will now be dynamically loaded from the file
        self.sequence_path = outfile

[docs]    def write_gff_file(self, outfile, force_rerun=False):
        """Write a GFF file for the protein features, ``features`` will now load directly from this file.

        Args:
            outfile (str): Path to new FASTA file to be written to
            force_rerun (bool): If an existing file should be overwritten

        """
        if ssbio.utils.force_rerun(outfile=outfile, flag=force_rerun):
            with open(outfile, "w") as out_handle:
                GFF.write([self], out_handle)

        self.feature_path = outfile

[docs]    def add_point_feature(self, resnum, feat_type=None, feat_id=None):
        """Add a feature to the features list describing a single residue.

        Args:
            resnum (int): Protein sequence residue number
            feat_type (str, optional): Optional description of the feature type (ie. 'catalytic residue')
            feat_id (str, optional): Optional ID of the feature type (ie. 'TM1')

        """
        if self.feature_file:
            raise ValueError('Feature file associated with sequence, please remove file association to append '
                             'additional features.')

        if not feat_type:
            feat_type = 'Manually added protein sequence single residue feature'
        newfeat = SeqFeature(location=FeatureLocation(ExactPosition(resnum-1), ExactPosition(resnum)),
                             type=feat_type,
                             id=feat_id)

        self.features.append(newfeat)

[docs]    def add_region_feature(self, start_resnum, end_resnum, feat_type=None, feat_id=None):
        """Add a feature to the features list describing a region of the protein sequence.

        Args:
            start_resnum (int): Start residue number of the protein sequence feature
            end_resnum (int): End residue number of the protein sequence feature
            feat_type (str, optional): Optional description of the feature type (ie. 'binding domain')
            feat_id (str, optional): Optional ID of the feature type (ie. 'TM1')

        """
        if self.feature_file:
            raise ValueError('Feature file associated with sequence, please remove file association to append '
                             'additional features.')

        if not feat_type:
            feat_type = 'Manually added protein sequence region feature'
        newfeat = SeqFeature(location=FeatureLocation(start_resnum-1, end_resnum),
                             type=feat_type,
                             id=feat_id)

        self.features.append(newfeat)

[docs]    def get_biopython_pepstats(self):
        """Run Biopython's built in ProteinAnalysis module and store statistics in the ``annotations`` attribute."""

        if self.seq:
            try:
                pepstats = ssbio.protein.sequence.properties.residues.biopython_protein_analysis(self.seq)
            except KeyError as e:
                log.error('{}: unable to run ProteinAnalysis module, unknown amino acid {}'.format(self.id, e))
                return
            except ValueError as e:
                log.error('{}: unable to run ProteinAnalysis module, {}'.format(self.id, e))
                return
            self.annotations.update(pepstats)
        else:
            raise ValueError('{}: no sequence available, unable to run ProteinAnalysis'.format(self.id))

[docs]    def get_emboss_pepstats(self):
        """Run the EMBOSS pepstats program on the protein sequence.

        Stores statistics in the ``annotations`` attribute.
        Saves a ``.pepstats`` file of the results where the sequence file is located.
        """
        if not self.sequence_file:
            raise IOError('FASTA file needs to be written for EMBOSS pepstats to be run')
        outfile = ssbio.protein.sequence.properties.residues.emboss_pepstats_on_fasta(infile=self.sequence_path)
        pepstats = ssbio.protein.sequence.properties.residues.emboss_pepstats_parser(outfile)
        self.annotations.update(pepstats)

[docs]    def blast_pdb(self, seq_ident_cutoff=0, evalue=0.0001, display_link=False,
                  outdir=None, force_rerun=False):
        """BLAST this sequence to the PDB"""
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')

        if not self.seq_str:
            log.error('{}: no sequence loaded'.format(self.id))
            return None

        try:
            blast_results = ssbio.databases.pdb.blast_pdb(self.seq_str,
                                                          outfile='{}_blast_pdb.xml'.format(custom_slugify(self.id)),
                                                          outdir=outdir,
                                                          force_rerun=force_rerun,
                                                          evalue=evalue,
                                                          seq_ident_cutoff=seq_ident_cutoff,
                                                          link=display_link)
        except requests.ConnectionError as e:
            log.error('{}: BLAST request timed out'.format(self.id))
            print(e)
            return None

        return blast_results

[docs]    def get_residue_annotations(self, start_resnum, end_resnum=None):
        """Retrieve letter annotations for a residue or a range of residues

        Args:
            start_resnum (int): Residue number
            end_resnum (int): Optional residue number, specify if a range is desired

        Returns:
            dict: Letter annotations for this residue or residues

        """
        if not end_resnum:
            end_resnum = start_resnum

        # Create a new SeqFeature
        f = SeqFeature(FeatureLocation(start_resnum - 1, end_resnum))

        # Get sequence properties
        return f.extract(self).letter_annotations

[docs]    def get_aggregation_propensity(self, email, password, cutoff_v=5, cutoff_n=5, run_amylmuts=False, outdir=None):
        """Run the AMYLPRED2 web server to calculate the aggregation propensity of this protein sequence, which is
        the number of aggregation-prone segments on the unfolded protein sequence.

        Stores statistics in the ``annotations`` attribute, under the key `aggprop-amylpred`.

        See :mod:`ssbio.protein.sequence.properties.aggregation_propensity` for instructions and details.

        """
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')

        import ssbio.protein.sequence.properties.aggregation_propensity as agg

        agg_predictions = agg.AMYLPRED(email=email, password=password)
        result = agg_predictions.get_aggregation_propensity(seq=self, outdir=outdir,
                                                            cutoff_v=cutoff_v, cutoff_n=cutoff_n,
                                                            run_amylmuts=run_amylmuts)
        self.annotations['aggprop-amylpred'] = result

[docs]    def get_kinetic_folding_rate(self, secstruct, at_temp=None):
        """Run the FOLD-RATE web server to calculate the kinetic folding rate given an amino acid sequence and its
        structural classficiation (alpha/beta/mixed)

        Stores statistics in the ``annotations`` attribute, under the key `kinetic_folding_rate_<TEMP>-foldrate`.

        See :func:`ssbio.protein.sequence.properties.kinetic_folding_rate.get_foldrate` for instructions and details.

        """

        import ssbio.protein.sequence.properties.kinetic_folding_rate as kfr

        rate = kfr.get_foldrate(seq=self, secstruct=secstruct)
        self.annotations['kinetic_folding_rate_37.0_C-foldrate'] = rate

        if at_temp:
            new_rate = kfr.get_foldrate_at_temp(ref_rate=rate, new_temp=at_temp)
            self.annotations['kinetic_folding_rate_{}_C-foldrate'.format(at_temp)] = new_rate

[docs]    def get_thermostability(self, at_temp):
        """Run the thermostability calculator using either the Dill or Oobatake methods.

        Stores calculated (dG, Keq) tuple in the ``annotations`` attribute, under the key
        `thermostability_<TEMP>-<METHOD_USED>`.

        See :func:`ssbio.protein.sequence.properties.thermostability.get_dG_at_T` for instructions and details.

        """

        import ssbio.protein.sequence.properties.thermostability as ts

        dG = ts.get_dG_at_T(seq=self, temp=at_temp)
        self.annotations['thermostability_{}_C-{}'.format(at_temp, dG[2].lower())] = (dG[0], dG[1])