# Source code for ssbio.protein.sequence.properties.aggregation_propensity

from __future__ import print_function

__author__ = 'Ke Chen'
__email__ = "kec003@ucsd.edu"

import os
import os.path as op
import glob
import time
import requests
import ssbio.protein.sequence.utils

# TODO: replace urllib usage with six library
try:
    from urllib.request import urlopen
    from urllib.request import build_opener
    from urllib.request import HTTPCookieProcessor
    from urllib.parse import urlparse
    from urllib.parse import urlencode
except ImportError:
    from urlparse import urlparse
    from urllib2 import urlopen
    from urllib import urlencode
    from urllib2 import build_opener
    from urllib2 import HTTPCookieProcessor
try:
    from http.cookiejar import CookieJar
except ImportError:
    from cookielib import CookieJar


class AMYLPRED:
    """Class to submit sequences to AMYLPRED2_.

    Instructions:

    #. Create an account on the webserver at the `AMYLPRED2 registration link`_.
    #. Create a new AMYLPRED object initialized with your email and password.
    #. Run ``get_aggregation_propensity`` on a protein sequence.

    Attributes:
        email (str): Account email
        password (str): Account password

    Todo:
        - Properly implement force_rerun and caching functions

    .. _AMYLPRED2: http://aias.biol.uoa.gr/AMYLPRED2/
    .. _AMYLPRED2 registration link: http://aias.biol.uoa.gr/AMYLPRED2/register.php

    """

    def __init__(self, email, password):
        """Initialize the AMYLPRED object with the email and password used to log in to the web server.

        Args:
            email (str): Account email
            password (str): Account password

        """
        self.email = email
        self.password = password
    def get_aggregation_propensity(self, seq, outdir, cutoff_v=5, cutoff_n=5, run_amylmuts=False):
        """Run the AMYLPRED2 web server on a protein sequence and get the consensus result for aggregation propensity.

        Args:
            seq (str, Seq, SeqRecord): Amino acid sequence
            outdir (str): Directory to which output files should be saved
            cutoff_v (int): Minimum number of methods that must agree on a residue being an aggregation-prone residue
            cutoff_n (int): Minimum number of consecutive residues to be considered a 'stretch' of
                aggregation-prone region
            run_amylmuts (bool): If the AMYLMUTS method should be run, default False. AMYLMUTS is optional
                as it is the most time-consuming method and generates a slightly different result on every
                submission.

        Returns:
            int: Aggregation propensity - the number of aggregation-prone segments on an unfolded protein sequence

        """
        seq = ssbio.protein.sequence.utils.cast_to_str(seq)
        results = self.run_amylpred2(seq=seq, outdir=outdir, run_amylmuts=run_amylmuts)
        # agg_conf (the mean consensus count over the counted stretches) is computed but not returned
        agg_index, agg_conf = self.parse_for_consensus_aggregation(N=len(seq), results=results,
                                                                   cutoff_v=cutoff_v, cutoff_n=cutoff_n)
        return agg_index
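    # A minimal usage sketch (credentials, sequence, and paths are hypothetical,
    # not part of the original module; assumes the AMYLPRED2 server is reachable):
    #
    #     ap = AMYLPRED(email='you@example.com', password='hunter2')
    #     agg_index = ap.get_aggregation_propensity(
    #         seq='MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ',  # any amino acid sequence
    #         outdir='/tmp/agg')  # results cached in /tmp/agg/AMYLPRED2_results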
    def run_amylpred2(self, seq, outdir, run_amylmuts=False):
        """Run all methods on the AMYLPRED2 web server for an amino acid sequence and gather results.

        Result files are cached in ``/path/to/outdir/AMYLPRED2_results``.

        Args:
            seq (str): Amino acid sequence as a string
            outdir (str): Directory to which output files should be saved
            run_amylmuts (bool): If the AMYLMUTS method should be run, default False

        Returns:
            dict: Result for each method run

        """
        outdir_amylpred = op.join(outdir, 'AMYLPRED2_results')
        if not op.exists(outdir_amylpred):
            os.mkdir(outdir_amylpred)

        # Log in to the web server, keeping the session cookie for later requests
        url = "http://aias.biol.uoa.gr/AMYLPRED2/login.php"
        cj = CookieJar()
        opener = build_opener(HTTPCookieProcessor(cj))
        formdata = {"email": self.email, "password": self.password}
        data_encoded = urlencode(formdata).encode('ASCII')
        response = opener.open(url, data_encoded)

        # AMYLMUTS is the most time-consuming method and generates a slightly different result every submission
        methods = ['AGGRESCAN', 'NETCSSP', 'PAFIG', 'APD', 'AMYLPATTERN',
                   'SECSTR', 'BSC', 'WALTZ', 'CONFENERGY', 'TANGO']
        if run_amylmuts:
            methods.append('AMYLMUTS')

        output = {}
        time_counts = 0
        for met in methods:
            # First check if there is an existing results file
            existing_results = glob.glob(op.join(outdir_amylpred, '*_{}.txt'.format(met)))
            if existing_results:
                results_file = existing_results[0]
            else:
                # Submit the sequence and parse the job ID out of the response page
                values = {'seq_data': seq, 'method': met}
                data = urlencode(values).encode('ASCII')
                url_input = "http://aias.biol.uoa.gr/cgi-bin/AMYLPRED2/amylpred2.pl"
                response = opener.open(url_input, data)
                result = str(response.read())
                ind = result.find('Job ID')
                result2 = result[ind:ind + 50]
                ind1 = result2.find(':')
                ind2 = result2.find('<BR>')
                job_id = result2[ind1 + 2:ind2]

                # Poll until the result file appears on the server
                url_result = 'http://aias.biol.uoa.gr/AMYLPRED2/tmp/' + job_id + '.txt'
                print(url_result)
                print("Waiting for %s results" % met, end='.')
                while True:
                    result = urlopen(url_result).read()
                    if not result:
                        time.sleep(1)
                        time_counts += 1
                        print('.', end='')
                    else:
                        response = requests.get(url_result)
                        break

                results_file = op.join(outdir_amylpred, "{}_{}.txt".format(job_id, met))
                with open(results_file, "wb") as handle:
                    for data in response.iter_content():
                        handle.write(data)
                print("")

            method, hits = self.parse_method_results(results_file, met)
            output[met] = hits

        if time_counts != 0:
            print("Time spent: %d seconds" % time_counts)

        return output
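    # For illustration, the returned dict maps each method name to a list of
    # 1-based residue numbers flagged as aggregation-prone (values hypothetical):
    #
    #     {'AGGRESCAN': [12, 13, 14, 15, 16],
    #      'TANGO': [12, 13, 14],
    #      ...}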
    def parse_method_results(self, results_file, met):
        """Parse the output of an AMYLPRED2 result file."""
        with open(results_file) as f:
            result = str(f.read())
        ind_s = result.find('HITS')
        ind_e = result.find('**NOTE')
        tmp = result[ind_s + 10:ind_e].strip(" ")

        hits_resid = []
        method = None
        if ":" in tmp:
            method = tmp.split(":")[0]
            hits = tmp.split(":")[1]
            if "-" in hits:
                # Expand residue ranges like "12-16" into individual residue numbers
                for ele in hits.split(","):
                    ele = ele.replace('\\r\\n\\r\\n', '')
                    res_s = ele.split("-")[0]
                    res_e = ele.split("-")[1]
                    for i in range(int(res_s), int(res_e) + 1):
                        hits_resid.append(i)
        if method:
            return method, hits_resid
        else:
            return met, hits_resid

    def aggregation_index(self, output, cutoff_v, cutoff_n):
        # Indices of all residues with an aggregation potential >= cutoff_v
        idx = [i for i in range(0, len(output)) if output[i] >= cutoff_v]

        # Holds the number of continuous residues with an aggregation potential >= cutoff_v,
        # one entry for each continuous stretch
        agg_index = []
        idx_for_summation = []

        if idx:
            # Start index for each stretch
            start_idx = [idx[0]]
            i = 0
            counter = 0
            while i < len(idx) - 1:
                if idx[i] + 1 == idx[i + 1]:
                    counter += 1
                else:
                    agg_index.append(counter + 1)
                    start_idx.append(idx[i + 1])
                    counter = 0
                i += 1
                if i == len(idx) - 1:
                    agg_index.append(counter + 1)
            # Keep only stretches of at least cutoff_n consecutive residues
            idx2 = [i for i in range(0, len(agg_index)) if agg_index[i] >= cutoff_n]
            for i in idx2:
                idx_for_summation.extend(range(start_idx[i], agg_index[i] + start_idx[i]))
            total = 0
            for i in idx_for_summation:
                total += output[i]
        else:
            idx2 = []
            total = 0

        if len(idx_for_summation) == 0:
            return len(idx2), 0
        else:
            return len(idx2), total / float(len(idx_for_summation))

    def parse_for_consensus_aggregation(self, N, results, cutoff_v=5, cutoff_n=5):
        if len(results) == 0:
            raise ValueError('No results were found!')

        # Count, for each residue, how many methods flagged it as aggregation-prone
        hits_res_count = {}
        for met in results.keys():
            for res in results[met]:
                if res in hits_res_count.keys():
                    hits_res_count[res] += 1
                else:
                    hits_res_count[res] = 1

        output2 = []
        for i in range(1, int(N) + 1):
            if i in hits_res_count.keys():
                output2.append(hits_res_count[i])
            else:
                output2.append(0.0)

        agg_index, agg_conf = self.aggregation_index(output2, cutoff_v, cutoff_n)
        # cutoff_v and cutoff_n are parameters that you can explore: "5,5" means that
        # 5 consecutive residues agreed upon by at least 5 methods are considered to
        # contribute 1 to the aggregation propensity
        return agg_index, agg_conf
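    # A worked example of the consensus logic above, with cutoff_v=5 and
    # cutoff_n=5 (numbers are illustrative only):
    #
    #     per-residue method counts:  [0, 6, 6, 6, 6, 6, 0, 5, 5, 0]
    #     residues with count >= 5:   positions 2-6 and 8-9 (1-based)
    #     stretch lengths:            [5, 2] -> only the 5-residue stretch
    #                                 passes cutoff_n
    #     aggregation_index(...) ->   (1, 6.0)  # one stretch; mean count 6.0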
# TODO: clean up execution script to run on FASTA file(s)
# if __name__ == '__main__':
#     import os.path as op
#     from ssbio import utils
#     date = utils.Date()
#     # load inputs from command line
#
#     p = argparse.ArgumentParser(description='Run AMYLPRED2 on a FASTA file or a folder of FASTA files.')
#     p.add_argument('email', help='http://aias.biol.uoa.gr/AMYLPRED2/login.php Email')
#     p.add_argument('password', help='Password')
#     p.add_argument('infile', help='FASTA file or directory of FASTA files.')
#     args = p.parse_args()
#
#     curr_dir = os.getcwd()
#
#     # initialize the class with your email and password for the site
#     agg_predictions = agg.AMYLPRED(args.email, args.password)
#
#     prop_dir = 'properties'
#     if not op.exists(prop_dir):
#         os.mkdir(prop_dir)
#
#     agg_prop_dir = op.join(prop_dir, 'aggregation_propensity')
#     if not op.exists(agg_prop_dir):
#         os.mkdir(agg_prop_dir)
#
#     # TODO: improve arg parsing for files/dirs
#     # TODO: this was all done in a rush - current dir and infile should be improved
#     if len(args.infile) == 1 and op.isdir(args.infile[0]):
#         os.chdir(args.infile[0])
#         files = glob.glob('*')
#     else:
#         files = args.infile
#
#     results = []
#
#     for file in tqdm(files):
#         if op.isdir(file):
#             continue
#
#         # load the sequence file, also the ID
#         seq_records = fasta.load_fasta_file(file)
#         seq_id = op.splitext(op.basename(file))[0]
#
#         seq_folder = op.join(agg_prop_dir, seq_id)
#         if not op.exists(seq_folder):
#             os.mkdir(seq_folder)
#
#         os.chdir(seq_folder)
#         # TODO: seems useless to return seqrecords to just convert them to strings
#         for seq_record in seq_records:
#             agg_index = agg_predictions.get_aggregation_propensity(str(seq_record.seq))
#             result = {'id': seq_id, 'agg_index': agg_index}
#             results.append(result)
#         os.chdir(curr_dir)
#
#     agg_df = pd.DataFrame(results)
#     agg_df.to_csv(op.join(prop_dir, '{}_aggprop_results.csv'.format(date.short_date)))
#     print('Saved results in properties/aggregation_propensity and summarized in aggprop_results.csv')
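# A minimal runnable sketch of the execution script described in the TODO above,
# for a single FASTA file. Biopython (an ssbio dependency) is assumed for FASTA
# parsing; the argument names and per-record output layout are illustrative,
# not part of the original module.
if __name__ == '__main__':
    import argparse
    from Bio import SeqIO

    p = argparse.ArgumentParser(description='Run AMYLPRED2 on a FASTA file.')
    p.add_argument('email', help='AMYLPRED2 account email')
    p.add_argument('password', help='AMYLPRED2 account password')
    p.add_argument('infile', help='FASTA file of amino acid sequences')
    p.add_argument('--outdir', default='.', help='Directory for cached results')
    args = p.parse_args()

    ap = AMYLPRED(args.email, args.password)
    for record in SeqIO.parse(args.infile, 'fasta'):
        # Cache results per record: run_amylpred2 reuses any existing
        # '*_<METHOD>.txt' file found in the output directory
        record_dir = op.join(args.outdir, record.id)
        if not op.exists(record_dir):
            os.mkdir(record_dir)
        agg_index = ap.get_aggregation_propensity(str(record.seq), outdir=record_dir)
        print('{}\t{}'.format(record.id, agg_index))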