Source code for ssbio.protein.structure.homology.itasser.itasserprop

import logging
import os
import os.path as op
import shutil
import time

import pandas as pd
from ssbio.protein.structure.utils.cleanpdb import CleanPDB
from ssbio.protein.structure.utils.structureio import StructureIO

import ssbio.utils
from ssbio.protein.structure.structprop import StructProp

log = logging.getLogger(__name__)


class ITASSERProp(StructProp):
    """Parse all available information for a local I-TASSER modeling run.

    Initializes a class to collect I-TASSER modeling information and optionally copy results to a new directory.

    SEE: https://zhanglab.ccmb.med.umich.edu/papers/2015_1.pdf for detailed information.

    Args:
        ident (str): ID of I-TASSER modeling run
        original_results_path (str): Path to I-TASSER modeling folder
        coach_results_folder (str): Path to original COACH results
        model_to_use (str): Which I-TASSER model to use. Default is "model1"

    """

    # TODO: parse 1) predicted SS (seq.dat) 2) predicted solvent acc (exp.dat) 3) B-factor (BFP.dat?)
    # Other? Top 10 proteins with similar structure in PDB: $datadir/model1/cofactor/similarpdb_model1.lst

    def __init__(self, ident, original_results_path, coach_results_folder='model1/coach', model_to_use='model1'):
        if not op.exists(original_results_path):
            raise OSError('{}: folder does not exist'.format(original_results_path))

        StructProp.__init__(self, ident, is_experimental=False)

        self._attrs_to_copy = []

        self.original_results_path = original_results_path
        self.model_to_use = model_to_use

        ### MODEL
        self.model_date = None
        original_model_path = op.join(original_results_path, '{}.pdb'.format(model_to_use))
        if not op.exists(original_model_path):
            raise IOError('{}: no homology model generated'.format(original_model_path))
        else:
            self.load_structure_path(structure_path=original_model_path, file_type='pdb')

        ### MODELING RESULTS
        self.difficulty = None
        self.top_template_pdb = None
        self.top_template_chain = None

        # Parse init.dat
        self._init_dat_path = op.join(original_results_path, 'init.dat')
        if op.exists(self._init_dat_path):
            self.update(parse_init_dat(self._init_dat_path))

        # Parse seq.dat
        self._seq_dat_path = op.join(original_results_path, 'seq.dat')
        if op.exists(self._seq_dat_path):
            self._attrs_to_copy.append('_seq_dat_path')
            # TODO: parse seq.dat and store in modeling_results
            self.update(parse_seq_dat(self._seq_dat_path))

        # Parse exp.dat
        self._exp_dat_path = op.join(original_results_path, 'exp.dat')
        if op.exists(self._exp_dat_path):
            self._attrs_to_copy.append('_exp_dat_path')
            # TODO: parse exp.dat and store in modeling_results
            self.update(parse_exp_dat(self._exp_dat_path))

        # Parse BFP.dat
        self._bfp_dat_path = op.join(original_results_path, 'BFP.dat')
        if op.exists(self._bfp_dat_path):
            self._attrs_to_copy.append('_bfp_dat_path')
            # TODO: parse BFP.dat and store in modeling_results
            self.update(parse_bfp_dat(self._bfp_dat_path))

        # Parse cscore
        self.c_score = None
        self.tm_score = None
        self.tm_score_err = None
        self.rmsd = None
        self.rmsd_err = None
        self._cscore_path = op.join(original_results_path, 'cscore')
        if op.exists(self._cscore_path):
            self._attrs_to_copy.append('_cscore_path')
            self.update(parse_cscore(self._cscore_path))

        ### COACH RESULTS
        self.coach_bsites = []
        self.coach_ec = []
        self.coach_go_mf = []
        self.coach_go_bp = []
        self.coach_go_cc = []

        coach_folder = op.join(original_results_path, coach_results_folder)
        self._coach_bsites_inf_path = op.join(coach_folder, 'Bsites.inf')
        self._coach_ec_dat_path = op.join(coach_folder, 'EC.dat')
        self._coach_go_mf_dat_path = op.join(coach_folder, 'GO_MF.dat')
        self._coach_go_bp_dat_path = op.join(coach_folder, 'GO_BP.dat')
        self._coach_go_cc_dat_path = op.join(coach_folder, 'GO_CC.dat')

        if op.isdir(coach_folder):
            # Parse Bsites.inf
            if op.exists(self._coach_bsites_inf_path):
                parsed_bsites = parse_coach_bsites_inf(self._coach_bsites_inf_path)
                if parsed_bsites:
                    self._attrs_to_copy.append('_coach_bsites_inf_path')
                    self.coach_bsites = parsed_bsites

            # Parse EC.dat
            if op.exists(self._coach_ec_dat_path):
                parsed_ec = parse_coach_ec(self._coach_ec_dat_path)
                if parsed_ec:
                    self._attrs_to_copy.append('_coach_ec_dat_path')
                    self.coach_ec = parsed_ec

            # Parse GO_MF.dat
            if op.exists(self._coach_go_mf_dat_path):
                parsed_go_mf = parse_coach_go(self._coach_go_mf_dat_path)
                if parsed_go_mf:
                    self._attrs_to_copy.append('_coach_go_mf_dat_path')
                    self.coach_go_mf = parsed_go_mf

            # Parse GO_BP.dat
            if op.exists(self._coach_go_bp_dat_path):
                parsed_go_bp = parse_coach_go(self._coach_go_bp_dat_path)
                if parsed_go_bp:
                    self._attrs_to_copy.append('_coach_go_bp_dat_path')
                    self.coach_go_bp = parsed_go_bp

            # Parse GO_CC.dat
            if op.exists(self._coach_go_cc_dat_path):
                parsed_go_cc = parse_coach_go(self._coach_go_cc_dat_path)
                if parsed_go_cc:
                    self._attrs_to_copy.append('_coach_go_cc_dat_path')
                    self.coach_go_cc = parsed_go_cc
    def load_structure_path(self, structure_path, file_type='pdb'):
        StructProp.load_structure_path(self, structure_path=structure_path, file_type=file_type)
        # Additionally save model creation date
        self.model_date = time.strftime('%Y-%m-%d', time.gmtime(os.path.getmtime(self.structure_path)))
    def copy_results(self, copy_to_dir, rename_model_to=None, force_rerun=False):
        """Copy the raw information from I-TASSER modeling to a new folder.

        Copies all files in the list _attrs_to_copy.

        Args:
            copy_to_dir (str): Directory to copy the minimal set of results per sequence.
            rename_model_to (str): New file name (without extension)
            force_rerun (bool): If existing models and results should be overwritten.

        """
        # Save path to the structure and copy it if specified
        if not rename_model_to:
            rename_model_to = self.model_to_use

        new_model_path = op.join(copy_to_dir, '{}.pdb'.format(rename_model_to))

        if self.structure_path:
            if ssbio.utils.force_rerun(flag=force_rerun, outfile=new_model_path):
                # Clean and save it
                custom_clean = CleanPDB()
                my_pdb = StructureIO(self.structure_path)
                new_model_path = my_pdb.write_pdb(custom_selection=custom_clean,
                                                  custom_name=rename_model_to,
                                                  out_dir=copy_to_dir,
                                                  force_rerun=force_rerun)

            # Update the structure_path to be the new clean file
            self.load_structure_path(structure_path=new_model_path, file_type='pdb')

        # Other modeling results - store in a new folder
        dest_itasser_dir = op.join(copy_to_dir, '{}_itasser'.format(rename_model_to))
        if not op.exists(dest_itasser_dir):
            os.mkdir(dest_itasser_dir)

        for attr in self._attrs_to_copy:
            old_file_path = getattr(self, attr)
            new_file_path = op.join(dest_itasser_dir, op.basename(old_file_path))
            if ssbio.utils.force_rerun(flag=force_rerun, outfile=new_file_path):
                shutil.copy2(old_file_path, new_file_path)
                log.debug('{}: copied from {}'.format(new_file_path, old_file_path))
            else:
                log.debug('{}: file already exists'.format(new_file_path))
            setattr(self, attr, new_file_path)

    @property
    def df_coach_bsites(self):
        df_cols = ['site_num', 'c_score', 'cluster_size', 'algorithm', 'pdb_template_id', 'pdb_template_chain',
                   'pdb_ligand', 'binding_location_coords', 'c_score_method', 'binding_residues',
                   'ligand_cluster_counts']

        bsites_inf_df = pd.DataFrame.from_records(self.coach_bsites, columns=df_cols).drop_duplicates().reset_index(drop=True)

        if bsites_inf_df.empty:
            log.warning('Empty dataframe')
            return bsites_inf_df
        else:
            bsites_inf_df['c_score'] = pd.to_numeric(bsites_inf_df.c_score, errors='coerce')
            bsites_inf_df['cluster_size'] = pd.to_numeric(bsites_inf_df.cluster_size, errors='coerce')
            return ssbio.utils.clean_df(bsites_inf_df)

    @property
    def df_coach_ec(self):
        if op.exists(self._coach_ec_dat_path):
            return parse_coach_ec_df(self._coach_ec_dat_path)
        else:
            log.warning('Empty dataframe')
            return pd.DataFrame()

    @property
    def df_coach_go(self):
        cols = ['go_id', 'go_term', 'c_score']
        go_all_df = pd.DataFrame()

        for go_list in [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp]:
            go_df = pd.DataFrame.from_records(go_list, columns=cols).drop_duplicates().reset_index(drop=True)
            go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')

            if go_all_df.empty:
                go_all_df = go_df
            else:
                # DataFrame.append returns a new frame, so reassign to accumulate results
                go_all_df = go_all_df.append(go_df)

        return go_all_df
    def get_dict(self, only_attributes=None, exclude_attributes=None, df_format=False):
        """Summarize the I-TASSER run in a dictionary containing modeling results and top predictions from COACH

        Args:
            only_attributes (str, list): Attributes that should be returned. If not provided, all are returned.
            exclude_attributes (str, list): Attributes that should be excluded.
            df_format (bool): If dictionary values should be formatted for a dataframe
                (everything possible is transformed into strings, int, or float -
                if something can't be transformed it is excluded)

        Returns:
            dict: Dictionary of attributes

        """
        to_exclude = ['coach_bsites', 'coach_ec', 'coach_go_mf', 'coach_go_bp', 'coach_go_cc']

        if not exclude_attributes:
            excluder = to_exclude
        else:
            excluder = ssbio.utils.force_list(exclude_attributes)
            excluder.extend(to_exclude)

        summary_dict = StructProp.get_dict(self, only_attributes=only_attributes, exclude_attributes=excluder,
                                           df_format=df_format)

        if self.coach_bsites:
            tmp = {'top_bsite_' + k: v for k, v in self.coach_bsites[0].items()}
            summary_dict.update(tmp)

        if self.coach_ec:
            tmp = {'top_ec_' + k: v for k, v in self.coach_ec[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_mf:
            tmp = {'top_go_mf_' + k: v for k, v in self.coach_go_mf[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_bp:
            tmp = {'top_go_bp_' + k: v for k, v in self.coach_go_bp[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_cc:
            tmp = {'top_go_cc_' + k: v for k, v in self.coach_go_cc[0].items()}
            summary_dict.update(tmp)

        return summary_dict
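
A minimal usage sketch for the class above. The identifier and directory paths are placeholders, and the run is assumed to be a finished local I-TASSER job with a model1.pdb present:

# Hypothetical example: load a finished I-TASSER run and summarize it
my_model = ITASSERProp(ident='P0ABP8', original_results_path='/path/to/P0ABP8')

# Copy the cleaned model and the key result files to a new directory
my_model.copy_results(copy_to_dir='/path/to/clean_models', rename_model_to='P0ABP8_model1')

# Dictionary of modeling scores plus the top COACH predictions (keys only present if COACH ran)
summary = my_model.get_dict()
print(summary.get('c_score'), summary.get('top_bsite_pdb_ligand'))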
def parse_init_dat(infile):
    """Parse the main init.dat file which contains the modeling results

    The first line of the file init.dat contains stuff like::

        "120 easy 40 8"

    The other lines look like this::

        " 161 11.051 1 1guqA MUSTER"

    and getting the first 10 gives you the top 10 templates used in modeling

    Args:
        infile (str): Path to init.dat

    Returns:
        dict: Dictionary of parsed information

    """
    # TODO: would be nice to get top 10 templates instead of just the top (see the sketch after this function)

    init_dict = {}

    log.debug('{}: reading file...'.format(infile))
    with open(infile, 'r') as f:
        # Get first 2 lines of file
        head = [next(f).strip() for x in range(2)]

    summary = head[0].split()
    difficulty = summary[1]

    top_template_info = head[1].split()
    top_template_pdbchain = top_template_info[3]
    top_template_pdb = top_template_pdbchain[:4]
    top_template_chain = top_template_pdbchain[4:]

    init_dict['difficulty'] = difficulty
    init_dict['top_template_pdb'] = top_template_pdb
    init_dict['top_template_chain'] = top_template_chain

    return init_dict
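
One way to address the TODO above (collecting the top N templates rather than only the first) is sketched below. It assumes each template line follows the format shown in the docstring, with the PDB ID plus chain in the fourth whitespace-separated field; ``parse_init_dat_templates`` is a hypothetical helper, not part of ssbio.

def parse_init_dat_templates(infile, num_templates=10):
    """Sketch: return the top N template PDB IDs/chains from init.dat (assumes the line format shown above)."""
    templates = []
    with open(infile, 'r') as f:
        lines = [l.strip() for l in f if l.strip()]
    # Skip the summary line, then read up to num_templates template lines
    for line in lines[1:num_templates + 1]:
        fields = line.split()
        if len(fields) < 4:
            continue
        pdbchain = fields[3]
        templates.append({'pdb_id': pdbchain[:4], 'chain': pdbchain[4:]})
    return templates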
def parse_seq_dat(infile):
    """Parse the secondary structure predictions in seq.dat

    Args:
        infile (str): Path to seq.dat

    Returns:
        list: List of secondary structure predictions for all residues

    """
    # TODO: parser for seq.dat
    seq_dict = {}
    return seq_dict
def parse_exp_dat(infile):
    """Parse the solvent accessibility predictions in exp.dat

    Args:
        infile (str): Path to exp.dat

    Returns:
        list: List of solvent accessibility predictions for all residues

    """
    # TODO: parser for exp.dat
    exp_dict = {}
    return exp_dict
def parse_bfp_dat(infile):
    """Parse the B-factor predictions in BFP.dat

    Args:
        infile (str): Path to BFP.dat

    Returns:
        list: List of B-factor predictions for all residues

    """
    # TODO: parser for BFP.dat
    bfp_dict = {}
    return bfp_dict
def parse_cscore(infile):
    """Parse the cscore file to return a dictionary of scores.

    Args:
        infile (str): Path to cscore

    Returns:
        dict: Dictionary of scores

    """
    cscore_dict = {}

    with open(infile, 'r') as f:
        for ll in f.readlines():
            # Look for the first line that starts with model1
            if ll.lower().startswith('model1'):
                l = ll.split()

                cscore = l[1]

                tmscore_full = l[2].split('+-')
                tmscore = tmscore_full[0]
                tmscore_err = tmscore_full[1]

                rmsd_full = l[3].split('+-')
                rmsd = rmsd_full[0]
                rmsd_err = rmsd_full[1]

                cscore_dict['c_score'] = float(cscore)
                cscore_dict['tm_score'] = float(tmscore)
                cscore_dict['tm_score_err'] = float(tmscore_err)
                cscore_dict['rmsd'] = float(rmsd)
                cscore_dict['rmsd_err'] = float(rmsd_err)

    return cscore_dict
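
For reference, the parser above expects a whitespace-separated summary line where the estimated TM-score and RMSD carry ``+-`` error terms. The line below is illustrative only (fabricated numbers in the expected layout, not real I-TASSER output), showing how those fields split out:

# Illustrative only - not real I-TASSER output
line = 'model1   1.50   0.93+-0.06   2.9+-2.1'
fields = line.split()
tm_score, tm_score_err = (float(x) for x in fields[2].split('+-'))
rmsd, rmsd_err = (float(x) for x in fields[3].split('+-'))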
def parse_coach_bsites_inf(infile):
    """Parse the Bsites.inf output file of COACH and return a list of rank-ordered binding site predictions

    Bsites.inf contains the summary of COACH clustering results after all other prediction algorithms have finished.
    For each site (cluster), there are three lines:

    - Line 1: site number, c-score of coach prediction, cluster size
    - Line 2: algorithm, PDB ID, ligand ID, center of binding site (cartesian coordinates),
      c-score of the algorithm's prediction, binding residues from single template
    - Line 3: Statistics of ligands in the cluster

    C-score information:

    - "In our training data, a prediction with C-score>0.35 has average false positive and false negative rates
      below 0.16 and 0.13, respectively." (https://zhanglab.ccmb.med.umich.edu/COACH/COACH.pdf)

    Args:
        infile (str): Path to Bsites.inf

    Returns:
        list: Ranked list of dictionaries, keys defined below

            - ``site_num``: cluster which is the consensus binding site
            - ``c_score``: confidence score of the cluster prediction
            - ``cluster_size``: number of predictions within this cluster
            - ``algorithm``: main? algorithm used to make the prediction
            - ``pdb_template_id``: PDB ID of the template used to make the prediction
            - ``pdb_template_chain``: chain of the PDB which has the ligand
            - ``pdb_ligand``: predicted ligand to bind
            - ``binding_location_coords``: centroid of the predicted ligand position in the homology model
            - ``c_score_method``: confidence score for the main algorithm
            - ``binding_residues``: predicted residues to bind the ligand
            - ``ligand_cluster_counts``: number of predictions per ligand

    """
    bsites_results = []

    with open(infile) as pp:
        lines = list(filter(None, (line.rstrip() for line in pp)))

    for i in range(len(lines) // 3):
        bsites_site_dict = {}

        line1 = lines[i * 3].split('\t')
        line2 = lines[i * 3 + 1].split('\t')
        line3 = lines[i * 3 + 2]

        bsites_site_dict['site_num'] = line1[0]
        bsites_site_dict['c_score'] = float(line1[1])
        bsites_site_dict['cluster_size'] = line1[2]

        bsites_site_dict['algorithm'] = line2[0]
        bsites_site_dict['pdb_template_id'] = line2[1][:4]
        bsites_site_dict['pdb_template_chain'] = line2[1][4]
        bsites_site_dict['pdb_ligand'] = line2[2]
        bsites_site_dict['binding_location_coords'] = tuple(float(x) for x in line2[3].split())

        # TODO: what's the difference between this c-score and the cluster's c-score?
        # how is the cluster's c-score computed? it's not the average c-score of all methods
        # also why are some COFACTOR c-scores >1?
        # 160411 - seems like the COFACTOR "BS-score" is being reported here, not its c-score...
        tmp_split = line2[4].split(' :')
        bsites_site_dict['c_score_method'] = tmp_split[0]
        bsites_site_dict['binding_residues'] = tmp_split[1]

        bsites_site_dict['ligand_cluster_counts'] = line3

        bsites_results.append(bsites_site_dict)

    return bsites_results
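
Following the C-score guidance quoted in the docstring, a caller might keep only the higher-confidence site predictions. A small sketch (the 0.35 cutoff comes from the COACH documentation cited above; the path is a placeholder):

# Keep only binding-site clusters above the C-score cutoff discussed in the COACH paper
bsites = parse_coach_bsites_inf('/path/to/model1/coach/Bsites.inf')
confident_sites = [site for site in bsites if site['c_score'] > 0.35]
for site in confident_sites:
    print(site['pdb_ligand'], site['binding_residues'])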
def parse_coach_ec_df(infile):
    """Parse the EC.dat output file of COACH and return a dataframe of results

    EC.dat contains the predicted EC number and active residues. The columns are: PDB_ID, TM-score, RMSD,
    Sequence identity, Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        DataFrame: Pandas DataFrame summarizing EC number predictions

    """
    ec_df = pd.read_table(infile, delim_whitespace=True,
                          names=['pdb_template', 'tm_score', 'rmsd', 'seq_ident', 'seq_coverage',
                                 'c_score', 'ec_number', 'binding_residues'])
    ec_df['pdb_template_id'] = ec_df['pdb_template'].apply(lambda x: x[:4])
    ec_df['pdb_template_chain'] = ec_df['pdb_template'].apply(lambda x: x[4])
    ec_df = ec_df[['pdb_template_id', 'pdb_template_chain', 'tm_score', 'rmsd',
                   'seq_ident', 'seq_coverage', 'c_score', 'ec_number', 'binding_residues']]
    ec_df['c_score'] = pd.to_numeric(ec_df.c_score, errors='coerce')
    return ec_df
def parse_coach_ec(infile):
    """Parse the EC.dat output file of COACH and return a list of rank-ordered EC number predictions

    EC.dat contains the predicted EC number and active residues. The columns are: PDB_ID, TM-score, RMSD,
    Sequence identity, Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        list: Ranked list of dictionaries, keys defined below

            - ``pdb_template_id``: PDB ID of the template used to make the prediction
            - ``pdb_template_chain``: chain of the PDB which has the ligand
            - ``tm_score``: TM-score of the template to the model (similarity score)
            - ``rmsd``: RMSD of the template to the model (also a measure of similarity)
            - ``seq_ident``: percent sequence identity
            - ``seq_coverage``: percent sequence coverage
            - ``c_score``: confidence score of the EC prediction
            - ``ec_number``: predicted EC number
            - ``binding_residues``: predicted residues to bind the ligand

    """
    return list(parse_coach_ec_df(infile).to_dict(orient='index').values())
def parse_coach_go(infile):
    """Parse a GO output file from COACH and return a rank-ordered list of GO term predictions

    The columns in all files are: GO terms, Confidence score, Name of GO terms. The files are:

    - GO_MF.dat - GO terms in 'molecular function'
    - GO_BP.dat - GO terms in 'biological process'
    - GO_CC.dat - GO terms in 'cellular component'

    Args:
        infile (str): Path to any COACH GO prediction file

    Returns:
        list: Ranked list of dictionaries, keys defined below

            - ``go_id``: GO term ID
            - ``go_term``: GO term text
            - ``c_score``: confidence score of the GO prediction

    """
    go_list = []

    with open(infile) as go_file:
        for line in go_file.readlines():
            go_dict = {}

            go_split = line.split()
            go_dict['go_id'] = go_split[0]
            go_dict['c_score'] = go_split[1]
            go_dict['go_term'] = ' '.join(go_split[2:])

            go_list.append(go_dict)

    return go_list
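
As with the other parsers, the returned list can be loaded into a dataframe, mirroring what the ``df_coach_go`` property does for all three GO categories. A brief sketch (the file path is a placeholder):

go_mf = parse_coach_go('/path/to/model1/coach/GO_MF.dat')
go_mf_df = pd.DataFrame.from_records(go_mf, columns=['go_id', 'go_term', 'c_score'])
go_mf_df['c_score'] = pd.to_numeric(go_mf_df.c_score, errors='coerce')
top_terms = go_mf_df.sort_values('c_score', ascending=False).head()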