Source code for ssbio.protein.structure.properties.dssp

import pandas as pd
import logging
import ssbio.utils
from collections import defaultdict
from six import iteritems
from Bio import PDB
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
from Bio.PDB.DSSP import residue_max_acc
from Bio.PDB.Polypeptide import aa1
from Bio.PDB.Polypeptide import one_to_three
log = logging.getLogger(__name__)


def get_dssp_df(model, pdb_file, dssp_exec='dssp', outfile=None, outdir=None, outext='_dssp.df', force_rerun=False):
    # Create the output file name
    outfile = ssbio.utils.outfile_maker(inname=pdb_file, outname=outfile, outdir=outdir, outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        try:
            # TODO: errors with non-standard residues, ie. MSE in 4Q6U or 1nfr
            # TODO: write command line executor for DSSP and parser for raw DSSP results
            dssp = PDB.DSSP(model=model, in_file=pdb_file, dssp=dssp_exec)
        except KeyError:
            return pd.DataFrame()

        if len(dssp.property_list) == 0:
            return pd.DataFrame()

        # Reorganize the results into a csv file
        appender = []
        for k in dssp.property_keys:
            to_append = []
            x = dssp.property_dict[k]
            chain = k[0]
            residue = k[1]
            het = residue[0]
            resnum = residue[1]
            icode = residue[2]
            to_append.extend([chain, resnum, icode])
            to_append.extend(x)
            appender.append(to_append)

        cols = ['chain', 'resnum', 'icode',
                'dssp_index', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi',
                'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
                'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
                'O_NH_2_relidx', 'O_NH_2_energy']

        df = pd.DataFrame.from_records(appender, columns=cols)

        # Adding additional columns
        df = df[df['aa'].isin(list(aa1))]
        df['aa_three'] = df['aa'].apply(one_to_three)
        df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
        df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float)
        df['exposure_asa'] = df['exposure_rsa'] * df['max_acc']

        df.to_csv(outfile)
    else:
        log.debug('{}: already ran DSSP and force_rerun={}, loading results'.format(outfile, force_rerun))
        df = pd.read_csv(outfile, index_col=0)

    return df


[docs]def get_dssp_df_on_file(pdb_file, outfile=None, outdir=None, outext='_dssp.df', force_rerun=False): """Run DSSP directly on a structure file with the Biopython method Bio.PDB.DSSP.dssp_dict_from_pdb_file Avoids errors like: PDBException: Structure/DSSP mismatch at <Residue MSE het= resseq=19 icode= > by not matching information to the structure file (DSSP fills in the ID "X" for unknown residues) Args: pdb_file: Path to PDB file outfile: Name of output file outdir: Path to output directory outext: Extension of output file force_rerun: If DSSP should be rerun if the outfile exists Returns: DataFrame: DSSP results summarized """ # TODO: function unfinished # Create the output file name outfile = ssbio.utils.outfile_maker(inname=pdb_file, outname=outfile, outdir=outdir, outext=outext) if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile): try: d = dssp_dict_from_pdb_file(pdb_file) except Exception('DSSP failed to produce an output'): log.error('{}: unable to run DSSP'.format(pdb_file)) return pd.DataFrame() appender = [] # TODO: WARNING: d is slightly different than when using function get_dssp_df for k in d[1]: to_append = [] y = d[0][k] chain = k[0] residue = k[1] het = residue[0] resnum = residue[1] icode = residue[2] to_append.extend([chain, resnum, icode]) to_append.extend(y) appender.append(to_append) cols = ['chain', 'resnum', 'icode', 'dssp_index', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi', 'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 'O_NH_2_energy'] df = pd.DataFrame.from_records(appender, columns=cols) # Adding additional columns df = df[df['aa'].isin(list(aa1))] df['aa_three'] = df['aa'].apply(one_to_three) df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get) df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float) df['exposure_asa'] = df['exposure_rsa'] * df['max_acc'] df.to_csv(outfile) else: log.debug('{}: already ran DSSP and force_rerun={}, loading results'.format(outfile, force_rerun)) df = pd.read_csv(outfile, index_col=0)
return df
[docs]def secondary_structure_summary(dssp_df): """Summarize the secondary structure content of the DSSP dataframe for each chain. Args: dssp_df: Pandas DataFrame of parsed DSSP results Returns: dict: Chain to secondary structure summary dictionary """ chains = dssp_df.chain.unique() infodict = {} for chain in chains: expoinfo = defaultdict(int) chain_df = dssp_df[dssp_df.chain == chain] counts = chain_df.ss.value_counts() total = float(len(chain_df)) for ss, count in iteritems(counts): if ss == '-': expoinfo['percent_C-dssp'] = count/total if ss == 'H': expoinfo['percent_H-dssp'] = count/total if ss == 'B': expoinfo['percent_B-dssp'] = count/total if ss == 'E': expoinfo['percent_E-dssp'] = count/total if ss == 'G': expoinfo['percent_G-dssp'] = count/total if ss == 'I': expoinfo['percent_I-dssp'] = count/total if ss == 'T': expoinfo['percent_T-dssp'] = count/total if ss == 'S': expoinfo['percent_S-dssp'] = count/total # Filling in 0 percenters for per in ['percent_C-dssp','percent_H-dssp','percent_B-dssp','percent_E-dssp', 'percent_G-dssp','percent_I-dssp','percent_T-dssp','percent_S-dssp']: if per not in expoinfo: expoinfo[per] = 0.0 infodict[chain] = dict(expoinfo)
return infodict # TODO: below methods have not been fixed
[docs]def calc_surface_buried(dssp_df): '''Calculates the percent of residues that are in the surface or buried, as well as if they are polar or nonpolar. Returns a dictionary of this. ''' SN = 0 BN = 0 SP = 0 SNP = 0 SPo = 0 SNe = 0 BNP = 0 BP = 0 BPo = 0 BNe = 0 Total = 0 sbinfo = {} df_min = dssp_df[['aa_three', 'exposure_asa']] if len(df_min) == 0: return sbinfo else: for i, r in df_min.iterrows(): res = r.aa_three area = r.exposure_asa if res in AAdict: if AAdict[res] == 'nonpolar' and area > 3: SNP = SNP + 1 SN = SN + 1 elif AAdict[res] == 'polar' and area > 3: SP = SP + 1 SN = SN + 1 elif AAdict[res] == 'positive' and area > 3: SPo = SPo + 1 SN = SN + 1 elif AAdict[res] == 'negative' and area > 3: SNe = SNe + 1 SN = SN + 1 elif AAdict[res] == 'positive' and area <= 3: BPo = BPo + 1 BN = BN + 1 elif AAdict[res] == 'negative' and area <= 3: BNe = BNe + 1 BN = BN + 1 elif AAdict[res] == 'polar' and area <= 3: BP = BP + 1 BN = BN + 1 elif AAdict[res] == 'nonpolar' and area <= 3: BNP = BNP + 1 BN = BN + 1 Total = float(BN + SN) pSNP = float(SNP) / Total pSP = float(SP) / Total pSPo = float(SPo) / Total pSNe = float(SNe) / Total pBNP = float(BNP) / Total pBP = float(BP) / Total pBPo = float(BPo) / Total pBNe = float(BNe) / Total pBN = float(BN) / Total pSN = float(SN) / Total sbinfo['ssb_per_S_NP'] = pSNP sbinfo['ssb_per_S_P'] = pSP sbinfo['ssb_per_S_pos'] = pSPo sbinfo['ssb_per_S_neg'] = pSNe sbinfo['ssb_per_B_NP'] = pBNP sbinfo['ssb_per_B_P'] = pBP sbinfo['ssb_per_B_pos'] = pBPo sbinfo['ssb_per_B_neg'] = pBNe sbinfo['ssb_per_S'] = pSN sbinfo['ssb_per_B'] = pBN
return sbinfo
[docs]def calc_sasa(dssp_df): """ Calculation of SASA utilizing the DSSP program. DSSP must be installed for biopython to properly call it. Install using apt-get on Ubuntu or from: http://swift.cmbi.ru.nl/gv/dssp/ Input: PDB or CIF structure file Output: SASA (integer) of structure """ infodict = {'ssb_sasa': dssp_df.exposure_asa.sum(), 'ssb_mean_rel_exposed': dssp_df.exposure_rsa.mean(), 'ssb_size': len(dssp_df)}
return infodict
[docs]def all_dssp_props(filename, file_type): '''Returns a large dictionary of SASA, secondary structure composition, and surface/buried composition. Values are computed using DSSP. Input: PDB or MMCIF filename Output: Dictionary of values obtained from dssp ''' t = dssp_dataframe(filename, file_type) # print(t) sasa = calc_sasa(t) sstr = secondary_structure_summary(t) subu = calc_surface_buried(t) sasa.update(sstr) sasa.update(subu)
return sasa
[docs]def get_ss_class(pdb_file, dssp_file, chain): """Define the secondary structure class of a PDB file at the specific chain Args: pdb_file: dssp_file: chain: Returns: """ prag = pr.parsePDB(pdb_file) pr.parseDSSP(dssp_file, prag) alpha, threeTen, beta = get_dssp_ss_content_multiplechains(prag, chain) if alpha == 0 and beta > 0: classification = 'all-beta' elif beta == 0 and alpha > 0: classification = 'all-alpha' elif beta == 0 and alpha == 0: classification = 'mixed' elif float(alpha) / beta >= 20: classification = 'all-alpha' else: classification = 'mixed'
return classification