import logging
import os
import os.path as op
import shutil
import time
import pandas as pd
from ssbio.protein.structure.utils.cleanpdb import CleanPDB
from ssbio.protein.structure.utils.structureio import StructureIO
import ssbio.utils
from ssbio.protein.structure.structprop import StructProp
log = logging.getLogger(__name__)
class ITASSERProp(StructProp):
    """Parse all available information for a local I-TASSER modeling run.

    Initializes a class to collect I-TASSER modeling information and optionally
    copy results to a new directory.

    SEE: https://zhanglab.ccmb.med.umich.edu/papers/2015_1.pdf for detailed information.

    Args:
        ident (str): ID of I-TASSER modeling run
        original_results_path (str): Path to I-TASSER modeling folder
        coach_results_folder (str): Path to original COACH results
        model_to_use (str): Which I-TASSER model to use. Default is "model1"

    """

    # TODO: parse 1) predicted SS (seq.dat) 2) predicted solvent acc (exp.dat) 3) B-factor (BFP.dat?)
    # Other? Top 10 proteins with similar structure in PDB: $datadir/model1/cofactor/similarpdb_model1.lst

    def __init__(self, ident, original_results_path, coach_results_folder='model1/coach', model_to_use='model1'):
        if not op.exists(original_results_path):
            raise OSError('{}: folder does not exist'.format(original_results_path))

        StructProp.__init__(self, ident, is_experimental=False)

        # Names of path attributes whose files copy_results() should carry over
        self._attrs_to_copy = []

        self.original_results_path = original_results_path
        self.model_to_use = model_to_use

        ### MODEL
        self.model_date = None
        original_model_path = op.join(original_results_path, '{}.pdb'.format(model_to_use))
        if not op.exists(original_model_path):
            raise IOError('{}: no homology model generated'.format(original_model_path))
        else:
            self.load_structure_path(structure_path=original_model_path, file_type='pdb')

        ### MODELING RESULTS
        self.difficulty = None
        self.top_template_pdb = None
        self.top_template_chain = None

        # Parse init.dat (modeling difficulty + top template)
        self._init_dat_path = op.join(original_results_path, 'init.dat')
        if op.exists(self._init_dat_path):
            self.update(parse_init_dat(self._init_dat_path))

        # Parse seq.dat
        self._seq_dat_path = op.join(original_results_path, 'seq.dat')
        if op.exists(self._seq_dat_path):
            self._attrs_to_copy.append('_seq_dat_path')
            # TODO: parse seq.dat and store in modeling_results
            self.update(parse_seq_dat(self._seq_dat_path))

        # Parse exp.dat
        self._exp_dat_path = op.join(original_results_path, 'exp.dat')
        if op.exists(self._exp_dat_path):
            self._attrs_to_copy.append('_exp_dat_path')
            # TODO: parse exp.dat and store in modeling_results
            self.update(parse_exp_dat(self._exp_dat_path))

        # Parse BFP.dat
        self._bfp_dat_path = op.join(original_results_path, 'BFP.dat')
        if op.exists(self._bfp_dat_path):
            self._attrs_to_copy.append('_bfp_dat_path')
            # TODO: parse BFP.dat and store in modeling_results
            self.update(parse_bfp_dat(self._bfp_dat_path))

        # Parse cscore (confidence scores of the model itself)
        self.c_score = None
        self.tm_score = None
        self.tm_score_err = None
        self.rmsd = None
        self.rmsd_err = None
        self._cscore_path = op.join(original_results_path, 'cscore')
        if op.exists(self._cscore_path):
            self._attrs_to_copy.append('_cscore_path')
            self.update(parse_cscore(self._cscore_path))

        ### COACH RESULTS
        self.coach_bsites = []
        self.coach_ec = []
        self.coach_go_mf = []
        self.coach_go_bp = []
        self.coach_go_cc = []

        coach_folder = op.join(original_results_path, coach_results_folder)
        self._coach_bsites_inf_path = op.join(coach_folder, 'Bsites.inf')
        self._coach_ec_dat_path = op.join(coach_folder, 'EC.dat')
        self._coach_go_mf_dat_path = op.join(coach_folder, 'GO_MF.dat')
        self._coach_go_bp_dat_path = op.join(coach_folder, 'GO_BP.dat')
        self._coach_go_cc_dat_path = op.join(coach_folder, 'GO_CC.dat')

        if op.isdir(coach_folder):
            # Each COACH output file maps to (path attribute, parser, result attribute);
            # only files that exist AND parse to something non-empty are kept/copied.
            coach_outputs = [('_coach_bsites_inf_path', parse_coach_bsites_inf, 'coach_bsites'),
                             ('_coach_ec_dat_path', parse_coach_ec, 'coach_ec'),
                             ('_coach_go_mf_dat_path', parse_coach_go, 'coach_go_mf'),
                             ('_coach_go_bp_dat_path', parse_coach_go, 'coach_go_bp'),
                             ('_coach_go_cc_dat_path', parse_coach_go, 'coach_go_cc')]
            for path_attr, parser, result_attr in coach_outputs:
                file_path = getattr(self, path_attr)
                if op.exists(file_path):
                    parsed = parser(file_path)
                    if parsed:
                        self._attrs_to_copy.append(path_attr)
                        setattr(self, result_attr, parsed)

    def load_structure_path(self, structure_path, file_type='pdb'):
        """Load a structure file path and additionally record the model creation date.

        Args:
            structure_path (str): Path to the structure file
            file_type (str): Type of the structure file (default "pdb")
        """
        StructProp.load_structure_path(self, structure_path=structure_path, file_type=file_type)
        # Model creation date = modification time of the structure file on disk
        self.model_date = time.strftime('%Y-%m-%d', time.gmtime(os.path.getmtime(self.structure_path)))

    def copy_results(self, copy_to_dir, rename_model_to=None, force_rerun=False):
        """Copy the raw information from I-TASSER modeling to a new folder.

        Copies all files in the list _attrs_to_copy.

        Args:
            copy_to_dir (str): Directory to copy the minimal set of results per sequence.
            rename_model_to (str): New file name (without extension)
            force_rerun (bool): If existing models and results should be overwritten.

        """
        # Save path to the structure and copy it if specified
        if not rename_model_to:
            rename_model_to = self.model_to_use

        new_model_path = op.join(copy_to_dir, '{}.pdb'.format(rename_model_to))

        if self.structure_path:
            if ssbio.utils.force_rerun(flag=force_rerun, outfile=new_model_path):
                # Clean and save it
                custom_clean = CleanPDB()
                my_pdb = StructureIO(self.structure_path)
                new_model_path = my_pdb.write_pdb(custom_selection=custom_clean,
                                                  custom_name=rename_model_to,
                                                  out_dir=copy_to_dir,
                                                  force_rerun=force_rerun)

            # Update the structure_path to be the new clean file
            self.load_structure_path(structure_path=new_model_path, file_type='pdb')

        # Other modeling results - store in a new folder
        dest_itasser_dir = op.join(copy_to_dir, '{}_itasser'.format(rename_model_to))
        if not op.exists(dest_itasser_dir):
            os.mkdir(dest_itasser_dir)

        for attr in self._attrs_to_copy:
            old_file_path = getattr(self, attr)
            new_file_path = op.join(dest_itasser_dir, op.basename(old_file_path))
            if ssbio.utils.force_rerun(flag=force_rerun, outfile=new_file_path):
                shutil.copy2(old_file_path, new_file_path)
                log.debug('{}: copied from {}'.format(new_file_path, old_file_path))
            else:
                log.debug('{}: file already exists'.format(new_file_path))
            # Point the attribute at the copied file
            setattr(self, attr, new_file_path)

    @property
    def df_coach_bsites(self):
        """DataFrame: COACH binding site predictions (empty frame if none parsed)."""
        df_cols = ['site_num', 'c_score', 'cluster_size', 'algorithm',
                   'pdb_template_id', 'pdb_template_chain', 'pdb_ligand',
                   'binding_location_coords', 'c_score_method', 'binding_residues',
                   'ligand_cluster_counts']
        bsites_inf_df = pd.DataFrame.from_records(self.coach_bsites, columns=df_cols).drop_duplicates().reset_index(drop=True)

        if bsites_inf_df.empty:
            log.warning('Empty dataframe')
            return bsites_inf_df

        bsites_inf_df['c_score'] = pd.to_numeric(bsites_inf_df.c_score, errors='coerce')
        bsites_inf_df['cluster_size'] = pd.to_numeric(bsites_inf_df.cluster_size, errors='coerce')
        return ssbio.utils.clean_df(bsites_inf_df)

    @property
    def df_coach_ec(self):
        """DataFrame: COACH EC number predictions (empty frame if EC.dat is unavailable)."""
        # BUGFIX: the path attribute is always assigned in __init__, so a plain
        # truthiness test never detected a missing file -- check existence on disk.
        if self._coach_ec_dat_path and op.exists(self._coach_ec_dat_path):
            return parse_coach_ec_df(self._coach_ec_dat_path)
        else:
            log.warning('Empty dataframe')
            return pd.DataFrame()

    @property
    def df_coach_go(self):
        """DataFrame: all COACH GO term predictions (MF, CC, and BP combined)."""
        cols = ['go_id', 'go_term', 'c_score']
        per_category = []
        for go_list in [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp]:
            go_df = pd.DataFrame.from_records(go_list, columns=cols).drop_duplicates().reset_index(drop=True)
            go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')
            per_category.append(go_df)
        # BUGFIX: the original called go_all_df.append(go_df) and discarded the
        # returned frame (DataFrame.append is not in-place and was removed in
        # pandas 2.0), so only the first category was ever returned.
        return pd.concat(per_category, ignore_index=True)

    def get_dict(self, only_attributes=None, exclude_attributes=None, df_format=False):
        """Summarize the I-TASSER run in a dictionary containing modeling results and top predictions from COACH

        Args:
            only_attributes (str, list): Attributes that should be returned. If not provided, all are returned.
            exclude_attributes (str, list): Attributes that should be excluded.
            df_format (bool): If dictionary values should be formatted for a dataframe
                (everything possible is transformed into strings, int, or float -
                if something can't be transformed it is excluded)

        Returns:
            dict: Dictionary of attributes

        """
        # The full COACH lists are excluded; only the top (rank 1) hit of each
        # prediction type is flattened into the summary below.
        to_exclude = ['coach_bsites', 'coach_ec', 'coach_go_mf', 'coach_go_bp', 'coach_go_cc']
        if not exclude_attributes:
            excluder = to_exclude
        else:
            excluder = ssbio.utils.force_list(exclude_attributes)
            excluder.extend(to_exclude)

        summary_dict = StructProp.get_dict(self, only_attributes=only_attributes,
                                           exclude_attributes=excluder,
                                           df_format=df_format)

        if self.coach_bsites:
            tmp = {'top_bsite_' + k: v for k, v in self.coach_bsites[0].items()}
            summary_dict.update(tmp)

        if self.coach_ec:
            tmp = {'top_ec_' + k: v for k, v in self.coach_ec[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_mf:
            tmp = {'top_go_mf_' + k: v for k, v in self.coach_go_mf[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_bp:
            tmp = {'top_go_bp_' + k: v for k, v in self.coach_go_bp[0].items()}
            summary_dict.update(tmp)

        if self.coach_go_cc:
            tmp = {'top_go_cc_' + k: v for k, v in self.coach_go_cc[0].items()}
            summary_dict.update(tmp)

        return summary_dict
def parse_init_dat(infile):
    """Parse the main init.dat file which contains the modeling results

    The first line of the file init.dat contains stuff like::

        "120 easy 40 8"

    The other lines look like this::

        " 161 11.051 1 1guqA MUSTER"

    and getting the first 10 gives you the top 10 templates used in modeling

    Args:
        infile (str): Path to init.dat

    Returns:
        dict: Dictionary of parsed information

    """
    # TODO: would be nice to get top 10 templates instead of just the top
    logging.getLogger(__name__).debug('{}: reading file...'.format(infile))

    # Only the first two lines are needed: summary line + top template line
    with open(infile, 'r') as handle:
        summary_fields = next(handle).strip().split()
        top_template_fields = next(handle).strip().split()

    # Field 3 of the template line is the PDB ID fused with the chain ID
    pdb_and_chain = top_template_fields[3]

    return {'difficulty': summary_fields[1],
            'top_template_pdb': pdb_and_chain[:4],
            'top_template_chain': pdb_and_chain[4:]}
def parse_seq_dat(infile):
    """Parse the secondary structure predictions in seq.dat

    Args:
        infile (str): Path to seq.dat

    Returns:
        dict: Dictionary of parsed information. Currently always empty --
            the seq.dat parser is not yet implemented (see TODO below).

    """
    # TODO: parser for seq.dat
    seq_dict = {}
    return seq_dict
def parse_exp_dat(infile):
    """Parse the solvent accessibility predictions in exp.dat

    Args:
        infile (str): Path to exp.dat

    Returns:
        dict: Dictionary of parsed information. Currently always empty --
            the exp.dat parser is not yet implemented (see TODO below).

    """
    # TODO: parser for exp.dat
    exp_dict = {}
    return exp_dict
def parse_bfp_dat(infile):
    """Parse the B-factor predictions in BFP.dat

    Args:
        infile (str): Path to BFP.dat

    Returns:
        dict: Dictionary of parsed information. Currently always empty --
            the BFP.dat parser is not yet implemented (see TODO below).

    """
    # TODO: parser for BFP.dat
    bfp_dict = {}
    return bfp_dict
def parse_cscore(infile):
    """Parse the cscore file to return a dictionary of scores.

    Args:
        infile (str): Path to cscore

    Returns:
        dict: Dictionary of scores

    """
    scores = {}

    with open(infile, 'r') as handle:
        for raw_line in handle:
            # Only the summary line that starts with "model1" carries the scores
            if not raw_line.lower().startswith('model1'):
                continue

            fields = raw_line.split()
            # TM-score and RMSD come with "+-" error bounds attached
            tm_parts = fields[2].split('+-')
            rmsd_parts = fields[3].split('+-')

            scores['c_score'] = float(fields[1])
            scores['tm_score'] = float(tm_parts[0])
            scores['tm_score_err'] = float(tm_parts[1])
            scores['rmsd'] = float(rmsd_parts[0])
            scores['rmsd_err'] = float(rmsd_parts[1])

    return scores
def parse_coach_bsites_inf(infile):
    """Parse the Bsites.inf output file of COACH and return a list of rank-ordered binding site predictions

    Bsites.inf contains the summary of COACH clustering results after all other prediction algorithms have finished

    For each site (cluster), there are three lines:

    - Line 1: site number, c-score of coach prediction, cluster size
    - Line 2: algorithm, PDB ID, ligand ID, center of binding site (cartesian coordinates),
      c-score of the algorithm's prediction, binding residues from single template
    - Line 3: Statistics of ligands in the cluster

    C-score information:

    - "In our training data, a prediction with C-score>0.35 has average false positive and false negative rates below
      0.16 and 0.13, respectively." (https://zhanglab.ccmb.med.umich.edu/COACH/COACH.pdf)

    Args:
        infile (str): Path to Bsites.inf

    Returns:
        list: Ranked list of dictionaries, keys defined below

        - ``site_num``: cluster which is the consensus binding site
        - ``c_score``: confidence score of the cluster prediction
        - ``cluster_size``: number of predictions within this cluster
        - ``algorithm``: main? algorithm used to make the prediction
        - ``pdb_template_id``: PDB ID of the template used to make the prediction
        - ``pdb_template_chain``: chain of the PDB which has the ligand
        - ``pdb_ligand``: predicted ligand to bind
        - ``binding_location_coords``: centroid of the predicted ligand position in the homology model
        - ``c_score_method``: confidence score for the main algorithm
        - ``binding_residues``: predicted residues to bind the ligand
        - ``ligand_cluster_counts``: number of predictions per ligand

    """
    with open(infile) as handle:
        # Drop blank lines so the records pack into clean triples
        non_empty = [stripped for stripped in (raw.rstrip() for raw in handle) if stripped]

    results = []
    # Records come in groups of three lines: summary, best prediction, ligand stats
    for start in range(0, len(non_empty) - 2, 3):
        summary = non_empty[start].split('\t')
        prediction = non_empty[start + 1].split('\t')
        ligand_stats = non_empty[start + 2]

        # TODO: what's the difference between this c-score and the cluster's c-score?
        # how is the cluster's c-score computed? it's not the average c-score of all methods
        # also why are some COFACTOR c-scores >1?
        # 160411 - seems like the COFACTOR "BS-score" is being reported here, not its c-score...
        method_and_residues = prediction[4].split(' :')

        results.append({
            'site_num': summary[0],
            'c_score': float(summary[1]),
            'cluster_size': summary[2],
            'algorithm': prediction[0],
            'pdb_template_id': prediction[1][:4],
            'pdb_template_chain': prediction[1][4],
            'pdb_ligand': prediction[2],
            'binding_location_coords': tuple(float(coord) for coord in prediction[3].split()),
            'c_score_method': method_and_residues[0],
            'binding_residues': method_and_residues[1],
            'ligand_cluster_counts': ligand_stats,
        })

    return results
def parse_coach_ec_df(infile):
    """Parse the EC.dat output file of COACH and return a dataframe of results

    EC.dat contains the predicted EC number and active residues.
    The columns are: PDB_ID, TM-score, RMSD, Sequence identity,
    Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        DataFrame: Pandas DataFrame summarizing EC number predictions

    """
    # sep=r'\s+' replaces the deprecated delim_whitespace=True
    # (deprecated in pandas 2.1, removed in pandas 3.0)
    ec_df = pd.read_csv(infile, sep=r'\s+',
                        names=['pdb_template', 'tm_score', 'rmsd', 'seq_ident', 'seq_coverage',
                               'c_score', 'ec_number', 'binding_residues'])

    # The first column fuses the 4-character PDB ID with the chain ID
    ec_df['pdb_template_id'] = ec_df['pdb_template'].apply(lambda x: x[:4])
    ec_df['pdb_template_chain'] = ec_df['pdb_template'].apply(lambda x: x[4])

    ec_df = ec_df[['pdb_template_id', 'pdb_template_chain', 'tm_score', 'rmsd',
                   'seq_ident', 'seq_coverage', 'c_score', 'ec_number', 'binding_residues']]
    ec_df['c_score'] = pd.to_numeric(ec_df.c_score, errors='coerce')
    return ec_df
def parse_coach_ec(infile):
    """Parse the EC.dat output file of COACH and return a list of rank-ordered EC number predictions

    EC.dat contains the predicted EC number and active residues.
    The columns are: PDB_ID, TM-score, RMSD, Sequence identity,
    Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        list: Ranked list of dictionaries, keys defined below

        - ``pdb_template_id``: PDB ID of the template used to make the prediction
        - ``pdb_template_chain``: chain of the PDB which has the ligand
        - ``tm_score``: TM-score of the template to the model (similarity score)
        - ``rmsd``: RMSD of the template to the model (also a measure of similarity)
        - ``seq_ident``: percent sequence identity
        - ``seq_coverage``: percent sequence coverage
        - ``c_score``: confidence score of the EC prediction
        - ``ec_number``: predicted EC number
        - ``binding_residues``: predicted residues to bind the ligand

    """
    # Reuse the dataframe parser, then flatten each row into a dictionary;
    # dict order follows the (rank-ordered) row index.
    by_rank = parse_coach_ec_df(infile).to_dict(orient='index')
    return list(by_rank.values())
def parse_coach_go(infile):
    """Parse a GO output file from COACH and return a rank-ordered list of GO term predictions

    The columns in all files are: GO terms, Confidence score, Name of GO terms. The files are:

    - GO_MF.dat - GO terms in 'molecular function'
    - GO_BP.dat - GO terms in 'biological process'
    - GO_CC.dat - GO terms in 'cellular component'

    Args:
        infile (str): Path to any COACH GO prediction file

    Returns:
        Pandas DataFrame: Organized dataframe of results, columns defined below

        - ``go_id``: GO term ID
        - ``go_term``: GO term text
        - ``c_score``: confidence score of the GO prediction

    """
    predictions = []

    with open(infile) as handle:
        for raw_line in handle:
            fields = raw_line.split()
            # Columns: GO ID, confidence score, then the (possibly multi-word) term text
            predictions.append({'go_id': fields[0],
                                'c_score': fields[1],
                                'go_term': ' '.join(fields[2:])})

    return predictions