Source code for ssbio.protein.structure.properties.fatcat

import ssbio.utils
import os.path as op
from bs4 import BeautifulSoup
import itertools
from tqdm import tqdm
import numpy as np
import os
import pandas as pd

__author__ = "Anand Sastry"
__email__ = "avsastry@eng.ucsd.edu"


[docs]def run_fatcat(structure_path_1, structure_path_2, fatcat_sh, outdir='', silent=False, print_cmd=False, force_rerun=False):
    """Run FATCAT on two PDB files, and return the path of the XML result file.
    
    Args:
        structure_path_1 (str): Path to PDB file 
        structure_path_2 (str): Path to PDB file 
        fatcat_sh (str): Path to "runFATCAT.sh" executable script
        outdir (str): Path to where FATCAT XML output files will be saved
        silent (bool): If stdout should be silenced from showing up in Python console output
        print_cmd (bool): If command to run FATCAT should be printed to stdout
        force_rerun (bool): If FATCAT should be run even if XML output files already exist 

    Returns:
        str: Path to XML output file

    """
    filename1 = op.splitext(op.basename(structure_path_1))[0]
    filename2 = op.splitext(op.basename(structure_path_2))[0]

    if not op.exists(outdir):
        os.mkdir(outdir)
    outfile = op.join(outdir, filename1 + '__' + filename2 + '.xml')

    # Run FATCAT on the structures, print the XML of the result to stdout
    fatcat_cmd = '{} -file1 {} -file2 {} -outFile {}'.format(fatcat_sh, structure_path_1, structure_path_2, outfile)
    if print_cmd:
        print(fatcat_cmd)
    ssbio.utils.command_runner(fatcat_cmd, force_rerun_flag=force_rerun, outfile_checker=outfile, silent=silent)

    return outfile


[docs]def run_fatcat_all_by_all(list_of_structure_paths, fatcat_sh, outdir='', silent=True, force_rerun=False):
    """Run FATCAT on all pairs of structures given a list of structures.
    
    Args:
        list_of_structure_paths (list): List of PDB file paths 
        fatcat_sh (str): Path to "runFATCAT.sh" executable script
        outdir (str): Path to where FATCAT XML output files will be saved 
        silent (bool): If command to run FATCAT should be printed to stdout
        force_rerun (bool): If FATCAT should be run even if XML output files already exist 

    Returns:
        Pandas DataFrame: TM-scores (similarity) between all structures

    """
    structure_ids = {x: i for i, x in enumerate(list_of_structure_paths)}

    comps = itertools.combinations(list_of_structure_paths, 2)
    tm_score_matrix = np.eye(len(list_of_structure_paths))

    for pdb1, pdb2 in tqdm(comps):
        fatcat_file = run_fatcat(pdb1, pdb2, fatcat_sh, outdir=outdir, silent=silent, force_rerun=force_rerun)
        tm_score_matrix[structure_ids[pdb1], structure_ids[pdb2]] = parse_fatcat(fatcat_file)['tm_score']
        tm_score_matrix[structure_ids[pdb2], structure_ids[pdb1]] = parse_fatcat(fatcat_file)['tm_score']

    # Convert to dataframe with filenames
    filenames = [op.splitext(op.basename(x))[0] for x in list_of_structure_paths]
    tm_score_matrix_annotated = pd.DataFrame(data=tm_score_matrix, columns=filenames, index=filenames)

    return tm_score_matrix_annotated


[docs]def parse_fatcat(fatcat_xml):
    """Parse a FATCAT XML result file.
    
    Args:
        fatcat_xml (str): Path to FATCAT XML result file 

    Returns:
        dict: Parsed information from the output
        
    Todo:
        - Only returning TM-score at the moment

    """
    fatcat_results = {}

    # Parse output xml file
    with open(fatcat_xml, 'r') as f:
        soup = BeautifulSoup(f, 'lxml')

    # Find the tmScore of the alignment
    if soup.find('block'):
        fatcat_results['tm_score'] = float(soup.find('afpchain')['tmscore'])

    return fatcat_results