Source code for ssbio.databases.swissmodel

"""
SWISSMODEL
==========
"""

import json
import logging
import requests
import ssbio.utils
import os.path as op
from collections import defaultdict

log = logging.getLogger(__name__)


[docs]class SWISSMODEL():
    """Methods to parse through a SWISS-MODEL metadata set.

    Download a particular organism's metadata from SWISS-MODEL here: https://swissmodel.expasy.org/repository

    Args:
        metadata_dir (str): Path to the extracted SWISS-MODEL_Repository folder

    """

    def __init__(self, metadata_dir):
        self.metadata_dir = metadata_dir
        """str: Path to the extracted SWISS-MODEL_Repository folder"""

        self.all_models = None
        """dict: Dictionary of lists, UniProt ID as the keys"""

        # Parse the INDEX_JSON file and then store all the metadata in all_models
        self.parse_metadata()

    @property
    def metadata_index_json(self):
        """str: Path to the INDEX_JSON file."""
        try:
            return op.join(self.metadata_dir, 'INDEX.json')
        except FileNotFoundError:
            return op.join(self.metadata_dir, 'INDEX_JSON')

    @property
    def uniprots_modeled(self):
        """list: Return all UniProt accession numbers with at least one model"""
        return list(self.all_models.keys())

[docs]    def parse_metadata(self):
        """Parse the INDEX_JSON file and reorganize it as a dictionary of lists."""

        all_models = defaultdict(list)

        with open(self.metadata_index_json) as f:
            loaded = json.load(f)

        for m in loaded['index']:
            all_models[m['uniprot_ac']].append(m)

        self.all_models = dict(all_models)

[docs]    def get_models(self, uniprot_acc):
        """Return all available models for a UniProt accession number.

        Args:
            uniprot_acc (str): UniProt ACC/ID

        Returns:
            dict: All available models in SWISS-MODEL for this UniProt entry

        """
        if uniprot_acc in self.all_models:
            return self.all_models[uniprot_acc]
        else:
            log.error('{}: no SWISS-MODELs available'.format(uniprot_acc))
            return None

[docs]    def get_model_filepath(self, infodict):
        """Get the path to the homology model using information from the index dictionary for a single model.

        Example: use self.get_models(UNIPROT_ID) to get all the models, which returns a list of dictionaries.
            Use one of those dictionaries as input to this function to get the filepath to the model itself.

        Args:
            infodict (dict): Information about a model from get_models

        Returns:
            str: Path to homology model

        """
        u = infodict['uniprot_ac']

        original_filename = '{}_{}_{}_{}'.format(infodict['from'], infodict['to'],
                                                 infodict['template'], infodict['coordinate_id'])
        file_path = op.join(self.metadata_dir, u[:2], u[2:4], u[4:6],
                            'swissmodel', '{}.pdb'.format(original_filename))

        if op.exists(file_path):
            return file_path
        else:
            log.warning('{}: no file {} found for model'.format(u, file_path))
            return None

[docs]    def download_models(self, uniprot_acc, outdir='', force_rerun=False):
        """Download all models available for a UniProt accession number.

        Args:
            uniprot_acc (str): UniProt ACC/ID
            outdir (str): Path to output directory, uses working directory if not set
            force_rerun (bool): Force a redownload the models if they already exist

        Returns:
            list: Paths to the downloaded models

        """
        downloaded = []
        subset = self.get_models(uniprot_acc)

        for entry in subset:
            ident = '{}_{}_{}_{}'.format(uniprot_acc, entry['template'], entry['from'], entry['to'])
            outfile = op.join(outdir, ident + '.pdb')

            if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
                response = requests.get(entry['url'])

                if response.status_code == 404:
                    log.error('{}: 404 returned, no model available.'.format(ident))

                else:
                    with open(outfile, 'w') as f:
                        f.write(response.text)

                    log.debug('{}: downloaded homology model'.format(ident))
                    downloaded.append(outfile)
            else:
                downloaded.append(outfile)

        return downloaded

[docs]    def organize_models(self, outdir, force_rerun=False):
        """Organize and rename SWISS-MODEL models to a single folder with a name containing template information.

        Args:
            outdir (str): New directory to copy renamed models to
            force_rerun (bool): If models should be copied again even if they already exist

        Returns:
            dict: Dictionary of lists, UniProt IDs as the keys and new file paths as the values

        """
        for u, models in self.all_models:
            for m in models:
                original_filename = '{}_{}_{}_{}'.format(m['from'], m['to'], m['template'], m['coordinate_id'])
                file_path = op.join(self.metadata_dir,
                                    u[:2], u[2:4], u[4:], 'swissmodel',
                                    '{}.pdb'.format(original_filename))
                if op.exists(file_path):
                    uniprot_to_swissmodel[uni].append(file_path)
                else:
                    log.warning('{}: no file {} found for model'.format(u, ))

[docs]def get_oligomeric_state(swiss_model_path):
    """Parse the oligomeric prediction in a SWISS-MODEL repository file

    As of 2018-02-26, works on all E. coli models. Untested on other pre-made organism models.

    Args:
        swiss_model_path (str): Path to SWISS-MODEL PDB file

    Returns:
        dict: Information parsed about the oligomeric state

    """
    oligo_info = {}
    with open(swiss_model_path, 'r') as f:
        for line in f:
            if line.startswith('REMARK   3 MODEL INFORMATION'):
                break
        for i in range(10):
            line = f.readline()
            if 'ENGIN' in line:
                oligo_info['ENGIN'] = line.rstrip().split(' ')[-1]
            elif 'OSTAT' in line:
                oligo_info['OSTAT'] = line.rstrip().split(' ')[-1]
            elif 'OSRSN' in line:
                oligo_info['OSRSN'] = line.rstrip().split(' ')[-1]
            elif 'QSPRD' in line:
                oligo_info['QSPRD'] = line.rstrip().split(' ')[-1]
            elif 'GMQE' in line:
                oligo_info['GMQE'] = line.rstrip().split(' ')[-1]
            elif 'QMN4' in line:
                oligo_info['QMN4'] = line.rstrip().split(' ')[-1]
            elif 'MODT' in line:
                oligo_info['MODT'] = line.rstrip().split(' ')[-1]
    return oligo_info


[docs]def translate_ostat(ostat):
    """Translate the OSTAT field to an integer.

    As of 2018-02-26, works on all E. coli models. Untested on other pre-made organism models.

    Args:
        ostat (str): Predicted oligomeric state of the PDB file

    Returns:
        int: Translated string to integer

    """
    ostat_lower = ostat.strip().lower()
    if ostat_lower == 'monomer':
        return 1
    elif ostat_lower == 'homo-dimer':
        return 2
    elif ostat_lower == 'homo-trimer':
        return 3
    elif ostat_lower == 'homo-tetramer':
        return 4
    elif ostat_lower == 'homo-pentamer':
        return 5
    elif ostat_lower == 'homo-hexamer':
        return 6
    elif ostat_lower == 'homo-heptamer':
        return 7
    elif ostat_lower == 'homo-octamer':
        return 8
    else:
        num = int(ostat_lower.split('-')[1])
        return num