# Source code for ssbio.databases.pisa

"""
PISA
===========
"""

import requests
from collections import defaultdict
from copy import deepcopy
import ssbio.utils
import os
import os.path as op
from lxml import etree
import logging
import glob

# Module-level logger for this module (ssbio.databases.pisa)
log = logging.getLogger(__name__)


def download_pisa_multimers_xml(pdb_ids, save_single_xml_files=True, outdir=None, force_rerun=False):
    """Download the PISA XML file for multimers.

    See: http://www.ebi.ac.uk/pdbe/pisa/pi_download.html for more info

    XML description of macromolecular assemblies:
        http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?pdbcodelist
        where "pdbcodelist" is a comma-separated (strictly no spaces) list of PDB codes. The resulting file contain
        XML output of assembly data, equivalent to that displayed in PISA assembly pages, for each of the specified
        PDB entries.

    NOTE: If a mass-download is intended, please minimize the number of retrievals by specifying as many PDB codes
    in the URL as feasible (20-50 is a good range), and never send another URL request until the previous one has
    been completed (meaning that the multimers.pisa file has been downloaded). Excessive requests will silently
    die in the server queue.

    Args:
        pdb_ids (str, list): PDB ID or list of IDs
        save_single_xml_files (bool): If single XML files should be saved per PDB ID. If False, if multiple PDB IDs
            are provided, then a single, combined XML output file is downloaded
        outdir (str): Directory to output PISA XML files
        force_rerun (bool): Redownload files if they already exist

    Returns:
        dict: Mapping of PDB ID to the path of the XML file containing its results

    """
    if not outdir:
        outdir = os.getcwd()

    files = {}
    pdb_ids = ssbio.utils.force_lower_list(sorted(pdb_ids))

    # If we want to save single PISA XML files per PDB ID...
    if save_single_xml_files:
        # Check for existing PISA XML files
        if not force_rerun:
            existing_files = [op.basename(x) for x in glob.glob(op.join(outdir, '*_multimers.pisa.xml'))]
            # Store the paths to these files to return
            files = {v.split('_')[0]: op.join(outdir, v) for v in existing_files}
            log.debug('Already downloaded PISA files for {}'.format(list(files.keys())))
        else:
            existing_files = []

        # Filter PDB IDs based on existing files
        pdb_ids = [x for x in pdb_ids if '{}_multimers.pisa.xml'.format(x) not in existing_files]

        # Split the list to limit requests (PISA recommends 20-50 codes per request)
        split_list = ssbio.utils.split_list_by_n(pdb_ids, 40)

        # Download PISA files
        for l in split_list:
            pdbs = ','.join(l)
            all_pisa_link = 'http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?{}'.format(pdbs)
            r = requests.get(all_pisa_link)

            # Parse PISA file and save individual XML files.
            # Parse the raw bytes (r.content): lxml raises ValueError for str input that
            # carries an XML encoding declaration, which the PISA response does.
            parser = etree.XMLParser(ns_clean=True)
            tree = etree.fromstring(r.content, parser)

            for pdb in tree.findall('pdb_entry'):
                pdb_code = pdb.find('pdb_code').text
                filename = op.join(outdir, '{}_multimers.pisa.xml'.format(pdb_code))

                # Re-root each entry so every per-PDB file is a valid standalone document
                add_root = etree.Element('pisa_multimers')
                add_root.append(pdb)
                with open(filename, 'wb') as f:
                    f.write(etree.tostring(add_root))

                files[pdb_code] = filename
                log.debug('{}: downloaded PISA results'.format(pdb_code))
    else:
        # Otherwise download one combined XML file per batch of PDB IDs
        split_list = ssbio.utils.split_list_by_n(pdb_ids, 40)

        for l in split_list:
            pdbs = ','.join(l)
            all_pisa_link = 'http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?{}'.format(pdbs)

            filename = op.join(outdir, '{}_multimers.pisa.xml'.format(pdbs))
            if ssbio.utils.force_rerun(flag=force_rerun, outfile=filename):
                r = requests.get(all_pisa_link)
                with open(filename, 'w') as f:
                    f.write(r.text)
                log.debug('Downloaded PISA results')
            else:
                log.debug('PISA results already downloaded')

            # All PDB IDs in this batch share the same combined file
            for x in l:
                files[x] = filename

    return files
def parse_pisa_multimers_xml(pisa_multimers_xml, download_structures=False, outdir=None, force_rerun=False):
    """Retrieve PISA information from an XML results file.

    See: http://www.ebi.ac.uk/pdbe/pisa/pi_download.html for more info

    XML description of macromolecular assemblies:
        http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?pdbcodelist
        where "pdbcodelist" is a comma-separated (strictly no spaces) list of PDB codes. The resulting file contain
        XML output of assembly data, equivalent to that displayed in PISA assembly pages, for each of the specified
        PDB entries.

    Args:
        pisa_multimers_xml (str): Path to PISA XML output file
        download_structures (bool): If assembly files should be downloaded
        outdir (str): Directory to output assembly files
        force_rerun (bool): Redownload files if they already exist

    Returns:
        dict: of parsed PISA information

    """
    if not outdir:
        outdir = os.getcwd()

    xml_parser = etree.XMLParser(ns_clean=True)
    root = etree.parse(pisa_multimers_xml, xml_parser).getroot()

    pisa = defaultdict(dict)

    for pdb in root.findall('pdb_entry'):
        pdb_id = pdb.find('pdb_code').text

        # Entries with these statuses carry no assembly data -- record and skip
        status = pdb.find('status').text
        errors = ['Entry not found', 'Overlapping structures', 'No symmetry operations']
        if status in errors:
            pisa[pdb_id]['status'] = status
            continue

        # A total of zero assemblies means PISA predicts a monomer
        num_complexes = int(pdb.find('total_asm').text)
        if num_complexes == 0:
            pisa[pdb_id]['status'] = 'MONOMER'
            continue
        elif num_complexes > 0:
            # Walk every "assembly set" (see PISA sets for more info)
            for asm_set in pdb.findall('asm_set'):
                set_id = int(asm_set.find('ser_no').text)

                for assembly in asm_set.findall('assembly'):
                    # Actual composition of the predicted complex (chains and ligands).
                    # Ligand "chain IDs" look like "[XYZ]A" -- tag them with a LIG_ prefix.
                    chains = defaultdict(int)
                    for molecule in assembly.findall('molecule'):
                        part_id = molecule.find('chain_id').text
                        if part_id.startswith('['):
                            part_id = 'LIG_' + part_id.split(']')[0].strip('[')
                        chains[str(part_id)] += 1

                    # Pull the ligand entries out into their own mapping
                    # (iterate a copy since we pop from the original)
                    ligands = {}
                    for key in deepcopy(chains).keys():
                        if key.startswith('LIG_'):
                            ligands[str(key.split('_')[1])] = chains.pop(key)

                    cplx_id = int(assembly.find('id').text)
                    d_g_diss = float(assembly.find('diss_energy').text)
                    d_g_int = float(assembly.find('int_energy').text)

                    # A non-negative dissociation energy marks a stable complex
                    info = {
                        'cplx_composition': str(assembly.find('composition').text).strip(),
                        'cplx_chains': chains,
                        'cplx_ligands': ligands,
                        'stable': d_g_diss >= 0,
                        'd_g_diss': d_g_diss,
                        'd_g_int': d_g_int,
                        'pdb_biomol': int(assembly.find('R350').text),
                    }
                    pisa[pdb_id][(set_id, cplx_id)] = info

                    if download_structures:
                        ident = '{}:{},{}'.format(pdb_id, set_id, cplx_id)
                        filename = op.join(outdir, ident + '.pdb')
                        if ssbio.utils.force_rerun(flag=force_rerun, outfile=filename):
                            download_structure_link = 'http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimer.pdb?{}'.format(
                                ident)
                            r = requests.get(download_structure_link)
                            with open(filename, 'w') as f:
                                f.write(r.text)
                            log.debug('{}: downloaded structure file'.format(ident))
                        else:
                            log.debug('{}: structure file already downloaded'.format(ident))
                        pisa[pdb_id][(set_id, cplx_id)]['structure_file'] = filename

    return pisa
def pdb_chain_stoichiometry_biomolone(pdbid):
    """Get the stoichiometry of the chains in biological assembly 1 as a dictionary.

    Steps taken are:

    1) Download PDB and parse header, make biomolecule if provided
    2) Count how many times each chain appears in biomolecule #1
    3) Convert chain id to uniprot id
    4) Return final dictionary

    Args:
        pdbid (str): 4 character PDB ID

    Returns:
        dict: {(ChainID,UniProtID): # occurences}

    """
    # TODO(review): not yet implemented -- currently returns None
    pass