Source code for ssbio.databases.pdb

"""
PDBProp
=======
"""

import gzip
import json
import logging
import os.path as op
import mmtf
import os
from cobra.core import DictList
import pandas as pd
import requests
import deprecation
from Bio.PDB import PDBList
from lxml import etree
from six.moves.urllib_error import URLError
from six.moves.urllib.request import urlopen, urlretrieve

import ssbio.databases.pisa as pisa
import ssbio.utils
from ssbio.protein.structure.structprop import StructProp

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

log = logging.getLogger(__name__)


[docs]class PDBProp(StructProp):

    """Store information about a protein structure from the Protein Data Bank.

    Extends the :class:`~ssbio.protein.structure.structprop.StructProp` class to allow initialization of the structure
    by its PDB ID, and then enabling downloads of the structure file as well as parsing its metadata.

    Args:
        ident (str):
        description (str):
        chains (str):
        mapped_chains (str):
        structure_path (str):
        file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB

    """

    def __init__(self, ident, description=None, chains=None, mapped_chains=None, structure_path=None, file_type=None):
        StructProp.__init__(self, ident, description=description, chains=chains, mapped_chains=mapped_chains,
                            is_experimental=True, structure_path=structure_path, file_type=file_type)
        self.experimental_method = None
        self.resolution = None
        self.date = None
        self.taxonomy_name = None
        self.biological_assemblies = DictList()
        """DictList: A list for storing Bioassembly objects related to this PDB ID"""

[docs]    def download_structure_file(self, outdir, file_type=None, load_header_metadata=True, force_rerun=False):
        """Download a structure file from the PDB, specifying an output directory and a file type. Optionally download
        the mmCIF header file and parse data from it to store within this object.

        Args:
            outdir (str): Path to output directory
            file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
            load_header_metadata (bool): If header metadata should be loaded into this object, fastest with mmtf files
            force_rerun (bool): If structure file should be downloaded even if it already exists

        """
        ssbio.utils.double_check_attribute(object=self, setter=file_type, backup_attribute='file_type',
                                           custom_error_text='Please set file type to be downloaded from the PDB: '
                                                             'pdb, mmCif, xml, or mmtf')

        # XTODO: check if outfile exists using ssbio.utils.force_rerun, pdblist seems to take long if it exists
        p = PDBList()
        with ssbio.utils.suppress_stdout():
            structure_file = p.retrieve_pdb_file(pdb_code=self.id, pdir=outdir, file_format=file_type, overwrite=force_rerun)
        if not op.exists(structure_file):
            log.debug('{}: {} file not available'.format(self.id, file_type))
            raise URLError('{}.{}: file not available to download'.format(self.id, file_type))
        else:
            log.debug('{}: {} file saved'.format(self.id, file_type))

            # Rename .ent files to .pdb
            if file_type == 'pdb':
                new_name = structure_file.replace('pdb', '').replace('ent', 'pdb')
                os.rename(structure_file, new_name)
                structure_file = new_name

            self.load_structure_path(structure_file, file_type)
            if load_header_metadata and file_type == 'mmtf':
                self.update(parse_mmtf_header(structure_file))
            if load_header_metadata and file_type != 'mmtf':
                self.update(parse_mmcif_header(download_mmcif_header(pdb_id=self.id, outdir=outdir, force_rerun=force_rerun)))


    def get_pisa_complex_predictions(self, outdir, existing_pisa_multimer_xml=None):
        if not existing_pisa_multimer_xml:
            pisa_xmls = pisa.download_pisa_multimers_xml(pdb_ids=self.id, outdir=outdir,
                                                         save_single_xml_files=True)
        else:
            pisa_xmls = {}
            pisa_xmls[self.id] = existing_pisa_multimer_xml
        pisa_dict = pisa.parse_pisa_multimers_xml(pisa_xmls[self.id], download_structures=True,
                                                  outdir=outdir)

    def __json_encode__(self):
        # TODO: investigate why saving with # does not work!
        to_return = {}
        for x in self.__dict__.keys():
            if x == 'pdb_title' or x == 'description':
                sanitized = ssbio.utils.force_string(getattr(self, x)).replace('#', '-')
            else:
                to_return.update({x: getattr(self, x)})
        return to_return


[docs]def parse_mmtf_header(infile):
    """Parse an MMTF file and return basic header-like information.

    Args:
        infile (str): Path to MMTF file

    Returns:
        dict: Dictionary of parsed header

    Todo:
        - Can this be sped up by not parsing the 3D coordinate info somehow?
        - OR just store the sequences when this happens since it is already being parsed.

    """
    infodict = {}

    mmtf_decoder = mmtf.parse(infile)
    infodict['date'] = mmtf_decoder.deposition_date
    infodict['release_date'] = mmtf_decoder.release_date
    infodict['experimental_method'] = [x.decode() for x in mmtf_decoder.experimental_methods]
    infodict['resolution'] = mmtf_decoder.resolution
    infodict['description'] = mmtf_decoder.title

    group_name_exclude = ['HOH']
    chem_comp_type_exclude = ['l-peptide linking', 'peptide linking']
    chemicals = list(set([mmtf_decoder.group_list[idx]['groupName'] for idx in mmtf_decoder.group_type_list if mmtf_decoder.group_list[idx]['chemCompType'].lower() not in chem_comp_type_exclude and mmtf_decoder.group_list[idx]['groupName'] not in group_name_exclude]))
    infodict['chemicals'] = chemicals
    return infodict

[docs]def download_mmcif_header(pdb_id, outdir='', force_rerun=False):
    """Download a mmCIF header file from the RCSB PDB by ID.

    Args:
        pdb_id: PDB ID
        outdir: Optional output directory, default is current working directory
        force_rerun: If the file should be downloaded again even if it exists

    Returns:
        str: Path to outfile

    """
    # method in biopython. extra file types have not been added to biopython download yet

    pdb_id = pdb_id.lower()
    file_type = 'cif'
    folder = 'header'
    outfile = op.join(outdir, '{}.header.{}'.format(pdb_id, file_type))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        download_link = 'http://files.rcsb.org/{}/{}.{}'.format(folder, pdb_id, file_type)
        urlretrieve(download_link, outfile)
        log.debug('{}: saved header file'.format(outfile))
    else:
        log.debug('{}: header file already saved'.format(outfile))

    return outfile


[docs]def parse_mmcif_header(infile):
    """Parse a couple important fields from the mmCIF file format with some manual curation of ligands.

    If you want full access to the mmCIF file just use the MMCIF2Dict class in Biopython.

    Args:
        infile: Path to mmCIF file

    Returns:
        dict: Dictionary of parsed header

    """
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    newdict = {}
    try:
        mmdict = MMCIF2Dict(infile)
    except ValueError as e:
        log.exception(e)
        return newdict

    chemical_ids_exclude = ['HOH']
    chemical_types_exclude = ['l-peptide linking','peptide linking']

    if '_struct.title' in mmdict:
        newdict['pdb_title'] = mmdict['_struct.title']
    else:
        log.debug('{}: No title field'.format(infile))

    if '_struct.pdbx_descriptor' in mmdict:
        newdict['description'] = mmdict['_struct.pdbx_descriptor']
    else:
        log.debug('{}: no description field'.format(infile))

    if '_pdbx_database_status.recvd_initial_deposition_date' in mmdict:
        newdict['date'] = mmdict['_pdbx_database_status.recvd_initial_deposition_date']
    elif '_database_PDB_rev.date' in mmdict:
        newdict['date'] = mmdict['_database_PDB_rev.date']
    else:
        log.debug('{}: no date field'.format(infile))

    if '_exptl.method' in mmdict:
        newdict['experimental_method'] = mmdict['_exptl.method']
    else:
        log.debug('{}: no experimental method field'.format(infile))

    # TODO: refactor how to get resolutions based on experimental method
    if '_refine.ls_d_res_high' in mmdict:
        try:
            if isinstance(mmdict['_refine.ls_d_res_high'], list):
                newdict['resolution'] = [float(x) for x in mmdict['_refine.ls_d_res_high']]
            else:
                newdict['resolution'] = float(mmdict['_refine.ls_d_res_high'])
        except:
            try:
                newdict['resolution'] = float(mmdict['_em_3d_reconstruction.resolution'])
            except:
                log.debug('{}: no resolution field'.format(infile))
    else:
        log.debug('{}: no resolution field'.format(infile))

    if '_chem_comp.id' in mmdict:
        chemicals_filtered = ssbio.utils.filter_list_by_indices(mmdict['_chem_comp.id'],
                                                            ssbio.utils.not_find(mmdict['_chem_comp.type'],
                                                                           chemical_types_exclude,
                                                                           case_sensitive=False))
        chemicals_fitered = ssbio.utils.filter_list(chemicals_filtered, chemical_ids_exclude, case_sensitive=True)
        newdict['chemicals'] = chemicals_fitered
    else:
        log.debug('{}: no chemical composition field'.format(infile))

    if '_entity_src_gen.pdbx_gene_src_scientific_name' in mmdict:
        newdict['taxonomy_name'] = mmdict['_entity_src_gen.pdbx_gene_src_scientific_name']
    else:
        log.debug('{}: no organism field'.format(infile))

    return newdict


[docs]def download_sifts_xml(pdb_id, outdir='', force_rerun=False):
    """Download the SIFTS file for a PDB ID.

    Args:
        pdb_id (str): PDB ID
        outdir (str): Output directory, current working directory if not specified.
        force_rerun (bool): If the file should be downloaded again even if it exists

    Returns:
        str: Path to downloaded file

    """
    baseURL = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/'
    filename = '{}.xml.gz'.format(pdb_id.lower())

    outfile = op.join(outdir, filename.split('.')[0] + '.sifts.xml')

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        response = urlopen(baseURL + filename)
        with open(outfile, 'wb') as f:
            f.write(gzip.decompress(response.read()))

    return outfile


[docs]def map_uniprot_resnum_to_pdb(uniprot_resnum, chain_id, sifts_file):
    """Map a UniProt residue number to its corresponding PDB residue number.

    This function requires that the SIFTS file be downloaded,
    and also a chain ID (as different chains may have different mappings).

    Args:
        uniprot_resnum (int): integer of the residue number you'd like to map
        chain_id (str): string of the PDB chain to map to
        sifts_file (str): Path to the SIFTS XML file

    Returns:
        (tuple): tuple containing:

            mapped_resnum (int): Mapped residue number
            is_observed (bool): Indicates if the 3D structure actually shows the residue

    """
    # Load the xml with lxml
    parser = etree.XMLParser(ns_clean=True)
    tree = etree.parse(sifts_file, parser)
    root = tree.getroot()

    my_pdb_resnum = None

    # TODO: "Engineered_Mutation is also a possible annotation, need to figure out what to do with that
    my_pdb_annotation = False

    # Find the right chain (entities in the xml doc)
    ent = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}entity'
    for chain in root.findall(ent):
        # TODO: IMPORTANT - entityId is not the chain ID!!! it is just in alphabetical order!
        if chain.attrib['entityId'] == chain_id:
            # Find the "crossRefDb" tag that has the attributes dbSource="UniProt" and  dbResNum="your_resnum_here"
            # Then match it to the crossRefDb dbResNum that has the attribute dbSource="PDBresnum"

            # Check if uniprot + resnum even exists in the sifts file (it won't if the pdb doesn't contain the residue)
            ures = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb[@dbSource="UniProt"][@dbResNum="%s"]' % uniprot_resnum
            my_uniprot_residue = chain.findall(ures)
            if len(my_uniprot_residue) == 1:
                # Get crossRefDb dbSource="PDB"
                parent = my_uniprot_residue[0].getparent()
                pres = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb[@dbSource="PDB"]'
                my_pdb_residue = parent.findall(pres)
                my_pdb_resnum = int(my_pdb_residue[0].attrib['dbResNum'])

                # Get <residueDetail dbSource="PDBe" property="Annotation">
                # Will be Not_Observed if it is not seen in the PDB
                anno = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}residueDetail[@dbSource="PDBe"][@property="Annotation"]'
                my_pdb_annotation = parent.findall(anno)
                if len(my_pdb_annotation) == 1:
                    my_pdb_annotation = my_pdb_annotation[0].text
                    if my_pdb_annotation == 'Not_Observed':
                        my_pdb_annotation = False
                else:
                    my_pdb_annotation = True
            else:
                return None, False

    return my_pdb_resnum, my_pdb_annotation


[docs]def best_structures(uniprot_id, outname=None, outdir=None, seq_ident_cutoff=0.0, force_rerun=False):
    """Use the PDBe REST service to query for the best PDB structures for a UniProt ID.

    More information found here: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
    Link used to retrieve results: https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/:accession
    The list of PDB structures mapping to a UniProt accession sorted by coverage of the protein and, if the same, resolution.

    Here is the ranking algorithm described by the PDB paper:
    https://nar.oxfordjournals.org/content/44/D1/D385.full

    "Finally, a single quality indicator is also calculated for each entry by taking the harmonic average
    of all the percentile scores representing model and model-data-fit quality measures and then subtracting
    10 times the numerical value of the resolution (in Angstrom) of the entry to ensure that resolution plays
    a role in characterising the quality of a structure. This single empirical 'quality measure' value is used
    by the PDBe query system to sort results and identify the 'best' structure in a given context. At present,
    entries determined by methods other than X-ray crystallography do not have similar data quality information
    available and are not considered as 'best structures'."

    Args:
        uniprot_id (str): UniProt Accession ID
        outname (str): Basename of the output file of JSON results
        outdir (str): Path to output directory of JSON results
        seq_ident_cutoff (float): Cutoff results based on percent coverage (in decimal form)
        force_rerun (bool): Obtain best structures mapping ignoring previously downloaded results

    Returns:
        list: Rank-ordered list of dictionaries representing chain-specific PDB entries. Keys are:
            * pdb_id: the PDB ID which maps to the UniProt ID
            * chain_id: the specific chain of the PDB which maps to the UniProt ID
            * coverage: the percent coverage of the entire UniProt sequence
            * resolution: the resolution of the structure
            * start: the structure residue number which maps to the start of the mapped sequence
            * end: the structure residue number which maps to the end of the mapped sequence
            * unp_start: the sequence residue number which maps to the structure start
            * unp_end: the sequence residue number which maps to the structure end
            * experimental_method: type of experiment used to determine structure
            * tax_id: taxonomic ID of the protein's original organism

    """
    outfile = ''

    if not outdir:
        outdir = ''

    # if output dir is specified but not outname, use the uniprot
    if not outname and outdir:
        outname = uniprot_id

    if outname:
        outname = op.join(outdir, outname)
        outfile = '{}.json'.format(outname)

    # Load a possibly existing json file
    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        with open(outfile, 'r') as f:
            raw_data = json.load(f)
        log.debug('{}: loaded existing json file'.format(uniprot_id))

    # Otherwise run the web request
    else:
        # TODO: add a checker for a cached file of uniprot -> PDBs - can be generated within gempro pipeline and stored
        response = requests.get('https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/{}'.format(uniprot_id),
                                data={'key': 'value'})
        if response.status_code == 404:
            log.debug('{}: 404 returned, probably no structures available.'.format(uniprot_id))
            raw_data = {uniprot_id: {}}
        else:
            log.debug('{}: Obtained best structures'.format(uniprot_id))
            raw_data = response.json()

        # Write the json file if specified
        if outfile:
            with open(outfile, 'w') as f:
                json.dump(raw_data, f)
            log.debug('{}: Saved json file of best structures'.format(uniprot_id))

    data = dict(raw_data)[uniprot_id]

    # Filter for sequence identity percentage
    if seq_ident_cutoff != 0:
        for result in data:
            if result['coverage'] < seq_ident_cutoff:
                data.remove(result)

    return data


[docs]def blast_pdb(seq, outfile='', outdir='', evalue=0.0001, seq_ident_cutoff=0.0, link=False, force_rerun=False):
    """Returns a list of BLAST hits of a sequence to available structures in the PDB.

    Args:
        seq (str): Your sequence, in string format
        outfile (str): Name of output file
        outdir (str, optional): Path to output directory. Default is the current directory.
        evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal, 0.0001 is stringent (default).
        seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
        link (bool, optional): Set to True if a link to the HTML results should be displayed
        force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False

    Returns:
        list: Rank ordered list of BLAST hits in dictionaries.

    """

    if len(seq) < 12:
        raise ValueError('Sequence must be at least 12 residues long.')
    if link:
        page = 'PDB results page: http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat=HTML'.format(seq, evalue)
        print(page)

    parser = etree.XMLParser(ns_clean=True)

    outfile = op.join(outdir, outfile)
    if ssbio.utils.force_rerun(force_rerun, outfile):
        # Load the BLAST XML results if force_rerun=True
        page = 'http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat=XML'.format(
                        seq, evalue)
        req = requests.get(page)
        if req.status_code == 200:
            response = req.text

            # Save the XML file
            if outfile:
                with open(outfile, 'w') as f:
                    f.write(response)

            # Parse the XML string
            tree = etree.ElementTree(etree.fromstring(response, parser))
            log.debug('Loaded BLAST results from REST server')
        else:
            log.error('BLAST request timed out')
            return []
    else:
        tree = etree.parse(outfile, parser)
        log.debug('{}: Loaded existing BLAST XML results'.format(outfile))

    # Get length of original sequence to calculate percentages
    len_orig = float(len(seq))

    root = tree.getroot()
    hit_list = []

    for hit in root.findall('BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
        info = {}

        hitdef = hit.find('Hit_def')
        if hitdef is not None:
            info['hit_pdb'] = hitdef.text.split('|')[0].split(':')[0].lower()
            info['hit_pdb_chains'] = hitdef.text.split('|')[0].split(':')[2].split(',')

        # One PDB can align to different parts of the sequence
        # Will just choose the top hit for this single PDB
        hsp = hit.findall('Hit_hsps/Hsp')[0]

        # Number of identical residues
        hspi = hsp.find('Hsp_identity')
        if hspi is not None:
            info['hit_num_ident'] = int(hspi.text)
            info['hit_percent_ident'] = int(hspi.text)/len_orig

            if int(hspi.text)/len_orig < seq_ident_cutoff:
                log.debug('{}: does not meet sequence identity cutoff'.format(hitdef.text.split('|')[0].split(':')[0]))
                continue

        # Number of similar residues (positive hits)
        hspp = hsp.find('Hsp_positive')
        if hspp is not None:
            info['hit_num_similar'] = int(hspp.text)
            info['hit_percent_similar'] = int(hspp.text) / len_orig

        # Total number of gaps (unable to align in either query or subject)
        hspg = hsp.find('Hsp_gaps')
        if hspg is not None:
            info['hit_num_gaps'] = int(hspg.text)
            info['hit_percent_gaps'] = int(hspg.text) / len_orig

        # E-value of BLAST
        hspe = hsp.find('Hsp_evalue')
        if hspe is not None:
            info['hit_evalue'] = float(hspe.text)

        # Score of BLAST
        hsps = hsp.find('Hsp_score')
        if hsps is not None:
            info['hit_score'] = float(hsps.text)

        hit_list.append(info)

    log.debug("{}: Number of BLAST hits".format(len(hit_list)))
    return hit_list


[docs]def blast_pdb_df(blast_results):
    """Make a dataframe of BLAST results"""
    cols = ['hit_pdb', 'hit_pdb_chains', 'hit_evalue', 'hit_score', 'hit_num_ident', 'hit_percent_ident',
            'hit_num_similar', 'hit_percent_similar', 'hit_num_gaps', 'hit_percent_gaps']
    return pd.DataFrame.from_records(blast_results, columns=cols)


def _property_table():
    """Download the PDB -> resolution table directly from the RCSB PDB REST service.

    See the other fields that you can get here: http://www.rcsb.org/pdb/results/reportField.do

    Returns:
        Pandas DataFrame: table of structureId as the index, resolution and experimentalTechnique as the columns

    """
    url = 'http://www.rcsb.org/pdb/rest/customReport.csv?pdbids=*&customReportColumns=structureId,resolution,experimentalTechnique,releaseDate&service=wsfile&format=csv'
    r = requests.get(url)
    p = pd.read_csv(StringIO(r.text)).set_index('structureId')
    return p


[docs]def get_resolution(pdb_id):
    """Quick way to get the resolution of a PDB ID using the table of results from the REST service

    Returns infinity if the resolution is not available.

    Returns:
        float: resolution of a PDB ID in Angstroms

    TODO:
        - Unit test

    """

    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        resolution = _property_table().ix[pdb_id, 'resolution']
        if pd.isnull(resolution):
            log.debug('{}: no resolution available, probably not an X-ray crystal structure')
            resolution = float('inf')

    return resolution


[docs]def get_release_date(pdb_id):
    """Quick way to get the release date of a PDB ID using the table of results from the REST service

    Returns None if the release date is not available.

    Returns:
        str: Organism of a PDB ID

    """

    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        release_date = _property_table().ix[pdb_id, 'releaseDate']
        if pd.isnull(release_date):
            log.debug('{}: no release date available')
            release_date = None

    return release_date


[docs]def get_num_bioassemblies(pdb_id, cache=False, outdir=None, force_rerun=False):
    """Check if there are bioassemblies using the PDB REST API, and if there are, get the number of bioassemblies
    available.

    See: https://www.rcsb.org/pages/webservices/rest, section 'List biological assemblies'

    Not all PDB entries have biological assemblies available and some have multiple. Details that are necessary to
    recreate a biological assembly from the asymmetric unit can be accessed from the following requests.

    - Number of biological assemblies associated with a PDB entry
    - Access the transformation information needed to generate a biological assembly (nr=0 will return information
      for the asymmetric unit, nr=1 will return information for the first assembly, etc.)

    A query of https://www.rcsb.org/pdb/rest/bioassembly/nrbioassemblies?structureId=1hv4 returns this::

        <nrBioAssemblies structureId="1HV4" hasAssemblies="true" count="2"/>

    Args:
        pdb_id (str): PDB ID
        cache (bool): If the XML file should be downloaded
        outdir (str): If cache, then specify the output directory
        force_rerun (bool): If cache, and if file exists, specify if API should be queried again

    """
    parser = etree.XMLParser(ns_clean=True)

    if not outdir:
        outdir = os.getcwd()
    outfile = op.join(outdir, '{}_nrbiomols.xml'.format(pdb_id))

    if ssbio.utils.force_rerun(force_rerun, outfile):
        page = 'https://www.rcsb.org/pdb/rest/bioassembly/nrbioassemblies?structureId={}'.format(pdb_id)
        req = requests.get(page)

        if req.status_code == 200:
            response = req.text

            # Save the XML file
            if cache:
                with open(outfile, 'w') as f:
                    f.write(response)

            # Parse the XML string
            tree = etree.ElementTree(etree.fromstring(response, parser))
            log.debug('Loaded bioassembly information from REST server')
        else:
            log.error('Request timed out')
            req.raise_for_status()
    else:
        tree = etree.parse(outfile, parser)
        log.debug('{}: Loaded existing XML results'.format(outfile))

    r = tree.getroot()
    has_biomols = r.get('hasAssemblies')
    if has_biomols == 'true':
        has_biomols = True
    else:
        has_biomols = False

    if has_biomols:
        num_biomols = r.get('count')
    else:
        num_biomols = 0

    num_biomols = int(num_biomols)
    return num_biomols


[docs]def get_bioassembly_info(pdb_id, biomol_num, cache=False, outdir=None, force_rerun=False):
    """Get metadata about a bioassembly from the RCSB PDB's REST API.

    See: https://www.rcsb.org/pdb/rest/bioassembly/bioassembly?structureId=1hv4&nr=1
    The API returns an XML file containing the information on a biological assembly that looks like this::

        <bioassembly structureId="1HV4" assemblyNr="1" method="PISA" desc="author_and_software_defined_assembly">
            <transformations operator="1" chainIds="A,B,C,D">
                <transformation index="1">
                    <matrix m11="1.00000000" m12="0.00000000" m13="0.00000000" m21="0.00000000" m22="1.00000000" m23="0.00000000" m31="0.00000000" m32="0.00000000" m33="1.00000000"/>
                    <shift v1="0.00000000" v2="0.00000000" v3="0.00000000"/>
                </transformation>
            </transformations>
        </bioassembly>

    Args:
        pdb_id (str): PDB ID
        biomol_num (int): Biological assembly number you are interested in
        cache (bool): If the XML file should be downloaded
        outdir (str): If cache, then specify the output directory
        force_rerun (bool): If cache, and if file exists, specify if API should be queried again

    """
    parser = etree.XMLParser(ns_clean=True)
    #
    # if not outdir:
    #     outdir = os.getcwd()
    # outfile = op.join(outdir, '{}.xml'.format(self.id))
    #
    # if ssbio.utils.force_rerun(force_rerun, outfile):
    #     page = 'https://www.rcsb.org/pdb/rest/bioassembly/bioassembly?structureId={}&nr={}'.format(
    #         self.original_pdb_id, biomol_num)
    #     req = requests.get(page)
    #
    #     if req.status_code == 200:
    #         response = req.text
    #
    #         # Save the XML file
    #         if cache:
    #             with open(outfile, 'w') as f:
    #                 f.write(response)
    #
    #         # Parse the XML string
    #         r = xmltodict.parse(response)
    #         log.debug('Loaded bioassembly information from REST server')
    #     else:
    #         log.error('Request timed out')
    #         req.raise_for_status()
    # else:
    #     with open(outfile, 'r') as f:
    #         r = xmltodict.parse(f.read())
    #     log.debug('{}: Loaded existing XML results'.format(outfile))
    #
    # self.biomol_to_chain_dict[biomol_num] = {'chains': r['bioassembly']['transformations']['@chainIds'],
    #                                          'multiplier': len(r['bioassembly']['transformations']['transformation'])}
    # # TODO: figure out how to store matrices etc.
    #
    # log.info('{}_{}: ')


def download_biomol(pdb_id, biomol_num, outdir, file_type='pdb', force_rerun=False):
    import zlib
    from six.moves.urllib_error import URLError
    from six.moves.urllib.request import urlopen, urlretrieve
    import contextlib

    ssbio.utils.make_dir(outdir)
    server_folder = pdb_id[1:3]

    if file_type == 'pdb':
        # server = 'ftp://ftp.wwpdb.org/pub/pdb/data/biounit/coordinates/divided/{}/'.format(server_folder)
        server = 'https://files.rcsb.org/download/'
        server_filename = pdb_id + '.pdb%i.gz' % biomol_num
        local_filename = pdb_id + '_bio%i.pdb' % biomol_num
        outfile = op.join(outdir, local_filename)

    elif file_type.lower() == 'mmcif' or file_type.lower() == 'cif':
        server = 'ftp://ftp.wwpdb.org/pub/pdb/data/biounit/mmCIF/divided/{}/'.format(server_folder)
        server_filename = pdb_id + '-assembly%i.cif.gz' % biomol_num
        local_filename = pdb_id + '_bio%i.cif' % biomol_num
        outfile = op.join(outdir, local_filename)

    else:
        raise ValueError('Biological assembly only available in PDB or mmCIF file types.')

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        download_link = op.join(server, server_filename)
        try:
            with contextlib.closing(urlopen(download_link)) as f:
                decompressed_data = zlib.decompress(f.read(), 16 + zlib.MAX_WBITS)
                with open(op.join(outdir, local_filename), 'wb') as f:
                    f.write(decompressed_data)
        except URLError as e:
            print(e)
            return None

    return outfile


########################################################################################################################
########################################################################################################################
# DEPRECATED FUNCTIONS
########################################################################################################################
########################################################################################################################


[docs]@deprecation.deprecated(deprecated_in="1.0", removed_in="2.0",
                        details="Use Biopython's PDBList.retrieve_pdb_file function instead")
def download_structure(pdb_id, file_type, outdir='', only_header=False, force_rerun=False):
    """Download a structure from the RCSB PDB by ID. Specify the file type desired.

    Args:
        pdb_id: PDB ID
        file_type: pdb, pdb.gz, mmcif, cif, cif.gz, xml.gz, mmtf, mmtf.gz
        outdir: Optional output directory
        only_header: If only the header file should be downloaded
        force_rerun: If the file should be downloaded again even if it exists

    Returns:
        str: Path to outfile

    """
    # method in biopython. extra file types have not been added to biopython download yet

    pdb_id = pdb_id.lower()
    file_type = file_type.lower()
    file_types = ['pdb', 'pdb.gz', 'mmcif', 'cif', 'cif.gz', 'xml.gz', 'mmtf', 'mmtf.gz']
    if file_type not in file_types:
        raise ValueError('Invalid file type, must be either: pdb, pdb.gz, cif, cif.gz, xml.gz, mmtf, mmtf.gz')

    if file_type == 'mmtf':
        file_type = 'mmtf.gz'

    if file_type.endswith('.gz'):
        gzipped = True
    else:
        gzipped = False

    if file_type == 'mmcif':
        file_type = 'cif'

    if only_header:
        folder = 'header'
        outfile = op.join(outdir, '{}.header.{}'.format(pdb_id, file_type))
    else:
        folder = 'download'
        outfile = op.join(outdir, '{}.{}'.format(pdb_id, file_type))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        if file_type == 'mmtf.gz' or file_type == 'mmtf':
            mmtf_api = '1.0'
            download_link = 'http://mmtf.rcsb.org/v{}/full/{}.mmtf.gz'.format(mmtf_api, pdb_id)
        else:
            download_link = 'http://files.rcsb.org/{}/{}.{}'.format(folder, pdb_id, file_type)

        urlretrieve(download_link, outfile)

        if gzipped:
            outfile = ssbio.utils.gunzip_file(infile=outfile,
                                              outfile=outfile.strip('.gz'),
                                              outdir=outdir,
                                              delete_original=False,
                                              force_rerun_flag=force_rerun)

        log.debug('{}: saved structure file'.format(outfile))
    else:
        if file_type == 'mmtf.gz':
            outfile = op.join(outdir, '{}.{}'.format(pdb_id, 'mmtf'))
        log.debug('{}: structure file already saved'.format(outfile))

    return outfile