"""
PISA
===========
"""
import requests
from collections import defaultdict
from copy import deepcopy
import ssbio.utils
import os
import os.path as op
from lxml import etree
import logging
import glob
log = logging.getLogger(__name__)
def download_pisa_multimers_xml(pdb_ids, save_single_xml_files=True, outdir=None, force_rerun=False):
    """Download the PISA XML file for multimers.

    See: http://www.ebi.ac.uk/pdbe/pisa/pi_download.html for more info

    XML description of macromolecular assemblies:
        http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?pdbcodelist
    where "pdbcodelist" is a comma-separated (strictly no spaces) list of PDB codes. The resulting file contains XML
    output of assembly data, equivalent to that displayed in PISA assembly pages, for each of the specified PDB
    entries. NOTE: If a mass-download is intended, please minimize the number of retrievals by specifying as many
    PDB codes in the URL as feasible (20-50 is a good range), and never send another URL request until the previous
    one has been completed (meaning that the multimers.pisa file has been downloaded). Excessive requests will
    silently die in the server queue.

    Args:
        pdb_ids (str, list): PDB ID or list of IDs
        save_single_xml_files (bool): If single XML files should be saved per PDB ID. If False, one combined XML
            file is saved per request batch, named after the comma-joined PDB IDs of that batch
        outdir (str): Directory to output PISA XML files (defaults to the current working directory)
        force_rerun (bool): Redownload files if they already exist

    Returns:
        dict: Lowercased PDB ID -> path to the XML file holding its PISA results. In single-file mode without
        ``force_rerun``, every ``*_multimers.pisa.xml`` file already present in ``outdir`` is included as well,
        even for IDs that were not requested in this call.

    Raises:
        requests.exceptions.HTTPError: If the PISA server answers with an HTTP error status

    """
    # Requests are sent in batches of this many IDs to limit the load on the PISA server
    batch_size = 40
    url_template = 'http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?{}'

    if not outdir:
        outdir = os.getcwd()

    # A bare string must be wrapped before sorting, otherwise sorted() would split it into single characters
    if isinstance(pdb_ids, str):
        pdb_ids = [pdb_ids]
    pdb_ids = ssbio.utils.force_lower_list(sorted(pdb_ids))

    files = {}

    # If we want to save single PISA XML files per PDB ID...
    if save_single_xml_files:
        # Check for existing PISA XML files
        if force_rerun:
            existing_files = []
        else:
            existing_files = [op.basename(x) for x in glob.glob(op.join(outdir, '*_multimers.pisa.xml'))]
            # Store the paths to these files to return
            files = {v.split('_')[0]: op.join(outdir, v) for v in existing_files}
            log.debug('Already downloaded PISA files for {}'.format(list(files)))

        # Only request the PDB IDs that do not have a file yet
        already_there = set(existing_files)
        pdb_ids = [x for x in pdb_ids if '{}_multimers.pisa.xml'.format(x) not in already_there]

        parser = etree.XMLParser(ns_clean=True)
        for batch in ssbio.utils.split_list_by_n(pdb_ids, batch_size):
            r = requests.get(url_template.format(','.join(batch)))
            # Fail loudly instead of trying to parse an HTTP error page
            r.raise_for_status()

            # Parse the raw bytes: lxml refuses unicode strings that carry an XML encoding declaration
            tree = etree.fromstring(r.content, parser)

            # Save each <pdb_entry> in its own file, wrapped in a fresh root element
            for pdb in tree.findall('pdb_entry'):
                pdb_code = pdb.find('pdb_code').text
                filename = op.join(outdir, '{}_multimers.pisa.xml'.format(pdb_code))

                add_root = etree.Element('pisa_multimers')
                add_root.append(pdb)
                with open(filename, 'wb') as f:
                    f.write(etree.tostring(add_root))

                files[pdb_code] = filename
                log.debug('{}: downloaded PISA results'.format(pdb_code))

    else:
        for batch in ssbio.utils.split_list_by_n(pdb_ids, batch_size):
            pdbs = ','.join(batch)
            filename = op.join(outdir, '{}_multimers.pisa.xml'.format(pdbs))

            if ssbio.utils.force_rerun(flag=force_rerun, outfile=filename):
                r = requests.get(url_template.format(pdbs))
                # Do not cache an HTTP error page as if it were a valid result file
                r.raise_for_status()
                # Write the server's bytes untouched so the XML keeps its original encoding
                with open(filename, 'wb') as f:
                    f.write(r.content)
                log.debug('Downloaded PISA results')
            else:
                log.debug('PISA results already downloaded')

            # Every ID of the batch points at the same combined file
            for x in batch:
                files[x] = filename

    return files
def parse_pisa_multimers_xml(pisa_multimers_xml, download_structures=False, outdir=None, force_rerun=False):
    """Retrieve PISA information from an XML results file.

    See: http://www.ebi.ac.uk/pdbe/pisa/pi_download.html for more info

    The XML file is the description of macromolecular assemblies served at:
        http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimers.pisa?pdbcodelist
    where "pdbcodelist" is a comma-separated (strictly no spaces) list of PDB codes.

    Args:
        pisa_multimers_xml (str): Path to PISA XML output file
        download_structures (bool): If assembly files should be downloaded (one request per assembly)
        outdir (str): Directory to output assembly files (defaults to the current working directory)
        force_rerun (bool): Redownload files if they already exist

    Returns:
        dict: PDB ID -> parsed PISA information. For an entry without assemblies the inner dict only holds
        ``'status'`` (one of the PISA error strings, or ``'MONOMER'`` when ``total_asm`` is 0). Otherwise it is
        keyed by ``(set_id, cplx_id)`` tuples, each mapping to a dict with the keys ``cplx_composition``,
        ``cplx_chains``, ``cplx_ligands``, ``stable``, ``d_g_diss``, ``d_g_int``, ``pdb_biomol`` and, when
        ``download_structures`` is set, ``structure_file``.

    Raises:
        requests.exceptions.HTTPError: If a structure download is answered with an HTTP error status

    """
    if not outdir:
        outdir = os.getcwd()

    parser = etree.XMLParser(ns_clean=True)
    tree = etree.parse(pisa_multimers_xml, parser)
    root = tree.getroot()

    # Status strings for which no assembly information is parsed
    errors = ('Entry not found', 'Overlapping structures', 'No symmetry operations')

    pisa = defaultdict(dict)

    for pdb in root.findall('pdb_entry'):
        # Get the PDB ID
        pdb_id = pdb.find('pdb_code').text

        # Check the assembly status
        status = pdb.find('status').text
        if status in errors:
            pisa[pdb_id]['status'] = status
            continue

        # Check monomer status
        num_complexes = int(pdb.find('total_asm').text)
        if num_complexes == 0:
            pisa[pdb_id]['status'] = 'MONOMER'
            continue
        if num_complexes < 0:
            # A negative count is not meaningful; as before, nothing is recorded for such an entry
            continue

        # All "assembly sets" (see PISA sets for more info)
        for s in pdb.findall('asm_set'):
            set_id = int(s.find('ser_no').text)

            # All assemblies
            for cplx in s.findall('assembly'):
                # Actual composition of the predicted complex: how often each chain and each ligand occurs.
                # Ligand entries have a chain_id of the form "[NAME]..."; they are counted separately
                # under the full bracketed name.
                chains = defaultdict(int)
                ligand_counts = defaultdict(int)
                for part in cplx.findall('molecule'):
                    part_id = part.find('chain_id').text
                    if part_id.startswith('['):
                        ligand_counts[str(part_id.split(']')[0].strip('['))] += 1
                    else:
                        chains[str(part_id)] += 1
                ligands = dict(ligand_counts)

                cplx_id = int(cplx.find('id').text)
                cplx_composition = str(cplx.find('composition').text)
                d_g_diss = float(cplx.find('diss_energy').text)
                d_g_int = float(cplx.find('int_energy').text)
                pdb_biomol = int(cplx.find('R350').text)

                pisa[pdb_id][(set_id, cplx_id)] = {
                    'cplx_composition': cplx_composition.strip(),
                    'cplx_chains': chains,
                    'cplx_ligands': ligands,
                    # An assembly counts as stable when its dissociation energy is not negative
                    'stable': d_g_diss >= 0,
                    'd_g_diss': d_g_diss,
                    'd_g_int': d_g_int,
                    'pdb_biomol': pdb_biomol,
                }

                if download_structures:
                    ident = '{}:{},{}'.format(pdb_id, set_id, cplx_id)
                    # NOTE(review): ':' is not a valid file name character on Windows; name kept unchanged
                    # for backward compatibility with existing output directories
                    filename = op.join(outdir, ident + '.pdb')

                    if ssbio.utils.force_rerun(flag=force_rerun, outfile=filename):
                        download_structure_link = 'http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/multimer.pdb?{}'.format(
                            ident)
                        r = requests.get(download_structure_link)
                        # Do not cache an HTTP error page as if it were a structure file
                        r.raise_for_status()
                        # Write the server's bytes untouched (no re-encoding or newline translation)
                        with open(filename, 'wb') as f:
                            f.write(r.content)
                        log.debug('{}: downloaded structure file'.format(ident))
                    else:
                        log.debug('{}: structure file already downloaded'.format(ident))

                    pisa[pdb_id][(set_id, cplx_id)]['structure_file'] = filename

    return pisa
def pdb_chain_stoichiometry_biomolone(pdbid):
    """Get the stoichiometry of the chains in biological assembly 1 as a dictionary.

    NOTE: This function is a placeholder. It is not implemented yet and currently always returns ``None``.

    Steps intended to be taken are:
        1) Download PDB and parse header, make biomolecule if provided
        2) Count how many times each chain appears in biomolecule #1
        3) Convert chain id to uniprot id
        4) Return final dictionary

    Args:
        pdbid (str): 4 character PDB ID

    Returns:
        dict: {(ChainID,UniProtID): # occurrences} once implemented; ``None`` for now

    """
    return None