"""
PDBProp
=======
"""
import gzip
import json
import logging
import os.path as op
import mmtf
import os
from cobra.core import DictList
import pandas as pd
import requests
import deprecation
from Bio.PDB import PDBList
from lxml import etree
from six.moves.urllib_error import URLError
from six.moves.urllib.request import urlopen, urlretrieve
import ssbio.databases.pisa as pisa
import ssbio.utils
from ssbio.protein.structure.structprop import StructProp
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
log = logging.getLogger(__name__)
[docs]class PDBProp(StructProp):
"""Store information about a protein structure from the Protein Data Bank.
Extends the :class:`~ssbio.protein.structure.structprop.StructProp` class to allow initialization of the structure
by its PDB ID, and then enabling downloads of the structure file as well as parsing its metadata.
Args:
ident (str):
description (str):
chains (str):
mapped_chains (str):
structure_path (str):
file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
"""
def __init__(self, ident, description=None, chains=None, mapped_chains=None, structure_path=None, file_type=None):
StructProp.__init__(self, ident, description=description, chains=chains, mapped_chains=mapped_chains,
is_experimental=True, structure_path=structure_path, file_type=file_type)
self.experimental_method = None
self.resolution = None
self.date = None
self.taxonomy_name = None
self.biological_assemblies = DictList()
"""DictList: A list for storing Bioassembly objects related to this PDB ID"""
[docs] def download_structure_file(self, outdir, file_type=None, load_header_metadata=True, force_rerun=False):
"""Download a structure file from the PDB, specifying an output directory and a file type. Optionally download
the mmCIF header file and parse data from it to store within this object.
Args:
outdir (str): Path to output directory
file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
load_header_metadata (bool): If header metadata should be loaded into this object, fastest with mmtf files
force_rerun (bool): If structure file should be downloaded even if it already exists
"""
ssbio.utils.double_check_attribute(object=self, setter=file_type, backup_attribute='file_type',
custom_error_text='Please set file type to be downloaded from the PDB: '
'pdb, mmCif, xml, or mmtf')
# XTODO: check if outfile exists using ssbio.utils.force_rerun, pdblist seems to take long if it exists
p = PDBList()
with ssbio.utils.suppress_stdout():
structure_file = p.retrieve_pdb_file(pdb_code=self.id, pdir=outdir, file_format=file_type, overwrite=force_rerun)
if not op.exists(structure_file):
log.debug('{}: {} file not available'.format(self.id, file_type))
raise URLError('{}.{}: file not available to download'.format(self.id, file_type))
else:
log.debug('{}: {} file saved'.format(self.id, file_type))
# Rename .ent files to .pdb
if file_type == 'pdb':
new_name = structure_file.replace('pdb', '').replace('ent', 'pdb')
os.rename(structure_file, new_name)
structure_file = new_name
self.load_structure_path(structure_file, file_type)
if load_header_metadata and file_type == 'mmtf':
self.update(parse_mmtf_header(structure_file))
if load_header_metadata and file_type != 'mmtf':
self.update(parse_mmcif_header(download_mmcif_header(pdb_id=self.id, outdir=outdir, force_rerun=force_rerun)))
def get_pisa_complex_predictions(self, outdir, existing_pisa_multimer_xml=None):
if not existing_pisa_multimer_xml:
pisa_xmls = pisa.download_pisa_multimers_xml(pdb_ids=self.id, outdir=outdir,
save_single_xml_files=True)
else:
pisa_xmls = {}
pisa_xmls[self.id] = existing_pisa_multimer_xml
pisa_dict = pisa.parse_pisa_multimers_xml(pisa_xmls[self.id], download_structures=True,
outdir=outdir)
def __json_encode__(self):
# TODO: investigate why saving with # does not work!
to_return = {}
for x in self.__dict__.keys():
if x == 'pdb_title' or x == 'description':
sanitized = ssbio.utils.force_string(getattr(self, x)).replace('#', '-')
else:
to_return.update({x: getattr(self, x)})
return to_return
return infodict
return outfile
return newdict
[docs]def download_sifts_xml(pdb_id, outdir='', force_rerun=False):
"""Download the SIFTS file for a PDB ID.
Args:
pdb_id (str): PDB ID
outdir (str): Output directory, current working directory if not specified.
force_rerun (bool): If the file should be downloaded again even if it exists
Returns:
str: Path to downloaded file
"""
baseURL = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/'
filename = '{}.xml.gz'.format(pdb_id.lower())
outfile = op.join(outdir, filename.split('.')[0] + '.sifts.xml')
if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
response = urlopen(baseURL + filename)
with open(outfile, 'wb') as f:
f.write(gzip.decompress(response.read()))
return outfile
[docs]def map_uniprot_resnum_to_pdb(uniprot_resnum, chain_id, sifts_file):
"""Map a UniProt residue number to its corresponding PDB residue number.
This function requires that the SIFTS file be downloaded,
and also a chain ID (as different chains may have different mappings).
Args:
uniprot_resnum (int): integer of the residue number you'd like to map
chain_id (str): string of the PDB chain to map to
sifts_file (str): Path to the SIFTS XML file
Returns:
(tuple): tuple containing:
mapped_resnum (int): Mapped residue number
is_observed (bool): Indicates if the 3D structure actually shows the residue
"""
# Load the xml with lxml
parser = etree.XMLParser(ns_clean=True)
tree = etree.parse(sifts_file, parser)
root = tree.getroot()
my_pdb_resnum = None
# TODO: "Engineered_Mutation is also a possible annotation, need to figure out what to do with that
my_pdb_annotation = False
# Find the right chain (entities in the xml doc)
ent = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}entity'
for chain in root.findall(ent):
# TODO: IMPORTANT - entityId is not the chain ID!!! it is just in alphabetical order!
if chain.attrib['entityId'] == chain_id:
# Find the "crossRefDb" tag that has the attributes dbSource="UniProt" and dbResNum="your_resnum_here"
# Then match it to the crossRefDb dbResNum that has the attribute dbSource="PDBresnum"
# Check if uniprot + resnum even exists in the sifts file (it won't if the pdb doesn't contain the residue)
ures = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb[@dbSource="UniProt"][@dbResNum="%s"]' % uniprot_resnum
my_uniprot_residue = chain.findall(ures)
if len(my_uniprot_residue) == 1:
# Get crossRefDb dbSource="PDB"
parent = my_uniprot_residue[0].getparent()
pres = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb[@dbSource="PDB"]'
my_pdb_residue = parent.findall(pres)
my_pdb_resnum = int(my_pdb_residue[0].attrib['dbResNum'])
# Get <residueDetail dbSource="PDBe" property="Annotation">
# Will be Not_Observed if it is not seen in the PDB
anno = './/{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}residueDetail[@dbSource="PDBe"][@property="Annotation"]'
my_pdb_annotation = parent.findall(anno)
if len(my_pdb_annotation) == 1:
my_pdb_annotation = my_pdb_annotation[0].text
if my_pdb_annotation == 'Not_Observed':
my_pdb_annotation = False
else:
my_pdb_annotation = True
else:
return None, False
return my_pdb_resnum, my_pdb_annotation
[docs]def best_structures(uniprot_id, outname=None, outdir=None, seq_ident_cutoff=0.0, force_rerun=False):
"""Use the PDBe REST service to query for the best PDB structures for a UniProt ID.
More information found here: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
Link used to retrieve results: https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/:accession
The list of PDB structures mapping to a UniProt accession sorted by coverage of the protein and, if the same, resolution.
Here is the ranking algorithm described by the PDB paper:
https://nar.oxfordjournals.org/content/44/D1/D385.full
"Finally, a single quality indicator is also calculated for each entry by taking the harmonic average
of all the percentile scores representing model and model-data-fit quality measures and then subtracting
10 times the numerical value of the resolution (in Angstrom) of the entry to ensure that resolution plays
a role in characterising the quality of a structure. This single empirical 'quality measure' value is used
by the PDBe query system to sort results and identify the 'best' structure in a given context. At present,
entries determined by methods other than X-ray crystallography do not have similar data quality information
available and are not considered as 'best structures'."
Args:
uniprot_id (str): UniProt Accession ID
outname (str): Basename of the output file of JSON results
outdir (str): Path to output directory of JSON results
seq_ident_cutoff (float): Cutoff results based on percent coverage (in decimal form)
force_rerun (bool): Obtain best structures mapping ignoring previously downloaded results
Returns:
list: Rank-ordered list of dictionaries representing chain-specific PDB entries. Keys are:
* pdb_id: the PDB ID which maps to the UniProt ID
* chain_id: the specific chain of the PDB which maps to the UniProt ID
* coverage: the percent coverage of the entire UniProt sequence
* resolution: the resolution of the structure
* start: the structure residue number which maps to the start of the mapped sequence
* end: the structure residue number which maps to the end of the mapped sequence
* unp_start: the sequence residue number which maps to the structure start
* unp_end: the sequence residue number which maps to the structure end
* experimental_method: type of experiment used to determine structure
* tax_id: taxonomic ID of the protein's original organism
"""
outfile = ''
if not outdir:
outdir = ''
# if output dir is specified but not outname, use the uniprot
if not outname and outdir:
outname = uniprot_id
if outname:
outname = op.join(outdir, outname)
outfile = '{}.json'.format(outname)
# Load a possibly existing json file
if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
with open(outfile, 'r') as f:
raw_data = json.load(f)
log.debug('{}: loaded existing json file'.format(uniprot_id))
# Otherwise run the web request
else:
# TODO: add a checker for a cached file of uniprot -> PDBs - can be generated within gempro pipeline and stored
response = requests.get('https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/{}'.format(uniprot_id),
data={'key': 'value'})
if response.status_code == 404:
log.debug('{}: 404 returned, probably no structures available.'.format(uniprot_id))
raw_data = {uniprot_id: {}}
else:
log.debug('{}: Obtained best structures'.format(uniprot_id))
raw_data = response.json()
# Write the json file if specified
if outfile:
with open(outfile, 'w') as f:
json.dump(raw_data, f)
log.debug('{}: Saved json file of best structures'.format(uniprot_id))
data = dict(raw_data)[uniprot_id]
# Filter for sequence identity percentage
if seq_ident_cutoff != 0:
for result in data:
if result['coverage'] < seq_ident_cutoff:
data.remove(result)
return data
[docs]def blast_pdb(seq, outfile='', outdir='', evalue=0.0001, seq_ident_cutoff=0.0, link=False, force_rerun=False):
"""Returns a list of BLAST hits of a sequence to available structures in the PDB.
Args:
seq (str): Your sequence, in string format
outfile (str): Name of output file
outdir (str, optional): Path to output directory. Default is the current directory.
evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal, 0.0001 is stringent (default).
seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
link (bool, optional): Set to True if a link to the HTML results should be displayed
force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False
Returns:
list: Rank ordered list of BLAST hits in dictionaries.
"""
if len(seq) < 12:
raise ValueError('Sequence must be at least 12 residues long.')
if link:
page = 'PDB results page: http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat=HTML'.format(seq, evalue)
print(page)
parser = etree.XMLParser(ns_clean=True)
outfile = op.join(outdir, outfile)
if ssbio.utils.force_rerun(force_rerun, outfile):
# Load the BLAST XML results if force_rerun=True
page = 'http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat=XML'.format(
seq, evalue)
req = requests.get(page)
if req.status_code == 200:
response = req.text
# Save the XML file
if outfile:
with open(outfile, 'w') as f:
f.write(response)
# Parse the XML string
tree = etree.ElementTree(etree.fromstring(response, parser))
log.debug('Loaded BLAST results from REST server')
else:
log.error('BLAST request timed out')
return []
else:
tree = etree.parse(outfile, parser)
log.debug('{}: Loaded existing BLAST XML results'.format(outfile))
# Get length of original sequence to calculate percentages
len_orig = float(len(seq))
root = tree.getroot()
hit_list = []
for hit in root.findall('BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
info = {}
hitdef = hit.find('Hit_def')
if hitdef is not None:
info['hit_pdb'] = hitdef.text.split('|')[0].split(':')[0].lower()
info['hit_pdb_chains'] = hitdef.text.split('|')[0].split(':')[2].split(',')
# One PDB can align to different parts of the sequence
# Will just choose the top hit for this single PDB
hsp = hit.findall('Hit_hsps/Hsp')[0]
# Number of identical residues
hspi = hsp.find('Hsp_identity')
if hspi is not None:
info['hit_num_ident'] = int(hspi.text)
info['hit_percent_ident'] = int(hspi.text)/len_orig
if int(hspi.text)/len_orig < seq_ident_cutoff:
log.debug('{}: does not meet sequence identity cutoff'.format(hitdef.text.split('|')[0].split(':')[0]))
continue
# Number of similar residues (positive hits)
hspp = hsp.find('Hsp_positive')
if hspp is not None:
info['hit_num_similar'] = int(hspp.text)
info['hit_percent_similar'] = int(hspp.text) / len_orig
# Total number of gaps (unable to align in either query or subject)
hspg = hsp.find('Hsp_gaps')
if hspg is not None:
info['hit_num_gaps'] = int(hspg.text)
info['hit_percent_gaps'] = int(hspg.text) / len_orig
# E-value of BLAST
hspe = hsp.find('Hsp_evalue')
if hspe is not None:
info['hit_evalue'] = float(hspe.text)
# Score of BLAST
hsps = hsp.find('Hsp_score')
if hsps is not None:
info['hit_score'] = float(hsps.text)
hit_list.append(info)
log.debug("{}: Number of BLAST hits".format(len(hit_list)))
return hit_list
[docs]def blast_pdb_df(blast_results):
"""Make a dataframe of BLAST results"""
cols = ['hit_pdb', 'hit_pdb_chains', 'hit_evalue', 'hit_score', 'hit_num_ident', 'hit_percent_ident',
'hit_num_similar', 'hit_percent_similar', 'hit_num_gaps', 'hit_percent_gaps']
return pd.DataFrame.from_records(blast_results, columns=cols)
def _property_table():
"""Download the PDB -> resolution table directly from the RCSB PDB REST service.
See the other fields that you can get here: http://www.rcsb.org/pdb/results/reportField.do
Returns:
Pandas DataFrame: table of structureId as the index, resolution and experimentalTechnique as the columns
"""
url = 'http://www.rcsb.org/pdb/rest/customReport.csv?pdbids=*&customReportColumns=structureId,resolution,experimentalTechnique,releaseDate&service=wsfile&format=csv'
r = requests.get(url)
p = pd.read_csv(StringIO(r.text)).set_index('structureId')
return p
[docs]def get_resolution(pdb_id):
"""Quick way to get the resolution of a PDB ID using the table of results from the REST service
Returns infinity if the resolution is not available.
Returns:
float: resolution of a PDB ID in Angstroms
TODO:
- Unit test
"""
pdb_id = pdb_id.upper()
if pdb_id not in _property_table().index:
raise ValueError('PDB ID not in property table')
else:
resolution = _property_table().ix[pdb_id, 'resolution']
if pd.isnull(resolution):
log.debug('{}: no resolution available, probably not an X-ray crystal structure')
resolution = float('inf')
return resolution
[docs]def get_release_date(pdb_id):
"""Quick way to get the release date of a PDB ID using the table of results from the REST service
Returns None if the release date is not available.
Returns:
str: Organism of a PDB ID
"""
pdb_id = pdb_id.upper()
if pdb_id not in _property_table().index:
raise ValueError('PDB ID not in property table')
else:
release_date = _property_table().ix[pdb_id, 'releaseDate']
if pd.isnull(release_date):
log.debug('{}: no release date available')
release_date = None
return release_date
[docs]def get_num_bioassemblies(pdb_id, cache=False, outdir=None, force_rerun=False):
"""Check if there are bioassemblies using the PDB REST API, and if there are, get the number of bioassemblies
available.
See: https://www.rcsb.org/pages/webservices/rest, section 'List biological assemblies'
Not all PDB entries have biological assemblies available and some have multiple. Details that are necessary to
recreate a biological assembly from the asymmetric unit can be accessed from the following requests.
- Number of biological assemblies associated with a PDB entry
- Access the transformation information needed to generate a biological assembly (nr=0 will return information
for the asymmetric unit, nr=1 will return information for the first assembly, etc.)
A query of https://www.rcsb.org/pdb/rest/bioassembly/nrbioassemblies?structureId=1hv4 returns this::
<nrBioAssemblies structureId="1HV4" hasAssemblies="true" count="2"/>
Args:
pdb_id (str): PDB ID
cache (bool): If the XML file should be downloaded
outdir (str): If cache, then specify the output directory
force_rerun (bool): If cache, and if file exists, specify if API should be queried again
"""
parser = etree.XMLParser(ns_clean=True)
if not outdir:
outdir = os.getcwd()
outfile = op.join(outdir, '{}_nrbiomols.xml'.format(pdb_id))
if ssbio.utils.force_rerun(force_rerun, outfile):
page = 'https://www.rcsb.org/pdb/rest/bioassembly/nrbioassemblies?structureId={}'.format(pdb_id)
req = requests.get(page)
if req.status_code == 200:
response = req.text
# Save the XML file
if cache:
with open(outfile, 'w') as f:
f.write(response)
# Parse the XML string
tree = etree.ElementTree(etree.fromstring(response, parser))
log.debug('Loaded bioassembly information from REST server')
else:
log.error('Request timed out')
req.raise_for_status()
else:
tree = etree.parse(outfile, parser)
log.debug('{}: Loaded existing XML results'.format(outfile))
r = tree.getroot()
has_biomols = r.get('hasAssemblies')
if has_biomols == 'true':
has_biomols = True
else:
has_biomols = False
if has_biomols:
num_biomols = r.get('count')
else:
num_biomols = 0
num_biomols = int(num_biomols)
return num_biomols
[docs]def get_bioassembly_info(pdb_id, biomol_num, cache=False, outdir=None, force_rerun=False):
"""Get metadata about a bioassembly from the RCSB PDB's REST API.
See: https://www.rcsb.org/pdb/rest/bioassembly/bioassembly?structureId=1hv4&nr=1
The API returns an XML file containing the information on a biological assembly that looks like this::
<bioassembly structureId="1HV4" assemblyNr="1" method="PISA" desc="author_and_software_defined_assembly">
<transformations operator="1" chainIds="A,B,C,D">
<transformation index="1">
<matrix m11="1.00000000" m12="0.00000000" m13="0.00000000" m21="0.00000000" m22="1.00000000" m23="0.00000000" m31="0.00000000" m32="0.00000000" m33="1.00000000"/>
<shift v1="0.00000000" v2="0.00000000" v3="0.00000000"/>
</transformation>
</transformations>
</bioassembly>
Args:
pdb_id (str): PDB ID
biomol_num (int): Biological assembly number you are interested in
cache (bool): If the XML file should be downloaded
outdir (str): If cache, then specify the output directory
force_rerun (bool): If cache, and if file exists, specify if API should be queried again
"""
parser = etree.XMLParser(ns_clean=True)
#
# if not outdir:
# outdir = os.getcwd()
# outfile = op.join(outdir, '{}.xml'.format(self.id))
#
# if ssbio.utils.force_rerun(force_rerun, outfile):
# page = 'https://www.rcsb.org/pdb/rest/bioassembly/bioassembly?structureId={}&nr={}'.format(
# self.original_pdb_id, biomol_num)
# req = requests.get(page)
#
# if req.status_code == 200:
# response = req.text
#
# # Save the XML file
# if cache:
# with open(outfile, 'w') as f:
# f.write(response)
#
# # Parse the XML string
# r = xmltodict.parse(response)
# log.debug('Loaded bioassembly information from REST server')
# else:
# log.error('Request timed out')
# req.raise_for_status()
# else:
# with open(outfile, 'r') as f:
# r = xmltodict.parse(f.read())
# log.debug('{}: Loaded existing XML results'.format(outfile))
#
# self.biomol_to_chain_dict[biomol_num] = {'chains': r['bioassembly']['transformations']['@chainIds'],
# 'multiplier': len(r['bioassembly']['transformations']['transformation'])}
# # TODO: figure out how to store matrices etc.
#
# log.info('{}_{}: ')
def download_biomol(pdb_id, biomol_num, outdir, file_type='pdb', force_rerun=False):
import zlib
from six.moves.urllib_error import URLError
from six.moves.urllib.request import urlopen, urlretrieve
import contextlib
ssbio.utils.make_dir(outdir)
server_folder = pdb_id[1:3]
if file_type == 'pdb':
# server = 'ftp://ftp.wwpdb.org/pub/pdb/data/biounit/coordinates/divided/{}/'.format(server_folder)
server = 'https://files.rcsb.org/download/'
server_filename = pdb_id + '.pdb%i.gz' % biomol_num
local_filename = pdb_id + '_bio%i.pdb' % biomol_num
outfile = op.join(outdir, local_filename)
elif file_type.lower() == 'mmcif' or file_type.lower() == 'cif':
server = 'ftp://ftp.wwpdb.org/pub/pdb/data/biounit/mmCIF/divided/{}/'.format(server_folder)
server_filename = pdb_id + '-assembly%i.cif.gz' % biomol_num
local_filename = pdb_id + '_bio%i.cif' % biomol_num
outfile = op.join(outdir, local_filename)
else:
raise ValueError('Biological assembly only available in PDB or mmCIF file types.')
if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
download_link = op.join(server, server_filename)
try:
with contextlib.closing(urlopen(download_link)) as f:
decompressed_data = zlib.decompress(f.read(), 16 + zlib.MAX_WBITS)
with open(op.join(outdir, local_filename), 'wb') as f:
f.write(decompressed_data)
except URLError as e:
print(e)
return None
return outfile
########################################################################################################################
########################################################################################################################
# DEPRECATED FUNCTIONS
########################################################################################################################
########################################################################################################################
[docs]@deprecation.deprecated(deprecated_in="1.0", removed_in="2.0",
details="Use Biopython's PDBList.retrieve_pdb_file function instead")
def download_structure(pdb_id, file_type, outdir='', only_header=False, force_rerun=False):
"""Download a structure from the RCSB PDB by ID. Specify the file type desired.
Args:
pdb_id: PDB ID
file_type: pdb, pdb.gz, mmcif, cif, cif.gz, xml.gz, mmtf, mmtf.gz
outdir: Optional output directory
only_header: If only the header file should be downloaded
force_rerun: If the file should be downloaded again even if it exists
Returns:
str: Path to outfile
"""
# method in biopython. extra file types have not been added to biopython download yet
pdb_id = pdb_id.lower()
file_type = file_type.lower()
file_types = ['pdb', 'pdb.gz', 'mmcif', 'cif', 'cif.gz', 'xml.gz', 'mmtf', 'mmtf.gz']
if file_type not in file_types:
raise ValueError('Invalid file type, must be either: pdb, pdb.gz, cif, cif.gz, xml.gz, mmtf, mmtf.gz')
if file_type == 'mmtf':
file_type = 'mmtf.gz'
if file_type.endswith('.gz'):
gzipped = True
else:
gzipped = False
if file_type == 'mmcif':
file_type = 'cif'
if only_header:
folder = 'header'
outfile = op.join(outdir, '{}.header.{}'.format(pdb_id, file_type))
else:
folder = 'download'
outfile = op.join(outdir, '{}.{}'.format(pdb_id, file_type))
if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
if file_type == 'mmtf.gz' or file_type == 'mmtf':
mmtf_api = '1.0'
download_link = 'http://mmtf.rcsb.org/v{}/full/{}.mmtf.gz'.format(mmtf_api, pdb_id)
else:
download_link = 'http://files.rcsb.org/{}/{}.{}'.format(folder, pdb_id, file_type)
urlretrieve(download_link, outfile)
if gzipped:
outfile = ssbio.utils.gunzip_file(infile=outfile,
outfile=outfile.strip('.gz'),
outdir=outdir,
delete_original=False,
force_rerun_flag=force_rerun)
log.debug('{}: saved structure file'.format(outfile))
else:
if file_type == 'mmtf.gz':
outfile = op.join(outdir, '{}.{}'.format(pdb_id, 'mmtf'))
log.debug('{}: structure file already saved'.format(outfile))
return outfile