"""
KEGGProp
===========
"""
import io
import logging
import os.path as op
from collections import defaultdict
from bioservices import KEGG
from slugify import Slugify
import ssbio.utils
from ssbio.protein.sequence.seqprop import SeqProp
log = logging.getLogger(__name__)
custom_slugify = Slugify(safe_chars='-_')
bs_kegg = KEGG()
[docs]class KEGGProp(SeqProp):
def __init__(self, seq, id, name='<unknown name>', description='<unknown description>',
fasta_path=None, txt_path=None, gff_path=None):
SeqProp.__init__(self, id=id, seq=seq, name=name, description=description,
sequence_path=fasta_path, metadata_path=txt_path, feature_path=gff_path)
self.kegg = id
@SeqProp.metadata_path.setter
def metadata_path(self, m_path):
"""Provide pointers to the paths of the metadata file
Args:
m_path: Path to metadata file
"""
if not m_path:
self.metadata_dir = None
self.metadata_file = None
else:
if not op.exists(m_path):
raise OSError('{}: file does not exist!'.format(m_path))
if not op.dirname(m_path):
self.metadata_dir = '.'
else:
self.metadata_dir = op.dirname(m_path)
self.metadata_file = op.basename(m_path)
# TODO: update using Biopython's built in SeqRecord parser
# Just updating IDs and stuff
self.update(parse_kegg_gene_metadata(self.metadata_path), overwrite=True)
def download_seq_file(self, outdir, force_rerun=False):
kegg_seq_file = download_kegg_aa_seq(gene_id=self.id,
outdir=outdir,
force_rerun=force_rerun)
if kegg_seq_file:
self.sequence_path = kegg_seq_file
else:
log.warning('{}: no sequence file available'.format(self.id))
def download_metadata_file(self, outdir, force_rerun=False):
kegg_metadata_file = download_kegg_gene_metadata(gene_id=self.id,
outdir=outdir,
force_rerun=force_rerun)
if kegg_metadata_file:
self.metadata_path = kegg_metadata_file
else:
log.warning('{}: no metadata file available'.format(self.id))
return outfile
return metadata
[docs]def download_kegg_aa_seq(gene_id, outdir=None, force_rerun=False):
"""Download a FASTA sequence of a protein from the KEGG database and return the path.
Args:
gene_id: the gene identifier
outdir: optional path to output directory
Returns:
Path to FASTA file
"""
if not outdir:
outdir = ''
outfile = op.join(outdir, '{}.faa'.format(custom_slugify(gene_id)))
if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
raw_text = bs_kegg.get("{}".format(gene_id), option='aaseq')
if raw_text == 404:
return
with io.open(outfile, mode='wt', encoding='utf-8') as f:
f.write(raw_text)
log.debug('{}: downloaded KEGG FASTA file'.format(outfile))
else:
log.debug('{}: KEGG FASTA file already exists'.format(outfile))
return outfile
[docs]def map_kegg_all_genes(organism_code, target_db):
"""Map all of an organism's gene IDs to the target database.
This is faster than supplying a specific list of genes to map,
plus there seems to be a limit on the number you can map with a manual REST query anyway.
Args:
organism_code: the three letter KEGG code of your organism
target_db: ncbi-proteinid | ncbi-geneid | uniprot
Returns:
Dictionary of ID mapping
"""
mapping = bs_kegg.conv(target_db, organism_code)
# strip the organism code from the keys and the identifier in the values
new_mapping = {}
for k,v in mapping.items():
new_mapping[k.replace(organism_code + ':', '')] = str(v.split(':')[1])
return new_mapping