Source code for ssbio.databases.kegg

"""
KEGGProp
===========
"""

import io
import logging
import os.path as op
from collections import defaultdict

from bioservices import KEGG
from slugify import Slugify

import ssbio.utils
from ssbio.protein.sequence.seqprop import SeqProp

log = logging.getLogger(__name__)
custom_slugify = Slugify(safe_chars='-_')
bs_kegg = KEGG()


[docs]class KEGGProp(SeqProp): def __init__(self, seq, id, name='<unknown name>', description='<unknown description>', fasta_path=None, txt_path=None, gff_path=None): SeqProp.__init__(self, id=id, seq=seq, name=name, description=description, sequence_path=fasta_path, metadata_path=txt_path, feature_path=gff_path) self.kegg = id @SeqProp.metadata_path.setter def metadata_path(self, m_path): """Provide pointers to the paths of the metadata file Args: m_path: Path to metadata file """ if not m_path: self.metadata_dir = None self.metadata_file = None else: if not op.exists(m_path): raise OSError('{}: file does not exist!'.format(m_path)) if not op.dirname(m_path): self.metadata_dir = '.' else: self.metadata_dir = op.dirname(m_path) self.metadata_file = op.basename(m_path) # TODO: update using Biopython's built in SeqRecord parser # Just updating IDs and stuff self.update(parse_kegg_gene_metadata(self.metadata_path), overwrite=True) def download_seq_file(self, outdir, force_rerun=False): kegg_seq_file = download_kegg_aa_seq(gene_id=self.id, outdir=outdir, force_rerun=force_rerun) if kegg_seq_file: self.sequence_path = kegg_seq_file else: log.warning('{}: no sequence file available'.format(self.id)) def download_metadata_file(self, outdir, force_rerun=False): kegg_metadata_file = download_kegg_gene_metadata(gene_id=self.id, outdir=outdir, force_rerun=force_rerun) if kegg_metadata_file: self.metadata_path = kegg_metadata_file else:
log.warning('{}: no metadata file available'.format(self.id))
[docs]def download_kegg_gene_metadata(gene_id, outdir=None, force_rerun=False): """Download the KEGG flatfile for a KEGG ID and return the path. Args: gene_id: KEGG gene ID (with organism code), i.e. "eco:1244" outdir: optional output directory of metadata Returns: Path to metadata file """ if not outdir: outdir = '' # Replace colon with dash in the KEGG gene ID outfile = op.join(outdir, '{}.kegg'.format(custom_slugify(gene_id))) if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile): raw_text = bs_kegg.get("{}".format(gene_id)) if raw_text == 404: return with io.open(outfile, mode='wt', encoding='utf-8') as f: f.write(raw_text) log.debug('{}: downloaded KEGG metadata file'.format(outfile)) else: log.debug('{}: KEGG metadata file already exists'.format(outfile))
return outfile
[docs]def parse_kegg_gene_metadata(infile): """Parse the KEGG flatfile and return a dictionary of metadata. Dictionary keys are: refseq uniprot pdbs taxonomy Args: infile: Path to KEGG flatfile Returns: dict: Dictionary of metadata """ metadata = defaultdict(str) with open(infile) as mf: kegg_parsed = bs_kegg.parse(mf.read()) # TODO: additional fields can be parsed if 'DBLINKS' in kegg_parsed.keys(): if 'UniProt' in kegg_parsed['DBLINKS']: unis = str(kegg_parsed['DBLINKS']['UniProt']).split(' ') # TODO: losing other uniprot ids by doing this if isinstance(unis, list): metadata['uniprot'] = unis[0] else: metadata['uniprot'] = unis if 'NCBI-ProteinID' in kegg_parsed['DBLINKS']: metadata['refseq'] = str(kegg_parsed['DBLINKS']['NCBI-ProteinID']) if 'STRUCTURE' in kegg_parsed.keys(): metadata['pdbs'] = str(kegg_parsed['STRUCTURE']['PDB']).split(' ') else: metadata['pdbs'] = None if 'ORGANISM' in kegg_parsed.keys(): metadata['taxonomy'] = str(kegg_parsed['ORGANISM'])
return metadata
[docs]def download_kegg_aa_seq(gene_id, outdir=None, force_rerun=False): """Download a FASTA sequence of a protein from the KEGG database and return the path. Args: gene_id: the gene identifier outdir: optional path to output directory Returns: Path to FASTA file """ if not outdir: outdir = '' outfile = op.join(outdir, '{}.faa'.format(custom_slugify(gene_id))) if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile): raw_text = bs_kegg.get("{}".format(gene_id), option='aaseq') if raw_text == 404: return with io.open(outfile, mode='wt', encoding='utf-8') as f: f.write(raw_text) log.debug('{}: downloaded KEGG FASTA file'.format(outfile)) else: log.debug('{}: KEGG FASTA file already exists'.format(outfile))
return outfile
[docs]def map_kegg_all_genes(organism_code, target_db): """Map all of an organism's gene IDs to the target database. This is faster than supplying a specific list of genes to map, plus there seems to be a limit on the number you can map with a manual REST query anyway. Args: organism_code: the three letter KEGG code of your organism target_db: ncbi-proteinid | ncbi-geneid | uniprot Returns: Dictionary of ID mapping """ mapping = bs_kegg.conv(target_db, organism_code) # strip the organism code from the keys and the identifier in the values new_mapping = {} for k,v in mapping.items(): new_mapping[k.replace(organism_code + ':', '')] = str(v.split(':')[1])
return new_mapping