Source code for ssbio.protein.sequence.utils.blast

"""
Sequence BLAST
==============
"""

import os
import subprocess
import pandas as pd
import os.path as op
from ssbio import utils
import logging
try:
    from IPython.display import clear_output
    have_ipython = True
    from tqdm import tqdm_notebook as tqdm
except ImportError:
    have_ipython = False
    from tqdm import tqdm

date = utils.Date()
log = logging.getLogger(__name__)


[docs]def run_makeblastdb(infile, dbtype, outdir=''):
    """Make the BLAST database for a genome file.

    Args:
        infile (str): path to genome FASTA file
        dbtype (str): "nucl" or "prot" - what format your genome files are in
        outdir (str): path to directory to output database files (default is original folder)

    Returns:
        Paths to BLAST databases.

    """
    # TODO: add force_rerun option
    # TODO: rewrite using utils function command

    # Output location
    og_dir, name, ext = utils.split_folder_and_path(infile)
    if not outdir:
        outdir = og_dir
    outfile_basename = op.join(outdir, name)

    # Check if BLAST DB was already made
    if dbtype == 'nucl':
        outext = ['.nhr', '.nin', '.nsq']
    elif dbtype == 'prot':
        outext = ['.phr', '.pin', '.psq']
    else:
        raise ValueError('dbtype must be "nucl" or "prot"')
    outfile_all = [outfile_basename + x for x in outext]
    db_made = True
    for f in outfile_all:
        if not op.exists(f):
            db_made = False

    # Run makeblastdb if DB does not exist
    if db_made:
        log.debug('BLAST database already exists at {}'.format(outfile_basename))
        return outfile_all
    else:
        retval = subprocess.call('makeblastdb -in {} -dbtype {} -out {}'.format(infile, dbtype, outfile_basename), shell=True)
        if retval == 0:
            log.debug('Made BLAST database at {}'.format(outfile_basename))
            return outfile_all
        else:
            log.error('Error running makeblastdb, exit code {}'.format(retval))


[docs]def run_bidirectional_blast(reference, other_genome, dbtype, outdir=''):
    """BLAST a genome against another, and vice versa.

    This function requires BLAST to be installed, do so by running:
    sudo apt install ncbi-blast+

    Args:
        reference (str): path to "reference" genome, aka your "base strain"
        other_genome (str): path to other genome which will be BLASTed to the reference
        dbtype (str): "nucl" or "prot" - what format your genome files are in
        outdir (str): path to folder where BLAST outputs should be placed

    Returns:
        Paths to BLAST output files.
        (reference_vs_othergenome.out, othergenome_vs_reference.out)

    """
    # TODO: add force_rerun option

    if dbtype == 'nucl':
        command = 'blastn'
    elif dbtype == 'prot':
        command = 'blastp'
    else:
        raise ValueError('dbtype must be "nucl" or "prot"')

    r_folder, r_name, r_ext = utils.split_folder_and_path(reference)
    g_folder, g_name, g_ext = utils.split_folder_and_path(other_genome)

    # make sure BLAST DBs have been made
    run_makeblastdb(infile=reference, dbtype=dbtype, outdir=r_folder)
    run_makeblastdb(infile=other_genome, dbtype=dbtype, outdir=g_folder)

    # Reference vs genome
    r_vs_g = r_name + '_vs_' + g_name + '_blast.out'
    r_vs_g = op.join(outdir, r_vs_g)
    if op.exists(r_vs_g) and os.stat(r_vs_g).st_size != 0:
        log.debug('{} vs {} BLAST already run'.format(r_name, g_name))
    else:
        cmd = '{} -query {} -db {} -outfmt 6 -out {}'.format(command, reference, op.join(g_folder, g_name), r_vs_g)
        log.debug('Running: {}'.format(cmd))
        retval = subprocess.call(cmd, shell=True)
        if retval == 0:
            log.debug('BLASTed {} vs {}'.format(g_name, r_name))
        else:
            log.error('Error running {}, exit code {}'.format(command, retval))

    # Genome vs reference
    g_vs_r = g_name + '_vs_' + r_name + '_blast.out'
    g_vs_r = op.join(outdir, g_vs_r)
    if op.exists(g_vs_r) and os.stat(g_vs_r).st_size != 0:
        log.debug('{} vs {} BLAST already run'.format(g_name, r_name))
    else:
        cmd = '{} -query {} -db {} -outfmt 6 -out {}'.format(command, other_genome, op.join(r_folder, r_name), g_vs_r)
        log.debug('Running: {}'.format(cmd))
        retval = subprocess.call(cmd, shell=True)
        if retval == 0:
            log.debug('BLASTed {} vs {}'.format(g_name, r_name))
        else:
            log.error('Error running {}, exit code {}'.format(command, retval))

    return r_vs_g, g_vs_r


[docs]def print_run_bidirectional_blast(reference, other_genome, dbtype, outdir):
    """Write torque submission files for running bidirectional blast on a server and print execution command.

    Args:
        reference (str): Path to "reference" genome, aka your "base strain"
        other_genome (str): Path to other genome which will be BLASTed to the reference
        dbtype (str): "nucl" or "prot" - what format your genome files are in
        outdir (str): Path to folder where Torque scripts should be placed

    """
    # TODO: add force_rerun option

    if dbtype == 'nucl':
        command = 'blastn'
    elif dbtype == 'prot':
        command = 'blastp'
    else:
        raise ValueError('dbtype must be "nucl" or "prot"')

    r_folder, r_name, r_ext = utils.split_folder_and_path(reference)
    g_folder, g_name, g_ext = utils.split_folder_and_path(other_genome)

    # Reference vs genome
    r_vs_g_name = r_name + '_vs_' + g_name
    r_vs_g = r_vs_g_name + '_blast.out'
    if op.exists(op.join(outdir, r_vs_g)) and os.stat(op.join(outdir, r_vs_g)).st_size != 0:
        log.debug('{} vs {} BLAST already run'.format(r_name, g_name))
    else:
        cmd = '{} -query {} -db {} -outfmt 6 -out {}'.format(command, reference, g_name, r_vs_g)
        utils.write_torque_script(command=cmd, err=r_vs_g_name, out=r_vs_g_name, name=r_vs_g_name,
                                  outfile=op.join(outdir, r_vs_g_name) + '.sh',
                                  walltime='00:15:00', queue='regular')

    # Genome vs reference
    g_vs_r_name = g_name + '_vs_' + r_name
    g_vs_r = g_vs_r_name + '_blast.out'
    if op.exists(op.join(outdir, g_vs_r)) and os.stat(op.join(outdir, g_vs_r)).st_size != 0:
        log.debug('{} vs {} BLAST already run'.format(g_name, r_name))
    else:
        cmd = '{} -query {} -db {} -outfmt 6 -out {}'.format(command, other_genome, r_name, g_vs_r)
        utils.write_torque_script(command=cmd, err=g_vs_r_name, out=g_vs_r_name, name=g_vs_r_name,
                                  outfile=op.join(outdir, g_vs_r_name) + '.sh',
                                  walltime='00:15:00', queue='regular')


[docs]def calculate_bbh(blast_results_1, blast_results_2, r_name=None, g_name=None, outdir=''):
    """Calculate the best bidirectional BLAST hits (BBH) and save a dataframe of results.

    Args:
        blast_results_1 (str): BLAST results for reference vs. other genome
        blast_results_2 (str): BLAST results for other vs. reference genome
        r_name: Name of reference genome
        g_name: Name of other genome
        outdir: Directory where BLAST results are stored.

    Returns:
        Path to Pandas DataFrame of the BBH results.

    """
    # TODO: add force_rerun option

    cols = ['gene', 'subject', 'PID', 'alnLength', 'mismatchCount', 'gapOpenCount', 'queryStart', 'queryEnd',
            'subjectStart', 'subjectEnd', 'eVal', 'bitScore']

    if not r_name and not g_name:
        r_name = op.basename(blast_results_1).split('_vs_')[0]
        g_name = op.basename(blast_results_1).split('_vs_')[1].replace('_blast.out', '')

        r_name2 = op.basename(blast_results_2).split('_vs_')[1].replace('_blast.out', '')
        if r_name != r_name2:
            log.warning('{} != {}'.format(r_name, r_name2))

    outfile = op.join(outdir, '{}_vs_{}_bbh.csv'.format(r_name, g_name))
    if op.exists(outfile) and os.stat(outfile).st_size != 0:
        log.debug('{} vs {} BLAST BBHs already found at {}'.format(r_name, g_name, outfile))
        return outfile

    bbh1 = pd.read_csv(blast_results_1, sep='\t', names=cols)
    bbh2 = pd.read_csv(blast_results_2, sep='\t', names=cols)

    out = pd.DataFrame()
    log.debug('Finding BBHs for {} vs. {}'.format(r_name, g_name))

    for g in bbh1[pd.notnull(bbh1.gene)].gene.unique():
        res = bbh1[bbh1.gene == g]
        if len(res) == 0:
            continue
        best_hit = res.ix[res.PID.idxmax()].copy()
        best_gene = best_hit.subject
        res2 = bbh2[bbh2.gene == best_gene]
        if len(res2) == 0:
            continue
        best_hit2 = res2.ix[res2.PID.idxmax()]
        best_gene2 = best_hit2.subject
        if g == best_gene2:
            best_hit['BBH'] = '<=>'
        else:
            best_hit['BBH'] = '->'
        out = pd.concat([out, pd.DataFrame(best_hit).transpose()])

    out.to_csv(outfile)
    log.debug('{} vs {} BLAST BBHs saved at {}'.format(r_name, g_name, outfile))
    return outfile


[docs]def create_orthology_matrix(r_name, genome_to_bbh_files, pid_cutoff=None, bitscore_cutoff=None, evalue_cutoff=None,
                            filter_condition='OR', outname='', outdir='', force_rerun=False):
    """Create an orthology matrix using best bidirectional BLAST hits (BBH) outputs.

    Args:
        r_name (str): Name of the reference genome
        genome_to_bbh_files (dict): Mapping of genome names to the BBH csv output from the
            :func:`~ssbio.protein.sequence.utils.blast.calculate_bbh` method
        pid_cutoff (float): Minimum percent identity between BLAST hits to filter for in the range [0, 100]
        bitscore_cutoff (float): Minimum bitscore allowed between BLAST hits
        evalue_cutoff (float): Maximum E-value allowed between BLAST hits
        filter_condition (str): 'OR' or 'AND', how to combine cutoff filters. 'OR' gives more results since it
            is less stringent, as you will be filtering for hits with (>80% PID or >30 bitscore or <0.0001 evalue).
        outname: Name of output file of orthology matrix
        outdir: Path to output directory
        force_rerun (bool): Force recreation of the orthology matrix even if the outfile exists

    Returns:
        str: Path to orthologous genes matrix.

    """
    if outname:
        outfile = op.join(outdir, outname)
    else:
        outfile = op.join(outdir, '{}_orthology.csv'.format(r_name))

    if op.exists(outfile) and os.stat(outfile).st_size != 0 and not force_rerun:
        return outfile

    if not pid_cutoff and not bitscore_cutoff and not evalue_cutoff:
        log.warning('No cutoffs supplied, insignificant hits may be reported')

    if filter_condition not in ['OR', 'AND']:
        raise ValueError('Invalid filter condition supplied, must be either "OR" or "AND"')

    if not pid_cutoff:
        pid_cutoff = 0
    if not bitscore_cutoff:
        bitscore_cutoff = 0
    if not evalue_cutoff:
        evalue_cutoff = float('Inf')

    out = pd.DataFrame()
    for g_name, bbh_path in tqdm(genome_to_bbh_files.items()):

        df_bbh = pd.read_csv(bbh_path, index_col=0)
        bidirectional = df_bbh[df_bbh.BBH == '<=>']

        if filter_condition == 'OR':
            data = bidirectional[(bidirectional.PID > pid_cutoff) | (bidirectional.eVal < evalue_cutoff) | (bidirectional.bitScore > bitscore_cutoff)]
        elif filter_condition == 'AND':
            data = bidirectional[(bidirectional.PID > pid_cutoff) & (bidirectional.eVal < evalue_cutoff) & (bidirectional.bitScore > bitscore_cutoff)]

        data.index = data.gene
        data2 = data[['subject']]
        if out.empty:
            out = data2
            out = out.rename(columns={'subject': g_name})
        else:
            out = pd.merge(out, data2, left_index=True, right_index=True, how='outer')
            out = out.rename(columns={'subject': g_name})

    out.to_csv(outfile)
    log.debug('{} orthologous genes saved at {}'.format(r_name, outfile))
    return outfile