Source code for ssbio.protein.structure.homology.itasser.itasserprep

from __future__ import print_function

import getpass
import logging
import os
import os.path as op

from ssbio.protein.sequence.utils import fasta as fasta

log = logging.getLogger(__name__)

[docs]class ITASSERPrep(): """Prepare a protein sequence for an I-TASSER homology modeling run. The main utilities of this class are to: * Allow for the input of a protein sequence string and paths to I-TASSER to create execution scripts * Automate large-scale homology modeling efforts by creating Slurm or TORQUE job scheduling scripts Args: ident: Identifier for your sequence. Will be used as the global ID (folder name, sequence name) seq_str: Sequence in string format root_dir: Local directory where I-TASSER folder will be created itasser_path: Path to I-TASSER folder, i.e. '~/software/I-TASSER4.4' itlib_path: Path to ITLIB folder, i.e. '~/software/ITLIB' execute_dir: Optional path to execution directory - use this if you are copying the homology models to another location such as a supercomputer for running light: If simulations should be limited to 5 runs runtype: How you will be running I-TASSER - local, slurm, or torque print_exec: If the execution script should be printed out java_home: Path to Java executable binding_site_pred: If binding site predictions should be run ec_pred: If EC number predictions should be run go_pred: If GO term predictions should be run additional_options: Any other additional I-TASSER options, appended to the command job_scheduler_header: Any job scheduling options, prepended as a header to the file """ def __init__(self, ident, seq_str, root_dir, itasser_path, itlib_path, execute_dir=None, light=True, runtype='local', print_exec=False, java_home=None, binding_site_pred=False, ec_pred=False, go_pred=False, additional_options=None, job_scheduler_header=None): if runtype.lower() not in ['local', 'torque', 'slurm']: raise ValueError('Invalid runtype, must be "local", "torque", "slurm"') self.id = ident self.seq_str = seq_str if not self.seq_str: raise ValueError('{}: no sequence input'.format(self.id)) if len(self.seq_str) < 10 or len(self.seq_str) > 1500: log.warning('{}: I-TASSER modeling will not run as sequence length ({}) is not in the range [10, 1500]'.format(self.id, len(self.seq_str))) self.root_dir = root_dir if not op.exists(root_dir): os.makedirs(root_dir) if not execute_dir: # If no execute_dir is given, use the same dir as the created folder self.execute_dir = self.prep_folder(seq_str) elif execute_dir: orig_data_dir = self.prep_folder(seq_str) self.execute_dir = op.join(execute_dir, op.basename(orig_data_dir)) self.print_exec = print_exec self.runtype = runtype if light: light = 'true' else: light = 'false' self.light = light self.model_exists = op.exists(op.join(self.execute_dir, 'model1.pdb')) if not additional_options: additional_options = '' else: additional_options += ' ' if binding_site_pred: additional_options += '-LBS true ' if ec_pred: additional_options += '-EC true ' if go_pred: additional_options += '-GO true ' self.additional_options = additional_options if not java_home: self.java_home = '${JAVA_HOME}' else: self.java_home = java_home if not job_scheduler_header: self.job_scheduler_header = '' else: self.job_scheduler_header = job_scheduler_header if runtype == 'local' or runtype == 'torque': self.prep_script_local(itasser_loc=itasser_path, itlib_loc=itlib_path) if runtype == 'slurm': self.prep_script_slurm(itasser_loc=itasser_path, itlib_loc=itlib_path)
[docs] def prep_folder(self, seq): """Take in a sequence string and prepares the folder for the I-TASSER run.""" itasser_dir = op.join(self.root_dir, self.id) if not op.exists(itasser_dir): os.makedirs(itasser_dir) tmp = {self.id: seq} fasta.write_fasta_file_from_dict(indict=tmp, outname='seq', outext='.fasta', outdir=itasser_dir)
return itasser_dir def prep_script_local(self, itasser_loc, itlib_loc): script_file = '{}.sh'.format(self.id) outfile = os.path.join(self.root_dir, script_file) itasser = {'executable': op.join(itasser_loc, 'I-TASSERmod/runI-TASSER.pl'), 'pkgdir': itasser_loc, 'libdir': itlib_loc, 'seqname': self.id, 'datadir': self.execute_dir, 'java_home': self.java_home, 'additional_options': self.additional_options, 'light': self.light} script = open(outfile, 'w') script.write('#!/bin/bash -l\n') if self.runtype == 'torque': script.write('{}'.format(self.job_scheduler_header)) script.write(("{i[executable]} " "-pkgdir {i[pkgdir]} " "-libdir {i[libdir]} " "-seqname {i[seqname]} " "-datadir {i[datadir]} " "-java_home {i[java_home]} " "{i[additional_options]}" "-light {i[light]}\n\n").format(i=itasser)) script.close() os.chmod(outfile, 0o755) if self.print_exec and self.runtype=='local': print('nohup ./{} > {}.out &'.format(op.basename(outfile), os.path.join(self.root_dir, self.id)), end='\n\n') if self.print_exec and self.runtype == 'torque': print('qsub {}'.format(op.basename(outfile), os.path.join(self.root_dir, self.id)), end='; ') return outfile def prep_script_slurm(self, itasser_loc, itlib_loc): script_file = '{}.slm'.format(self.id) outfile = os.path.join(self.root_dir, script_file) itasser = {'executable': op.join(itasser_loc, 'I-TASSERmod/runI-TASSER.pl'), 'pkgdir': itasser_loc, 'libdir': itlib_loc, 'seqname': self.id, 'datadir': self.execute_dir, 'java_home': self.java_home, 'light': self.light, 'additional_options': self.additional_options} slurm = open(outfile, 'w') slurm.write('#!/bin/bash -l\n') slurm.write('{}'.format(self.job_scheduler_header)) slurm.write(('{i[executable]} ' '-pkgdir {i[pkgdir]} ' '-libdir {i[libdir]} ' '-seqname {i[seqname]} ' '-datadir {i[datadir]} ' '-java_home {i[java_home]} ' '{i[additional_options]}' '-light {i[light]}\n\n').format(i=itasser)) slurm.close() os.chmod(outfile, 0o755) if self.print_exec: print('sbatch {}'.format(op.basename(outfile)), end='; ')
return outfile if __name__ == '__main__': pass # TODO: make this an executable script to # 1) ask for global I-TASSER locations # 2) ask for working directory # 3) take in multiple inputs and prepare them for I-TASSER runs # a) input types # i) a single FASTA file with single or multiple sequences # ii) multiple FASTA files contained in the working directory # iii) a dataframe with IDs and sequences # iv) a sequence string and an ID (and optional additional identifiers) # b) types of runs # i) NERSC slurm (sbatch) inputs # ii) local torque (qsub) inputs # iii) simple executable background scripts # 4) Output executable scripts or submit things to the queue # root = '/home/nathan/projects/GEM-PRO/cyano/' # files = glob.glob(os.path.join(root,'*.faa')) # for f in files: # identifier = os.path.splitext(os.path.basename(f))[0] # ip = ITASSERPrep(id=identifier, root_dir='/home/nathan/projects/GEM-PRO/cyano') # # sequence = sl.seq_loader(f, is_file=True) # execute_dir = ip.prep_folder(sequence) # ip.prep_script_local(itasser_loc='/home/nathan/software/I-TASSER4.4', # itlib_loc='/home/nathan/software/ITLIB', # datadir=execute_dir) # ip = ITASSERPrep(id='W5EP13', root_dir='/home/nathan/projects/GEM-PRO/cyano/') # # sequence = sl.seq_loader('/home/nathan/Downloads/W5EP13.faa', is_file=True) # execute_dir = ip.prep_folder(sequence) # ip.prep_script_local(itasser_loc='/home/nathan/software/I-TASSER4.4', # itlib_loc='/home/nathan/software/ITLIB', # datadir=execute_dir) ## below is old run_all script in python # import os # import shutil # import subprocess # # thedir = '.' # folders = [name for name in os.listdir( # thedir) if os.path.isdir(os.path.join(thedir, name))] # folders = sorted(folders, reverse=True) # for_ssb3 = folders[:len(folders) / 2] # # for fo in for_ssb3: # coach = open('%s_coach.sh' % fo, 'w') # # coach.write('#!/bin/bash\n') # coach.write('#PBS -l walltime=05:20:00\n') # coach.write('#PBS -q regular\n') # coach.write('#PBS -N %s\n' % fo) # coach.write('perl ~/software/I-TASSER4.4/I-TASSERmod/runCOACH.pl -pkgdir /home/nathan/software/I-TASSER4.4 -libdir /home/nathan/software/ITLIB -protname %s -model model1.pdb -datadir /home/nathan/projects/GEM-PRO/yome/all_test/%s -GO true\n\n' % (fo, fo)) # # coach.close() # # # subprocess.call('qsub %s_coach.sh;' % (fo), shell=True) # print('qsub %s_coach.sh;' % (fo)),