Source code for ssbio.protein.sequence.properties.scratch

import logging
import os.path as op

import ssbio.protein.sequence.utils.fasta
import ssbio.utils

log = logging.getLogger(__name__)


class SCRATCH():
    """Provide wrappers for running and parsing SCRATCH on a sequence file or sequence string.

    To run from the command line::

        ./run_SCRATCH-1D_predictors.sh  input_fasta  output_prefix  [num_threads]

    SCRATCH predicts:

        - Secondary structure
            - 3 classes (helix, strand, other) using SSpro
            - 8 classes (standard DSSP definitions) using SSpro8
        - Relative solvent accessibility (RSA, also known as relative accessible surface area)
            - @ 25% exposed RSA cutoff (<25% RSA means it is buried)
            - @ all cutoffs in 5% increments from 0 to 100

    Note:
        The ``out_*`` attributes (output file paths) are only set by :meth:`run_scratch`;
        the ``*_results`` and ``*_summary`` methods must be called after it.

    """

    # TODO: also provide summary dataframes

    def __init__(self, project_name, seq_file=None, seq_str=None):
        """Store the project name and the sequence to run SCRATCH on.

        Args:
            project_name: Name of the project, used as the default output prefix
            seq_file: Path to an input sequence file
            seq_str: Sequence string; when given (and non-empty) it is written to a
                temporary FASTA file whose path replaces ``seq_file``

        """
        self.project_name = project_name
        self.seq_file = seq_file

        if seq_str:
            self.seq_file = ssbio.protein.sequence.utils.fasta.write_seq_as_temp_fasta(seq_str)

    def run_scratch(self, path_to_scratch, num_cores=1, outname=None, outdir=None, force_rerun=False):
        """Run SCRATCH on the sequence_file that was loaded into the class.

        Sets ``out_sspro``, ``out_sspro8``, ``out_accpro`` and ``out_accpro20`` to the
        expected output paths (``<outdir>/<outname>`` plus ``.ss``, ``.ss8``, ``.acc``, ``.acc20``).

        Args:
            path_to_scratch: Path to the SCRATCH executable, run_SCRATCH-1D_predictors.sh
            num_cores: Number of threads passed as the last command line argument
            outname: Prefix to name the output files (defaults to the project name)
            outdir: Directory to store the output files (defaults to the current directory)
            force_rerun: Flag to force rerunning of SCRATCH even if the output files exist

        """
        if not outname:
            outname = self.project_name
        if not outdir:
            outdir = ''

        outname = op.join(outdir, outname)

        self.out_sspro = '{}.ss'.format(outname)
        self.out_sspro8 = '{}.ss8'.format(outname)
        self.out_accpro = '{}.acc'.format(outname)
        self.out_accpro20 = '{}.acc20'.format(outname)

        # TODO: check for multiple output files in command_runner
        # Only the SSpro output is used to decide whether results already exist.
        ssbio.utils.command_runner(
            shell_command='{} {} {} {}'.format(path_to_scratch, self.seq_file, outname, num_cores),
            force_rerun_flag=force_rerun,
            outfile_checker=self.out_sspro)

    @staticmethod
    def _symbol_fractions(seq, labeled_symbols, suffix):
        """Return the fraction of ``seq`` made up of each symbol.

        Args:
            seq: Sequence-like object supporting ``len()`` and ``count()``
            labeled_symbols: Iterable of ``(label, symbol)`` pairs
            suffix: Predictor name appended to each key

        Returns:
            dict: ``'percent_<label>-<suffix>'`` -> fraction (0 to 1) of positions equal to ``symbol``

        """
        total = float(len(seq))
        return {'percent_{}-{}'.format(label, suffix): seq.count(symbol) / total
                for label, symbol in labeled_symbols}

    @staticmethod
    def _cutoff_as_percent(cutoff):
        """Return ``cutoff`` on the 0-100 scale; values below 1 are treated as fractions."""
        if cutoff < 1:
            return cutoff * 100
        return cutoff

    @staticmethod
    def _accpro20_fractions(accessibilities, cutoff):
        """Return exposed/buried fractions for one sequence.

        Args:
            accessibilities: List of per-residue accessibility values
            cutoff: Threshold on the same scale; values >= cutoff count as exposed

        Returns:
            dict: ``percent_exposed-accpro20`` and ``percent_buried-accpro20`` fractions (0 to 1)

        """
        total = float(len(accessibilities))
        exposed = sum(1 for value in accessibilities if value >= cutoff)
        buried = len(accessibilities) - exposed
        return {'percent_exposed-accpro20': exposed / total,
                'percent_buried-accpro20': buried / total}

    def sspro_results(self):
        """Parse the SSpro output file and return a dict of secondary structure compositions.

        Returns:
            dict: Keys are sequence IDs, values are the lists of secondary structure predictions.
                H: helix
                E: strand
                C: the rest

        """
        return ssbio.protein.sequence.utils.fasta.load_fasta_file_as_dict_of_seqs(self.out_sspro)

    def sspro_summary(self):
        """Parse the SSpro output file and return a summary of secondary structure composition.

        The output file is just a FASTA formatted file, so you can get residue level
        information by parsing it like a normal sequence file.

        Returns:
            dict: Keys are sequence IDs, values are dicts with the fraction (0 to 1,
            despite the ``percent_`` key prefix) of:
                H: helix
                E: strand
                C: the rest

        """
        classes = [(symbol, symbol) for symbol in 'HEC']
        records = ssbio.protein.sequence.utils.fasta.load_fasta_file(self.out_sspro)
        return {r.id: self._symbol_fractions(r.seq, classes, 'sspro') for r in records}

    def sspro8_results(self):
        """Parse the SSpro8 output file and return a dict of secondary structure compositions.

        Returns:
            dict: Keys are sequence IDs, values are the 8-class secondary structure predictions.

        """
        return ssbio.protein.sequence.utils.fasta.load_fasta_file_as_dict_of_seqs(self.out_sspro8)

    def sspro8_summary(self):
        """Parse the SSpro8 output file and return a summary of secondary structure composition.

        The output file is just a FASTA formatted file, so you can get residue level
        information by parsing it like a normal sequence file.

        Returns:
            dict: Keys are sequence IDs, values are dicts with the fraction (0 to 1,
            despite the ``percent_`` key prefix) of:
                H: alpha-helix
                G: 310-helix
                I: pi-helix (extremely rare)
                E: extended strand
                B: beta-bridge
                T: turn
                S: bend
                C: the rest

        """
        classes = [(symbol, symbol) for symbol in 'HGIEBTSC']
        records = ssbio.protein.sequence.utils.fasta.load_fasta_file(self.out_sspro8)
        return {r.id: self._symbol_fractions(r.seq, classes, 'sspro8') for r in records}

    def accpro_results(self):
        """Parse the ACCpro output file and return a dict of solvent accessibility predictions.

        Returns:
            dict: Keys are sequence IDs, values are the per-residue exposed/buried predictions.

        """
        return ssbio.protein.sequence.utils.fasta.load_fasta_file_as_dict_of_seqs(self.out_accpro)

    def accpro_summary(self):
        """Parse the ACCpro output file and return a summary of percent exposed/buried residues.

        The output file is just a FASTA formatted file, so you can get residue level
        information by parsing it like a normal sequence file.

        Returns:
            dict: Keys are sequence IDs, values are dicts with the fraction (0 to 1) of
            exposed (``e``) and buried (``-``) residues

        """
        classes = (('exposed', 'e'), ('buried', '-'))
        records = ssbio.protein.sequence.utils.fasta.load_fasta_file(self.out_accpro)
        return {r.id: self._symbol_fractions(r.seq, classes, 'accpro') for r in records}

    def accpro20_results(self):
        """Parse the ACCpro20 output file and return the per-residue accessibility values.

        Returns:
            dict: Keys are sequence IDs, values are lists of integer accessibilities.

        """
        return read_accpro20(self.out_accpro20)

    def accpro20_summary(self, cutoff):
        """Parse the ACCpro20 output file and return a summary of percent exposed/buried residues based on a cutoff.

        Below the cutoff = buried
        Equal to or greater than cutoff = exposed
        The default cutoff used in accpro is 25%.

        Args:
            cutoff (float): Cutoff for defining a buried or exposed residue. Values below 1
                are interpreted as fractions and scaled to percent (e.g. 0.25 -> 25).

        Returns:
            dict: Keys are sequence IDs, values are dicts with the fraction (0 to 1) of
            buried and exposed residues

        """
        cutoff = self._cutoff_as_percent(cutoff)
        records = read_accpro20(self.out_accpro20)
        return {seq_id: self._accpro20_fractions(values, cutoff)
                for seq_id, values in records.items()}
def read_accpro20(infile):
    """Read the accpro20 output (.acc20) and return the parsed accessibilities.

    The file is read as alternating lines: a FASTA-style header (``>ID ...``) followed by
    one line of whitespace-separated integer accessibilities. Blank lines are ignored and
    repeated or trailing whitespace between the numbers is tolerated.

    Args:
        infile: Path to .acc20 file

    Returns:
        dict: Dictionary of accessibilities (lists of ints) with keys as the ID, where the ID
        is the header text after the leading character up to the first space

    Raises:
        ValueError: If a header has no accessibility line after it, or a value is not an integer

    """
    with open(infile) as f:
        # Skip whitespace-only lines so a trailing newline or spacer line does not
        # break the header/data pairing.
        lines = [line for line in f.read().splitlines() if line.strip()]

    if len(lines) % 2 != 0:
        raise ValueError('{}: header "{}" has no accessibility line'.format(infile, lines[-1]))

    accpro20_dict = {}
    for header, values in zip(lines[0::2], lines[1::2]):
        # TODO: Double check how to parse FASTA IDs (can they have a space because that is what i split by)
        # Key was originally the full header without the leading '>'
        seq_id = header.split(' ')[0][1:]
        accpro20_dict[seq_id] = [int(x) for x in values.split()]

    return accpro20_dict