Source code for ssbio.protein.sequence.properties.tmhmm

from collections import defaultdict
import logging
log = logging.getLogger(__name__)

# Parsing TMHMM results
# There are two output formats: Long and short. Long output format
#
# For the long format (default), tmhmm gives some statistics and a list of the location of the predicted transmembrane helices and the predicted location of the intervening loop regions. Here is an example:
#
#     # COX2_BACSU Length: 278
#     # COX2_BACSU Number of predicted TMHs: 3
#     # COX2_BACSU Exp number of AAs in TMHs: 68.6888999999999
#     # COX2_BACSU Exp number, first 60 AAs: 39.8875
#     # COX2_BACSU Total prob of N-in: 0.99950
#     # COX2_BACSU POSSIBLE N-term signal sequence
#     COX2_BACSU TMHMM2.0 inside 1 6
#     COX2_BACSU TMHMM2.0 TMhelix 7 29
#     COX2_BACSU TMHMM2.0 outside 30 43
#     COX2_BACSU TMHMM2.0 TMhelix 44 66
#     COX2_BACSU TMHMM2.0 inside 67 86
#     COX2_BACSU TMHMM2.0 TMhelix 87 109
#     COX2_BACSU TMHMM2.0 outside 110 278
#
# If the whole sequence is labeled as inside or outside, the prediction is that it contains no membrane
# helices. It is probably not wise to interpret it as a prediction of location. The prediction gives the most probable location and orientation of transmembrane helices in the sequence. It is found by an algorithm called N-best (or 1-best in this case) that sums over all paths through the model with the same location and direction of the helices.
#
# The first few lines gives some statistics:
#
# Length: the length of the protein sequence.
#
# Number of predicted TMHs: The number of predicted transmembrane helices.
#
# Exp number of AAs in TMHs: The expected number of amino acids intransmembrane helices. If this number is larger than 18 it is very likely to be a transmembrane protein (OR have a signal peptide).
#
# Exp number, first 60 AAs: The expected number of amino acids in transmembrane helices in the first 60 amino acids of the protein. If this number more than a few, you should be warned that a predicted transmembrane helix in the N-term could be a signal peptide.
#
# Total prob of N-in: The total probability that the N-term is on the cytoplasmic side of the membrane.
#
# POSSIBLE N-term signal sequence: a warning that is produced when "Exp number, first 60 AAs" is larger than 10.


[docs]def parse_tmhmm_long(tmhmm_results): """Parse the 'long' output format of TMHMM and return a dictionary of ``{sequence_ID: TMHMM_prediction}``. Args: tmhmm_results (str): Path to long format TMHMM output. Returns: dict: Dictionary of ``{sequence_ID: TMHMM_prediction}`` """ with open(tmhmm_results) as f: lines = f.read().splitlines() infodict = defaultdict(dict) for l in lines: if 'Number of predicted TMHs:' in l: gene = l.split(' Number')[0].strip('# ') infodict[gene]['num_tm_helices'] = int(l.split(': ')[1]) if 'WARNING' in l: log.warning('{}: no TMHMM predictions'.format(l)) continue # TODO: POSSIBLE N-term signal sequence - parse this # Look for the lines without #, these are the TM predicted regions if '#' not in l: stuff = l.split() if stuff[1] == 'TMHMM2.0': gene = stuff[0] region = stuff[2] region_start = stuff[3] region_end = stuff[4] if 'sequence' in infodict[gene]: tm_seq = infodict[gene]['sequence'] else: tm_seq = '' if region == 'outside': info = 'O' elif region == 'inside': info = 'I' elif region == 'TMhelix': info = 'T' else: log.error('{}: unknown region type'.format(info)) info = '-' for r in range(int(region_start), int(region_end) + 1): tm_seq += info infodict[gene]['sequence'] = tm_seq
return infodict
[docs]def label_TM_tmhmm_residue_numbers_and_leaflets(tmhmm_seq): """Determine the residue numbers of the TM-helix residues that cross the membrane and label them by leaflet. Args: tmhmm_seq: g.protein.representative_sequence.seq_record.letter_annotations['TM-tmhmm'] Returns: leaflet_dict: a dictionary with leaflet_variable : [residue list] where the variable is inside or outside TM_boundary dict: outputs a dictionar with : TM helix number : [TM helix residue start , TM helix residue end] TODO: untested method! """ TM_number_dict = {} T_index = [] T_residue = [] residue_count = 1 for residue_label in tmhmm_seq: if residue_label == 'T': T_residue.append(residue_count) residue_count = residue_count + 1 TM_number_dict.update({'T_residue': T_residue}) # finding the TM boundaries T_residue_list = TM_number_dict['T_residue'] count = 0 max_count = len(T_residue_list) - 1 TM_helix_count = 0 TM_boundary_dict = {} while count <= max_count: # first residue = TM start if count == 0: TM_start = T_residue_list[count] count = count + 1 continue # Last residue = TM end elif count == max_count: TM_end = T_residue_list[count] TM_helix_count = TM_helix_count + 1 TM_boundary_dict.update({'TM_helix_' + str(TM_helix_count): [TM_start, TM_end]}) break # middle residues need to be start or end elif T_residue_list[count] != T_residue_list[count + 1] - 1: TM_end = T_residue_list[count] TM_helix_count = TM_helix_count + 1 TM_boundary_dict.update({'TM_helix_' + str(TM_helix_count): [TM_start, TM_end]}) # new TM_start TM_start = T_residue_list[count + 1] count = count + 1 # assign leaflet to proper TM residues O or I leaflet_dict = {} for leaflet in ['O', 'I']: leaflet_list = [] for TM_helix, TM_residues in TM_boundary_dict.items(): for residue_num in TM_residues: tmhmm_seq_index = residue_num - 1 previous_residue = tmhmm_seq_index - 1 next_residue = tmhmm_seq_index + 1 # identify if the previous or next residue closest to the TM helix start/end is the proper leaflet if tmhmm_seq[previous_residue] == leaflet or tmhmm_seq[next_residue] == leaflet: leaflet_list.append(residue_num) leaflet_dict.update({'tmhmm_leaflet_' + leaflet: leaflet_list})
return TM_boundary_dict, leaflet_dict