import logging
from Bio.PDB.Polypeptide import one_to_three
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import ssbio.protein.sequence.utils
import ssbio.utils
log = logging.getLogger(__name__)
# Biochemical classes mapped to their member residues (one-letter codes).
# Each class is spelled as a compact string and expanded to a list of letters.
_aa_property_dict_one = {
    class_name: list(letters)
    for class_name, letters in (
        ('Aliphatic', 'AILV'),
        ('Aromatic', 'FHWY'),
        ('Non-polar', 'ACFGILMPVWY'),
        ('Polar', 'DEHKNQRST'),
        ('Charged', 'DEHKR'),
        ('Basic', 'HKR'),
        ('Acidic', 'DE'),
    )
}
# Classes that are currently left out of the table:
# 'Tiny': ['A','C','G','S','T']
# 'Small': ['A','C','D','G','N','P','S','T','V']

# Same classes, with members converted to three-letter codes by one_to_three().
_aa_property_dict_three = {
    class_name: [one_to_three(letter) for letter in members]
    for class_name, members in _aa_property_dict_one.items()
}
# Flexibility parameter per residue, keyed by one-letter code.
# flexibility_index() in this module cites Smith et al., Protein Sci. 2003, 12:1060
# for these values.
_aa_flexibility_dict_one = dict(
    A=-0.605, C=-0.692, D=-0.279, E=-0.16,
    F=-0.719, G=-0.537, H=-0.662, I=-0.682,
    K=-0.043, L=-0.631, M=-0.626, N=-0.381,
    P=-0.271, Q=-0.368, R=-0.448, S=-0.424,
    T=-0.525, V=-0.669, W=-0.727, Y=-0.721,
)

# Same parameters, keyed by the three-letter code produced by one_to_three().
_aa_flexibility_dict_three = {
    one_to_three(letter): value
    for letter, value in _aa_flexibility_dict_one.items()
}
# Human-readable labels for the property keys used in this module.
# Keys ending in "-biop" label Biopython-derived values; keys ending in "-pepstats" label
# EMBOSS pepstats residue classes.
_human_readable_pepstats = {
    'A_percent-biop': '% Ala',
    'C_percent-biop': '% Cys',
    'D_percent-biop': '% Asp',
    'E_percent-biop': '% Glu',
    'F_percent-biop': '% Phe',
    'G_percent-biop': '% Gly',
    'H_percent-biop': '% His',
    'I_percent-biop': '% Ile',
    'K_percent-biop': '% Lys',
    'L_percent-biop': '% Leu',
    'M_percent-biop': '% Met',
    'N_percent-biop': '% Asn',
    'P_percent-biop': '% Pro',
    'Q_percent-biop': '% Gln',
    'R_percent-biop': '% Arg',
    'S_percent-biop': '% Ser',
    'T_percent-biop': '% Thr',
    'V_percent-biop': '% Val',
    'W_percent-biop': '% Trp',
    'Y_percent-biop': '% Tyr',
    'aromaticity-biop': 'Aromaticity',
    'instability_index-biop': 'Instability index',
    'isoelectric_point-biop': 'Isoelectric point (pI)',
    'percent_helix_naive-biop': '% residues commonly in helix (V, I, Y, F, W, L)',
    # Fixed: this label previously ended with an unbalanced extra ")".
    'percent_turn_naive-biop': '% residues commonly in turn (N, P, G, S)',
    'percent_strand_naive-biop': '% residues commonly in sheet (E, M, A, L)',
    'mol_percent_tiny-pepstats': 'Molar % of tiny residues (A, C, G, S, T)',
    'mol_percent_small-pepstats': 'Molar % of small residues (A, B, C, D, G, N, P, S, T, V)',
    'mol_percent_aliphatic-pepstats': 'Molar % of aliphatic residues (A, I, L, V)',
    'mol_percent_aromatic-pepstats': 'Molar % of aromatic residues (F, H, W, Y)',
    'mol_percent_non-polar-pepstats': 'Molar % of non-polar residues (A, C, F, G, I, L, M, P, V, W, Y)',
    'mol_percent_polar-pepstats': 'Molar % of polar residues (D, E, H, K, N, Q, R, S, T, Z)',
    'mol_percent_charged-pepstats': 'Molar % of charged residues (B, D, E, H, K, R, Z)',
    'mol_percent_basic-pepstats': 'Molar % of basic residues (H, K, R)',
    'mol_percent_acidic-pepstats': 'Molar % of acidic residues (B, D, E, Z)',
}
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see:
    http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence. It is passed through
            ``ssbio.protein.sequence.utils.cast_to_str`` before being analysed.

    Returns:
        dict: Dictionary of sequence properties, keyed as ``<property>-biop``:

            - ``amino_acids_percent-biop``: result of ``get_amino_acids_percent()``
            - ``monoisotopic-biop``: the ``monoisotopic`` attribute of the analysis object
              (NOTE(review): this looks like the constructor flag, not a mass -- confirm)
            - ``molecular_weight-biop``: result of ``molecular_weight()``
            - ``aromaticity-biop``: result of ``aromaticity()``
            - ``instability_index-biop``: Any value above 40 means the protein is unstable
              (has a short half life).
            - ``isoelectric_point-biop``: result of ``isoelectric_point()``
            - ``percent_helix_naive-biop``, ``percent_turn_naive-biop``,
              ``percent_strand_naive-biop``: elements 0, 1 and 2 of
              ``secondary_structure_fraction()`` (helix, turn, sheet).

    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)
    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    # info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    # info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    # info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # The secondary structure fraction is stored as three separate entries. Compute it once
    # and index into the result instead of re-running the calculation for every entry.
    ss_fraction = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = ss_fraction[0]
    info_dict['percent_turn_naive-biop'] = ss_fraction[1]
    info_dict['percent_strand_naive-biop'] = ss_fraction[2]

    return info_dict
def emboss_pepstats_on_fasta(infile, outfile='', outdir='', outext='.pepstats', force_rerun=False):
    """Run the EMBOSS ``pepstats`` program on a FASTA file.

    Args:
        infile: Path to FASTA file
        outfile: Name of output file without extension
        outdir: Path to output directory
        outext: Extension of results file, default is ".pepstats"
        force_rerun: Flag to rerun pepstats

    Returns:
        str: Path to output file, as built by ``ssbio.utils.outfile_maker``.

    """
    # Resolve where the results should be written
    outfile = ssbio.utils.outfile_maker(inname=infile, outname=outfile, outdir=outdir, outext=outext)

    # Both paths are wrapped in double quotes inside the command line
    cmd_string = 'pepstats -sequence="{}" -outfile="{}"'.format(infile, outfile)

    # The runner receives the output path as its outfile_checker together with the rerun flag
    ssbio.utils.command_runner(cmd_string, force_rerun_flag=force_rerun, outfile_checker=outfile, silent=True)

    return outfile
def emboss_pepstats_parser(infile):
    """Get dictionary of pepstats results.

    Only the fixed window of lines 38-46 (0-based) of the file is read. Each non-blank line
    in that window is split on tabs; the first non-empty field is taken as the property name
    and the last non-empty field as a percentage, which is divided by 100.

    Blank lines inside the window (for example when the file is shorter than expected) are
    skipped instead of raising an IndexError.

    Args:
        infile: Path to pepstats outfile

    Returns:
        dict: Parsed information from pepstats, keyed as ``percent_<property>-pepstats``
        (property name lower-cased), with values as fractions.

    Raises:
        ValueError: If the last field of a non-blank line in the window is not a number.

    TODO:
        Only currently parsing the bottom of the file for percentages of properties.

    """
    with open(infile) as f:
        lines = f.read().split('\n')

    info_dict = {}

    for line in lines[38:47]:
        if not line.strip():
            # Nothing to parse on this line
            continue

        # Columns are separated by one or more tabs; drop the empty pieces between them
        fields = [field for field in line.split('\t') if field != '']
        prop = fields[0]
        percent = float(fields[-1]) / float(100)

        info_dict['percent_' + prop.lower() + '-pepstats'] = percent

    return info_dict
def residue_biochemical_definition(res):
    """Return the biochemical classes that a residue belongs to.

    Args:
        res: Residue to look up among the member lists of ``_aa_property_dict_one``
            (one-letter amino acid codes).

    Returns:
        list: Names of every class whose member list contains ``res``, in the iteration
        order of ``_aa_property_dict_one``. Empty if the residue is in none of them.

    """
    return [class_name for class_name, members in _aa_property_dict_one.items() if res in members]
# Residue order shared by the rows and columns of the Grantham table below.
_GRANTHAM_ORDER = 'SRLPTAVGIFYCHQNKDEMW'

# Upper triangle of the symmetric Grantham matrix: row i holds the scores between
# _GRANTHAM_ORDER[i] and each residue that follows it in _GRANTHAM_ORDER.
_GRANTHAM_UPPER = (
    (110, 145, 74, 58, 99, 124, 56, 142, 155, 144, 112, 89, 68, 46, 121, 65, 80, 135, 177),  # S
    (102, 103, 71, 112, 96, 125, 97, 97, 77, 180, 29, 43, 86, 26, 96, 54, 91, 101),  # R
    (98, 92, 96, 32, 138, 5, 22, 36, 198, 99, 113, 153, 107, 172, 138, 15, 61),  # L
    (38, 27, 68, 42, 95, 114, 110, 169, 77, 76, 91, 103, 108, 93, 87, 147),  # P
    (58, 69, 59, 89, 103, 92, 149, 47, 42, 65, 78, 85, 65, 81, 128),  # T
    (64, 60, 94, 113, 112, 195, 86, 91, 111, 106, 126, 107, 84, 148),  # A
    (109, 29, 50, 55, 192, 84, 96, 133, 97, 152, 121, 21, 88),  # V
    (135, 153, 147, 159, 98, 87, 80, 127, 94, 98, 127, 184),  # G
    (21, 33, 198, 94, 109, 149, 102, 168, 134, 10, 61),  # I
    (22, 205, 100, 116, 158, 102, 177, 140, 28, 40),  # F
    (194, 83, 99, 143, 85, 160, 122, 36, 37),  # Y
    (174, 154, 139, 202, 154, 170, 196, 215),  # C
    (24, 68, 32, 81, 40, 87, 115),  # H
    (46, 53, 61, 29, 101, 130),  # Q
    (94, 23, 42, 142, 174),  # N
    (101, 56, 95, 110),  # K
    (45, 160, 181),  # D
    (126, 152),  # E
    (67,),  # M
)

# Residues covered by the matrix. A set is used so that only exact one-letter codes match.
_GRANTHAM_RESIDUES = frozenset(_GRANTHAM_ORDER)

# Score for every unordered pair of distinct residues. Built once at import time, so a lookup
# no longer depends on the order in which the two residues are given.
_GRANTHAM_DISTANCE = {
    frozenset((_GRANTHAM_ORDER[row_idx], _GRANTHAM_ORDER[col_idx])): distance
    for row_idx, row in enumerate(_GRANTHAM_UPPER)
    for col_idx, distance in enumerate(row, start=row_idx + 1)
}


def grantham_score(ref_aa, mut_aa):
    """Look up the Grantham score of an amino acid substitution and classify it.

    Matrix values are taken from:
    https://github.com/ashutoshkpandey/Annotation/blob/master/Grantham_score_calculator.py

    Args:
        ref_aa: Reference residue, one-letter code (upper case, as in the matrix)
        mut_aa: Mutated residue, one-letter code (upper case, as in the matrix)

    Returns:
        tuple: ``(score, label)``. ``score`` is the integer matrix value, the same for
        ``ref_aa -> mut_aa`` and ``mut_aa -> ref_aa``. ``label`` is one of:

            - ``'Unknown'`` (score 0): a residue is not in the matrix; an error is logged
            - ``'Conservative'``: identical residues (score 0), or score <= 50
            - ``'Moderately Conservative'``: 50 < score <= 100
            - ``'Moderately Radical'``: 100 < score <= 150
            - ``'Radical'``: score > 150

    """
    if ref_aa not in _GRANTHAM_RESIDUES or mut_aa not in _GRANTHAM_RESIDUES:
        log.error('{} to {}: a residue is not in the Grantham matrix'.format(ref_aa, mut_aa))
        return 0, 'Unknown'

    if ref_aa == mut_aa:
        return 0, 'Conservative'

    score = _GRANTHAM_DISTANCE[frozenset((ref_aa, mut_aa))]

    if score > 150:
        return score, "Radical"
    if score > 100:
        return score, "Moderately Radical"
    if score > 50:
        return score, "Moderately Conservative"
    return score, "Conservative"
def flexibility_index(aa_one):
    """Return the flexibility parameter of a single amino acid.

    From Smith DK, Radivojac P, Obradovic Z, et al. Improved amino acid flexibility parameters,
    Protein Sci. 2003, 12:1060

    Author: Ke Chen

    Args:
        aa_one: One-letter amino acid code (a key of ``_aa_flexibility_dict_one``)

    Returns:
        float: Flexibility parameter of the residue

    Raises:
        KeyError: If ``aa_one`` is not a key of ``_aa_flexibility_dict_one``

    """
    # Read the module-level table instead of rebuilding an identical private copy per call
    return _aa_flexibility_dict_one[aa_one]