"""
GEMPRO
======
"""
import logging
import os
import os.path as op
import shutil
from copy import copy
import pandas as pd
from Bio import SeqIO
from bioservices import KEGG
from bioservices import UniProt
from cobra.core import DictList
from six.moves.urllib.error import HTTPError
from slugify import Slugify
import ssbio.core.modelpro
import ssbio.databases.kegg
import ssbio.databases.pdb
import ssbio.databases.uniprot
import ssbio.protein.sequence.properties.residues
import ssbio.protein.sequence.properties.tmhmm
import ssbio.protein.sequence.utils.fasta
import ssbio.protein.structure.properties.msms
import ssbio.protein.structure.properties.quality
import ssbio.protein.structure.properties.residues
from ssbio import utils
from ssbio.core.genepro import GenePro
from ssbio.core.modelpro import ModelPro
from ssbio.core.object import Object
from ssbio.databases.kegg import KEGGProp
from ssbio.databases.uniprot import UniProtProp
from ssbio.protein.sequence.properties.scratch import SCRATCH
if utils.is_ipynb():
from tqdm import tqdm_notebook as tqdm
else:
from tqdm import tqdm
custom_slugify = Slugify(safe_chars='-_.')
logging.getLogger("requests").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
log = logging.getLogger(__name__)
date = utils.Date()
bs_unip = UniProt()
bs_kegg = KEGG()
[docs]class GEMPRO(Object):
"""Generic class to represent all information for a GEM-PRO project.
Initialize the GEM-PRO project with a genome-scale model, a list of genes, or a dict of genes and sequences.
Specify the name of your project, along with the root directory where a folder with that name will be created.
Main methods provided are:
#. Automated mapping of sequence IDs
* With KEGG mapper
* With UniProt mapper
* Allowing manual gene ID --> protein sequence entry
* Allowing manual gene ID --> UniProt ID
#. Consolidating sequence IDs and setting a representative sequence
* Currently these are set based on available PDB IDs
#. Mapping of representative sequence --> structures
* With UniProt --> ranking of PDB structures
* BLAST representative sequence --> PDB database
#. Preparation of files for homology modeling (currently for I-TASSER)
* Mapping to existing models
* Preparation for running I-TASSER
* Parsing I-TASSER runs
#. Running QC/QA on structures and setting a representative structure
* Various cutoffs (mutations, insertions, deletions) can be set to filter structures
#. Automation of protein sequence and structure property calculation
#. Creation of Pandas DataFrame summaries directly from downloaded metadata
Args:
gem_name (str): The name of your GEM or just your project in general. This will be the name of the main folder
that is created in root_dir.
root_dir (str): Path to where the folder named after ``gem_name`` will be created. If not provided, directories
will not be created and output directories need to be specified for some steps.
pdb_file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
gem (Model): COBRApy Model object
gem_file_path (str): Path to GEM file
gem_file_type (str): GEM model type - ``sbml`` (or ``xml``), ``mat``, or ``json`` formats
genes_list (list): List of gene IDs that you want to map
genes_and_sequences (dict): Dictionary of gene IDs and their amino acid sequence strings
genome_path (str): FASTA file of all protein sequences
write_protein_fasta_files (bool): If individual protein FASTA files should be written out
description (str): Description string of your project
custom_spont_id (str): ID of spontaneous genes in a COBRA model which will be ignored for analysis
"""
def __init__(self, gem_name, root_dir=None, pdb_file_type='mmtf',
gem=None, gem_file_path=None, gem_file_type=None,
genes_list=None, genes_and_sequences=None, genome_path=None,
write_protein_fasta_files=True,
description=None, custom_spont_id=None):
Object.__init__(self, id=gem_name, description=description)
self.genes = DictList()
"""DictList: All protein-coding genes in this GEM-PRO project"""
self.custom_spont_id = custom_spont_id
"""str: ID of spontaneous genes in a COBRA model which will be ignored for analysis"""
self.pdb_file_type = pdb_file_type
"""str: ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB"""
self.genome_path = genome_path
"""str: Simple link to the filepath of the FASTA file containing all protein sequences"""
self.model = None
"""Model: COBRApy model object"""
# Create directories
self._root_dir = None
if root_dir:
self.root_dir = root_dir
# TODO: add some checks for multiple inputs (only allow one!)
# Load a Model object
if gem:
self.load_cobra_model(gem)
# Or, load a GEM file
elif gem_file_path and gem_file_type:
gem = ssbio.core.modelpro.model_loader(gem_file_path=gem_file_path,
gem_file_type=gem_file_type)
self.load_cobra_model(gem)
# Or, load a list of gene IDs
elif genes_list:
self.add_gene_ids(genes_list)
# Or, load a dictionary of genes and their sequences
elif genes_and_sequences:
self.add_gene_ids(list(genes_and_sequences.keys()))
self.manual_seq_mapping(genes_and_sequences, write_fasta_files=write_protein_fasta_files)
# Or, load the provided FASTA file
elif genome_path:
genes_and_sequences = ssbio.protein.sequence.utils.fasta.load_fasta_file_as_dict_of_seqs(genome_path)
self.add_gene_ids(list(genes_and_sequences.keys()))
self.manual_seq_mapping(genes_and_sequences, write_fasta_files=write_protein_fasta_files)
# If neither a model or genes are input, you can still add IDs with method add_gene_ids later
else:
log.warning('No model or genes input')
log.info('{}: number of genes'.format(len(self.genes)))
@property
def root_dir(self):
"""str: Directory where GEM-PRO project folder named after the attribute ``base_dir`` is located."""
return self._root_dir
@root_dir.setter
def root_dir(self, path):
if not path:
raise ValueError('No path specified')
if not op.exists(path):
raise ValueError('{}: folder does not exist'.format(path))
if self._root_dir:
log.info('Changing root directory of GEM-PRO project "{}" from {} to {}'.format(self.id, self.root_dir, path))
if not op.exists(op.join(path, self.id)):
raise IOError('GEM-PRO project "{}" does not exist in folder {}'.format(self.id, path))
else:
log.info('Creating GEM-PRO project directory in folder {}'.format(path))
self._root_dir = path
for d in [self.base_dir, self.model_dir, self.data_dir, self.genes_dir, self.structures_dir]:
ssbio.utils.make_dir(d)
log.info('{}: GEM-PRO project location'.format(self.base_dir))
# Propagate changes to gene
if hasattr(self, 'genes'):
for g in self.genes:
g.root_dir = self.genes_dir
@property
def base_dir(self):
"""str: GEM-PRO project folder."""
if self.root_dir:
return op.join(self.root_dir, self.id)
else:
return None
@property
def model_dir(self):
"""str: Directory where original GEMs and GEM-related files are stored."""
if self.base_dir:
return op.join(self.base_dir, 'model')
else:
return None
@property
def data_dir(self):
"""str: Directory where all data are stored."""
if self.base_dir:
return op.join(self.base_dir, 'data')
else:
return None
@property
def genes_dir(self):
"""str: Directory where all gene specific information is stored."""
if self.base_dir:
return op.join(self.base_dir, 'genes')
else:
return None
@property
def structures_dir(self):
"""str: Directory where all structures are stored."""
# XTODO: replace storage of structures in individual protein directories with this to reduce redundancy
if self.base_dir:
return op.join(self.base_dir, 'structures')
else:
return None
[docs] def load_cobra_model(self, model):
"""Load a COBRApy Model object into the GEM-PRO project.
Args:
model (Model): COBRApy ``Model`` object
"""
self.model = ModelPro(model)
for g in self.model.genes:
if self.genes_dir:
g.root_dir = self.genes_dir
g.protein.pdb_file_type = self.pdb_file_type
self.genes = self.model.genes
log.info('{}: loaded model'.format(model.id))
log.info('{}: number of reactions'.format(len(self.model.reactions)))
log.info('{}: number of reactions linked to a gene'.format(ssbio.core.modelpro.true_num_reactions(self.model)))
log.info('{}: number of genes (excluding spontaneous)'.format(ssbio.core.modelpro.true_num_genes(self.model,
custom_spont_id=self.custom_spont_id)))
log.info('{}: number of metabolites'.format(len(self.model.metabolites)))
log.warning('IMPORTANT: All Gene objects have been transformed into GenePro '
'objects, and will be for any new ones')
@property
def genes_with_structures(self):
"""DictList: All genes with any mapped protein structures."""
return DictList(x for x in self.genes if x.protein.num_structures > 0)
@property
def genes_with_experimental_structures(self):
"""DictList: All genes that have at least one experimental structure."""
return DictList(x for x in self.genes_with_structures if x.protein.num_structures_experimental > 0)
@property
def genes_with_homology_models(self):
"""DictList: All genes that have at least one homology model."""
return DictList(x for x in self.genes_with_structures if x.protein.num_structures_homology > 0)
@property
def genes_with_a_representative_sequence(self):
"""DictList: All genes with a representative sequence."""
return DictList(x for x in self.genes if x.protein.representative_sequence)
# return DictList(y for y in tmp if y.protein.representative_sequence.seq)
@property
def genes_with_a_representative_structure(self):
"""DictList: All genes with a representative protein structure."""
tmp = DictList(x for x in self.genes if x.protein.representative_structure)
return DictList(y for y in tmp if y.protein.representative_structure.structure_file)
@property
def functional_genes(self):
"""DictList: All genes with a representative protein structure."""
return DictList(x for x in self.genes if x.functional)
# @property
# def genes(self):
# """DictList: All genes excluding spontaneous ones."""
# return ssbio.core.modelpro.filter_out_spontaneous_genes(self._genes, custom_spont_id=self.custom_spont_id)
# @genes.setter
# def genes(self, genes_list):
# """Set the genes attribute to be a DictList of GenePro objects.
#
# A "protein" attribute will be added to each Gene.
#
# Args:
# genes_list: DictList of COBRApy Gene objects, or list of gene IDs
#
# """
#
# if not isinstance(genes_list, DictList):
# tmp_list = []
# for x in list(set(genes_list)):
# x = str(x)
# new_gene = GenePro(id=x, pdb_file_type=self.pdb_file_type, root_dir=self.genes_dir)
# tmp_list.append(new_gene)
# self._genes = DictList(tmp_list)
# else:
# self._genes = genes_list
[docs] def add_gene_ids(self, genes_list):
"""Add gene IDs manually into the GEM-PRO project.
Args:
genes_list (list): List of gene IDs as strings.
"""
orig_num_genes = len(self.genes)
for g in list(set(genes_list)):
if not self.genes.has_id(g):
new_gene = GenePro(id=g, pdb_file_type=self.pdb_file_type, root_dir=self.genes_dir)
if self.model:
self.model.genes.append(new_gene)
else:
self.genes.append(new_gene)
log.info('Added {} genes to GEM-PRO project'.format(len(self.genes)-orig_num_genes))
####################################################################################################################
### SEQUENCE RELATED METHODS ###
log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')
log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')
@property
def df_kegg_metadata(self):
"""DataFrame: Pandas DataFrame of KEGG metadata per protein."""
kegg_pre_df = []
df_cols = ['gene', 'kegg', 'refseq', 'uniprot', 'num_pdbs', 'pdbs', 'seq_len', 'sequence_file', 'metadata_file']
for g in self.genes:
kegg_mappings = g.protein.filter_sequences(KEGGProp)
for kegg_prop in kegg_mappings:
kegg_dict = kegg_prop.get_dict(df_format=True, only_attributes=df_cols)
kegg_dict['gene'] = g.id
kegg_pre_df.append(kegg_dict)
# Save a dataframe of the file mapping info
df = pd.DataFrame.from_records(kegg_pre_df, columns=df_cols).set_index('gene')
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df)
@property
def missing_kegg_mapping(self):
"""list: List of genes with no mapping to KEGG."""
kegg_missing = []
for g in self.genes:
keggs = g.protein.filter_sequences(KEGGProp)
no_sequence_file_available = True
for k in keggs:
if k.sequence_file:
no_sequence_file_available = False
break
if no_sequence_file_available:
kegg_missing.append(g.id)
return list(set(kegg_missing))
log.info('Completed ID mapping --> UniProt. See the "df_uniprot_metadata" attribute for a summary dataframe.')
[docs] def manual_uniprot_mapping(self, gene_to_uniprot_dict, outdir=None, set_as_representative=True):
"""Read a manual dictionary of model gene IDs --> UniProt IDs. By default sets them as representative.
This allows for mapping of the missing genes, or overriding of automatic mappings.
Input a dictionary of::
{
<gene_id1>: <uniprot_id1>,
<gene_id2>: <uniprot_id2>,
}
Args:
gene_to_uniprot_dict: Dictionary of mappings as shown above
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
set_as_representative (bool): If mapped UniProt IDs should be set as representative sequences
"""
for g, u in tqdm(gene_to_uniprot_dict.items()):
g = str(g)
gene = self.genes.get_by_id(g)
try:
uniprot_prop = gene.protein.load_uniprot(uniprot_id=u,
outdir=outdir, download=True,
set_as_representative=set_as_representative)
except HTTPError as e:
log.error('{}, {}: unable to complete web request'.format(g, u))
print(e)
continue
log.info('Completed manual ID mapping --> UniProt. See the "df_uniprot_metadata" attribute for a summary dataframe.')
@property
def df_uniprot_metadata(self):
"""DataFrame: Pandas DataFrame of UniProt metadata per protein."""
uniprot_pre_df = []
df_cols = ['gene', 'uniprot', 'reviewed', 'gene_name', 'kegg', 'refseq', 'num_pdbs', 'pdbs', 'ec_number',
'pfam', 'seq_len', 'description', 'entry_date', 'entry_version', 'seq_date', 'seq_version',
'sequence_file', 'metadata_file']
for g in self.genes:
uniprot_mappings = g.protein.filter_sequences(UniProtProp)
for uniprot_prop in uniprot_mappings:
uniprot_dict = uniprot_prop.get_dict(df_format=True, only_attributes=df_cols)
uniprot_dict['gene'] = g.id
uniprot_pre_df.append(uniprot_dict)
df = pd.DataFrame.from_records(uniprot_pre_df, columns=df_cols).set_index('gene')
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df)
@property
def missing_uniprot_mapping(self):
"""list: List of genes with no mapping to UniProt."""
uniprot_missing = []
for g in self.genes:
ups = g.protein.filter_sequences(UniProtProp)
no_sequence_file_available = True
for u in ups:
if u.sequence_file or u.metadata_file:
no_sequence_file_available = False
break
if no_sequence_file_available:
uniprot_missing.append(g.id)
return list(set(uniprot_missing))
# TODO: should also have a seq --> uniprot id function (has to be 100% match) (also needs organism)
[docs] def manual_seq_mapping(self, gene_to_seq_dict, outdir=None, write_fasta_files=True, set_as_representative=True):
"""Read a manual input dictionary of model gene IDs --> protein sequences. By default sets them as representative.
Args:
gene_to_seq_dict (dict): Mapping of gene IDs to their protein sequence strings
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
write_fasta_files (bool): If individual protein FASTA files should be written out
set_as_representative (bool): If mapped sequences should be set as representative
"""
if outdir:
outdir_set = True
else:
outdir_set = False
# Save the sequence information in individual FASTA files
for g, s in gene_to_seq_dict.items():
gene = self.genes.get_by_id(str(g))
if not outdir_set and write_fasta_files:
outdir = gene.protein.sequence_dir
if not outdir:
raise ValueError('Output directory must be specified')
manual_info = gene.protein.load_manual_sequence(ident=g, seq=s, outdir=outdir,
write_fasta_file=write_fasta_files,
set_as_representative=set_as_representative)
log.debug('{}: loaded manually defined sequence information'.format(g))
log.info('Loaded in {} sequences'.format(len(gene_to_seq_dict)))
[docs] def set_representative_sequence(self, force_rerun=False):
"""Automatically consolidate loaded sequences (manual, UniProt, or KEGG) and set a single representative sequence.
Manually set representative sequences override all existing mappings. UniProt mappings override KEGG mappings
except when KEGG mappings have PDBs associated with them and UniProt doesn't.
Args:
force_rerun (bool): Set to True to recheck stored sequences
"""
# TODO: rethink use of multiple database sources - may lead to inconsistency with genome sources
successfully_mapped_counter = 0
for g in tqdm(self.genes):
repseq = g.protein.set_representative_sequence(force_rerun=force_rerun)
if repseq:
if repseq.sequence_file:
successfully_mapped_counter += 1
log.info('{}/{}: number of genes with a representative sequence'.format(len(self.genes_with_a_representative_sequence),
len(self.genes)))
log.info('See the "df_representative_sequences" attribute for a summary dataframe.')
@property
def df_representative_sequences(self):
"""DataFrame: Pandas DataFrame of representative sequence information per protein."""
seq_mapping_pre_df = []
df_cols = ['gene', 'uniprot', 'kegg', 'num_pdbs', 'pdbs', 'seq_len', 'sequence_file', 'metadata_file']
for g in self.genes_with_a_representative_sequence:
gene_dict = g.protein.representative_sequence.get_dict(df_format=True, only_attributes=df_cols)
gene_dict['gene'] = g.id
seq_mapping_pre_df.append(gene_dict)
df = pd.DataFrame.from_records(seq_mapping_pre_df, columns=df_cols).set_index('gene')
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df)
@property
def missing_representative_sequence(self):
"""list: List of genes with no mapping to a representative sequence."""
return [x.id for x in self.genes if not self.genes_with_a_representative_sequence.has_id(x.id)]
[docs] def write_representative_sequences_file(self, outname, outdir=None, set_ids_from_model=True):
"""Write all the model's sequences as a single FASTA file. By default, sets IDs to model gene IDs.
Args:
outname (str): Name of the output FASTA file without the extension
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
set_ids_from_model (bool): If the gene ID source should be the model gene IDs, not the original sequence ID
"""
if not outdir:
outdir = self.data_dir
if not outdir:
raise ValueError('Output directory must be specified')
outfile = op.join(outdir, outname + '.faa')
tmp = []
for x in self.genes_with_a_representative_sequence:
repseq = x.protein.representative_sequence
copied_seq_record = copy(repseq)
if set_ids_from_model:
copied_seq_record.id = x.id
tmp.append(copied_seq_record)
SeqIO.write(tmp, outfile, "fasta")
log.info('{}: wrote all representative sequences to file'.format(outfile))
self.genome_path = outfile
return self.genome_path
[docs] def get_sequence_properties(self, representatives_only=True):
"""Run Biopython ProteinAnalysis and EMBOSS pepstats to summarize basic statistics of all protein sequences.
Results are stored in the protein's respective SeqProp objects at ``.annotations``
Args:
representative_only (bool): If analysis should only be run on the representative sequences
"""
for g in tqdm(self.genes):
g.protein.get_sequence_properties(representative_only=representatives_only)
[docs] def get_scratch_predictions(self, path_to_scratch, results_dir, scratch_basename='scratch', num_cores=1,
exposed_buried_cutoff=25, custom_gene_mapping=None):
"""Run and parse ``SCRATCH`` results to predict secondary structure and solvent accessibility.
Annotations are stored in the protein's representative sequence at:
* ``.annotations``
* ``.letter_annotations``
Args:
path_to_scratch (str): Path to SCRATCH executable
results_dir (str): Path to SCRATCH results folder, which will have the files (scratch.ss, scratch.ss8,
scratch.acc, scratch.acc20)
scratch_basename (str): Basename of the SCRATCH results ('scratch' is default)
num_cores (int): Number of cores to use to parallelize SCRATCH run
exposed_buried_cutoff (int): Cutoff of exposed/buried for the acc20 predictions
custom_gene_mapping (dict): Default parsing of SCRATCH output files is to look for the model gene IDs. If
your output files contain IDs which differ from the model gene IDs, use this dictionary to map model
gene IDs to result file IDs. Dictionary keys must match model genes.
"""
if not self.genome_path:
# Write all sequences as one file
all_seqs = self.write_representative_sequences_file(outname=self.id)
# Runs SCRATCH or loads existing results in results_dir
scratch = SCRATCH(project_name=scratch_basename, seq_file=self.genome_path)
scratch.run_scratch(path_to_scratch=path_to_scratch, num_cores=num_cores, outdir=results_dir)
sspro_summary = scratch.sspro_summary()
sspro8_summary = scratch.sspro8_summary()
sspro_results = scratch.sspro_results()
sspro8_results = scratch.sspro8_results()
accpro_summary = scratch.accpro_summary()
accpro20_summary = scratch.accpro20_summary(exposed_buried_cutoff)
accpro_results = scratch.accpro_results()
accpro20_results = scratch.accpro20_results()
counter = 0
# Adding the scratch annotations to the representative_sequences letter_annotations
for g in tqdm(self.genes_with_a_representative_sequence):
if custom_gene_mapping:
g_id = custom_gene_mapping[g.id]
else:
g_id = g.id
if g_id in sspro_summary:
# Secondary structure
g.protein.representative_sequence.annotations.update(sspro_summary[g_id])
g.protein.representative_sequence.annotations.update(sspro8_summary[g_id])
g.protein.representative_sequence.letter_annotations['SS-sspro'] = sspro_results[g_id]
g.protein.representative_sequence.letter_annotations['SS-sspro8'] = sspro8_results[g_id]
# Solvent accessibility
g.protein.representative_sequence.annotations.update(accpro_summary[g_id])
g.protein.representative_sequence.annotations.update(accpro20_summary[g_id])
g.protein.representative_sequence.letter_annotations['RSA-accpro'] = accpro_results[g_id]
g.protein.representative_sequence.letter_annotations['RSA-accpro20'] = accpro20_results[g_id]
counter += 1
else:
log.error('{}: missing SCRATCH results'.format(g.id))
log.info('{}/{}: number of genes with SCRATCH predictions loaded'.format(counter, len(self.genes)))
[docs] def get_tmhmm_predictions(self, tmhmm_results, custom_gene_mapping=None):
"""Parse TMHMM results and store in the representative sequences.
This is a basic function to parse pre-run TMHMM results. Run TMHMM from the
web service (http://www.cbs.dtu.dk/services/TMHMM/) by doing the following:
1. Write all representative sequences in the GEM-PRO using the function ``write_representative_sequences_file``
2. Upload the file to http://www.cbs.dtu.dk/services/TMHMM/ and choose "Extensive, no graphics" as the output
3. Copy and paste the results (ignoring the top header and above "HELP with output formats") into a file and save it
4. Run this function on that file
Args:
tmhmm_results (str): Path to TMHMM results (long format)
custom_gene_mapping (dict): Default parsing of TMHMM output is to look for the model gene IDs. If
your output file contains IDs which differ from the model gene IDs, use this dictionary to map model
gene IDs to result file IDs. Dictionary keys must match model genes.
"""
# TODO: refactor to Protein class?
tmhmm_dict = ssbio.protein.sequence.properties.tmhmm.parse_tmhmm_long(tmhmm_results)
counter = 0
for g in tqdm(self.genes_with_a_representative_sequence):
if custom_gene_mapping:
g_id = custom_gene_mapping[g.id]
else:
g_id = g.id
if g_id in tmhmm_dict:
log.debug('{}: loading TMHMM results'.format(g.id))
if not tmhmm_dict[g_id]:
log.error("{}: missing TMHMM results".format(g.id))
g.protein.representative_sequence.annotations['num_tm_helix-tmhmm'] = tmhmm_dict[g_id]['num_tm_helices']
g.protein.representative_sequence.letter_annotations['TM-tmhmm'] = tmhmm_dict[g_id]['sequence']
counter += 1
else:
log.error("{}: missing TMHMM results".format(g.id))
log.info('{}/{}: number of genes with TMHMM predictions loaded'.format(counter, len(self.genes)))
### END SEQUENCE RELATED METHODS ###
####################################################################################################################
####################################################################################################################
### STRUCTURE RELATED METHODS ###
[docs] def blast_seqs_to_pdb(self, seq_ident_cutoff=0, evalue=0.0001, all_genes=False, display_link=False,
outdir=None, force_rerun=False):
"""BLAST each representative protein sequence to the PDB. Saves raw BLAST results (XML files).
Args:
seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal,
0.0001 is stringent (default).
all_genes (bool): If all genes should be BLASTed, or only those without any structures currently mapped
display_link (bool, optional): Set to True if links to the HTML results should be displayed
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False
"""
counter = 0
for g in tqdm(self.genes_with_a_representative_sequence):
# If all_genes=False, BLAST only genes without a uniprot -> pdb mapping
if g.protein.num_structures_experimental > 0 and not all_genes and not force_rerun:
log.debug('{}: skipping BLAST, {} experimental structures already mapped '
'and all_genes flag is False'.format(g.id,
g.protein.num_structures_experimental))
continue
# BLAST the sequence to the PDB
new_pdbs = g.protein.blast_representative_sequence_to_pdb(seq_ident_cutoff=seq_ident_cutoff,
evalue=evalue,
display_link=display_link,
outdir=outdir,
force_rerun=force_rerun)
if new_pdbs:
counter += 1
log.debug('{}: {} PDBs BLASTed'.format(g.id, len(new_pdbs)))
else:
log.debug('{}: no BLAST results'.format(g.id))
log.info('Completed sequence --> PDB BLAST. See the "df_pdb_blast" attribute for a summary dataframe.')
log.info('{}: number of genes with additional structures added from BLAST'.format(counter))
@property
def df_pdb_blast(self):
"""DataFrame: Get a dataframe of PDB BLAST results"""
df = pd.DataFrame()
for g in self.genes_with_experimental_structures:
protein_df = g.protein.df_pdb_blast.copy().reset_index()
if not protein_df.empty:
protein_df['gene'] = g.id
df = df.append(protein_df)
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df.set_index('gene'))
[docs] def map_uniprot_to_pdb(self, seq_ident_cutoff=0.0, outdir=None, force_rerun=False):
"""Map all representative sequences' UniProt ID to PDB IDs using the PDBe "Best Structures" API.
Will save a JSON file of the results to each protein's ``sequences`` folder.
The "Best structures" API is available at https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
The list of PDB structures mapping to a UniProt accession sorted by coverage of the protein and,
if the same, resolution.
Args:
seq_ident_cutoff (float): Sequence identity cutoff in decimal form
outdir (str): Output directory to cache JSON results of search
force_rerun (bool): Force re-downloading of JSON results if they already exist
Returns:
list: A rank-ordered list of PDBProp objects that map to the UniProt ID
"""
# First get all UniProt IDs and check if they have PDBs
all_representative_uniprots = []
for g in self.genes_with_a_representative_sequence:
uniprot_id = g.protein.representative_sequence.uniprot
if uniprot_id:
# TODO: add warning or something for isoform ids?
if '-' in uniprot_id:
uniprot_id = uniprot_id.split('-')[0]
all_representative_uniprots.append(uniprot_id)
log.info('Mapping UniProt IDs --> PDB IDs...')
uniprots_to_pdbs = bs_unip.mapping(fr='ACC', to='PDB_ID', query=all_representative_uniprots)
counter = 0
# Now run the best_structures API for all genes
for g in tqdm(self.genes_with_a_representative_sequence):
uniprot_id = g.protein.representative_sequence.uniprot
if uniprot_id:
if '-' in uniprot_id:
uniprot_id = uniprot_id.split('-')[0]
if uniprot_id in uniprots_to_pdbs:
best_structures = g.protein.map_uniprot_to_pdb(seq_ident_cutoff=seq_ident_cutoff, outdir=outdir, force_rerun=force_rerun)
if best_structures:
counter += 1
log.debug('{}: {} PDBs mapped'.format(g.id, len(best_structures)))
else:
log.debug('{}, {}: no PDBs available'.format(g.id, uniprot_id))
log.info('{}/{}: number of genes with at least one experimental structure'.format(len(self.genes_with_experimental_structures),
len(self.genes)))
log.info('Completed UniProt --> best PDB mapping. See the "df_pdb_ranking" attribute for a summary dataframe.')
@property
def df_pdb_ranking(self):
"""DataFrame: Get a dataframe of UniProt -> best structure in PDB results"""
df = pd.DataFrame()
for g in self.genes_with_experimental_structures:
protein_df = g.protein.df_pdb_ranking.copy().reset_index()
if not protein_df.empty:
protein_df['gene'] = g.id
df = df.append(protein_df)
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df.set_index('gene'))
@property
def missing_pdb_structures(self):
"""list: List of genes with no mapping to any experimental PDB structure."""
return [x.id for x in self.genes if not self.genes_with_experimental_structures.has_id(x.id)]
[docs] def get_manual_homology_models(self, input_dict, outdir=None, clean=True, force_rerun=False):
"""Copy homology models to the GEM-PRO project.
Requires an input of a dictionary formatted like so::
{
model_gene: {
homology_model_id1: {
'model_file': '/path/to/homology/model.pdb',
'file_type': 'pdb'
'additional_info': info_value
},
homology_model_id2: {
'model_file': '/path/to/homology/model.pdb'
'file_type': 'pdb'
}
}
}
Args:
input_dict (dict): Dictionary of dictionaries of gene names to homology model IDs and other information
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
clean (bool): If homology files should be cleaned and saved as a new PDB file
force_rerun (bool): If homology files should be copied again even if they exist in the GEM-PRO directory
"""
if outdir:
outdir_set = True
else:
outdir_set = False
counter = 0
for g in tqdm(self.genes):
if g.id not in input_dict:
continue
if not outdir_set:
outdir = g.protein.structure_dir
if not outdir:
raise ValueError('Output directory must be specified')
for hid, hdict in input_dict[g.id].items():
if 'model_file' not in hdict or 'file_type' not in hdict:
raise KeyError('"model_file" and "file_type" must be keys in the manual input dictionary.')
new_homology = g.protein.load_pdb(pdb_id=hid, pdb_file=hdict['model_file'],
file_type=hdict['file_type'], is_experimental=False)
if clean:
new_homology.load_structure_path(new_homology.clean_structure(outdir=outdir, force_rerun=force_rerun),
hdict['file_type'])
else:
copy_to = op.join(outdir, op.basename(hdict['model_file']))
if ssbio.utils.force_rerun(force_rerun, copy_to):
# Just copy the file to the structure directory and store the file name
log.debug('{}: copying model from original directory to GEM-PRO directory'.format(op.basename(hdict['model_file'])))
shutil.copy2(hdict['model_file'], outdir)
new_homology.load_structure_path(copy_to, hdict['file_type'])
else:
log.debug('{}: homology model already copied to directory'.format(copy_to))
new_homology.load_structure_path(copy_to, hdict['file_type'])
# TODO: need to better handle other info in the provided dictionary, if any
new_homology.update(hdict)
log.debug('{}: updated homology model information and copied model file.'.format(g.id))
counter += 1
log.info('Updated homology model information for {} genes.'.format(counter))
[docs] def get_itasser_models(self, homology_raw_dir, custom_itasser_name_mapping=None, outdir=None, force_rerun=False):
"""Copy generated I-TASSER models from a directory to the GEM-PRO directory.
Args:
homology_raw_dir (str): Root directory of I-TASSER folders.
custom_itasser_name_mapping (dict): Use this if your I-TASSER folder names differ from your model gene names.
Input a dict of {model_gene: ITASSER_folder}.
outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
were not created initially
force_rerun (bool): If homology files should be copied again even if they exist in the GEM-PRO directory
"""
counter = 0
for g in tqdm(self.genes):
if custom_itasser_name_mapping and g.id in custom_itasser_name_mapping:
hom_id = custom_itasser_name_mapping[g.id]
if not op.exists(op.join(homology_raw_dir, hom_id)):
hom_id = g.id
else:
hom_id = g.id
# The name of the actual pdb file will be $GENEID_model1.pdb
new_itasser_name = hom_id + '_model1'
orig_itasser_dir = op.join(homology_raw_dir, hom_id)
try:
itasser_prop = g.protein.load_itasser_folder(ident=hom_id, itasser_folder=orig_itasser_dir,
organize=True, outdir=outdir,
organize_name=new_itasser_name,
force_rerun=force_rerun)
except OSError:
log.debug('{}: homology model folder unavailable'.format(g.id))
continue
except IOError:
log.debug('{}: homology model unavailable'.format(g.id))
continue
if itasser_prop.structure_file:
counter += 1
else:
log.debug('{}: homology model file unavailable, perhaps modelling did not finish'.format(g.id))
log.info('Completed copying of {} I-TASSER models to GEM-PRO directory. See the "df_homology_models" attribute for a summary dataframe.'.format(counter))
@property
def df_homology_models(self):
"""DataFrame: Get a dataframe of I-TASSER homology model results"""
df = pd.DataFrame()
for g in self.genes_with_homology_models:
protein_df = g.protein.df_homology_models.copy().reset_index()
if not protein_df.empty:
protein_df['gene'] = g.id
df = df.append(protein_df)
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df.set_index('gene'))
@property
def missing_homology_models(self):
"""list: List of genes with no mapping to any homology models."""
return [x.id for x in self.genes if not self.genes_with_homology_models.has_id(x.id)]
[docs] def set_representative_structure(self, seq_outdir=None, struct_outdir=None, pdb_file_type=None,
engine='needle', always_use_homology=False, rez_cutoff=0.0,
seq_ident_cutoff=0.5, allow_missing_on_termini=0.2,
allow_mutants=True, allow_deletions=False,
allow_insertions=False, allow_unresolved=True, skip_large_structures=False,
clean=True, force_rerun=False):
"""Set all representative structure for proteins from a structure in the structures attribute.
Each gene can have a combination of the following, which will be analyzed to set a representative structure.
* Homology model(s)
* Ranked PDBs
* BLASTed PDBs
If the ``always_use_homology`` flag is true, homology models are always set as representative when they exist.
If there are multiple homology models, we rank by the percent sequence coverage.
Args:
seq_outdir (str): Path to output directory of sequence alignment files, must be set if GEM-PRO directories
were not created initially
struct_outdir (str): Path to output directory of structure files, must be set if GEM-PRO directories
were not created initially
pdb_file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
engine (str): ``biopython`` or ``needle`` - which pairwise alignment program to use.
``needle`` is the standard EMBOSS tool to run pairwise alignments.
``biopython`` is Biopython's implementation of needle. Results can differ!
always_use_homology (bool): If homology models should always be set as the representative structure
rez_cutoff (float): Resolution cutoff, in Angstroms (only if experimental structure)
seq_ident_cutoff (float): Percent sequence identity cutoff, in decimal form
allow_missing_on_termini (float): Percentage of the total length of the reference sequence which will be ignored
when checking for modifications. Example: if 0.1, and reference sequence is 100 AA, then only residues
5 to 95 will be checked for modifications.
allow_mutants (bool): If mutations should be allowed or checked for
allow_deletions (bool): If deletions should be allowed or checked for
allow_insertions (bool): If insertions should be allowed or checked for
allow_unresolved (bool): If unresolved residues should be allowed or checked for
skip_large_structures (bool): Default False -- currently, large structures can't be saved as a PDB file even
if you just want to save a single chain, so Biopython will throw an error when trying to do so. As an
alternative, if a large structure is selected as representative, the pipeline will currently point to it
and not clean it. If you don't want this to happen, set this to true.
clean (bool): If structures should be cleaned
force_rerun (bool): If sequence to structure alignment should be rerun
Todo:
- Remedy large structure representative setting
"""
for g in tqdm(self.genes):
repstruct = g.protein.set_representative_structure(seq_outdir=seq_outdir,
struct_outdir=struct_outdir,
pdb_file_type=pdb_file_type,
engine=engine,
rez_cutoff=rez_cutoff,
seq_ident_cutoff=seq_ident_cutoff,
always_use_homology=always_use_homology,
allow_missing_on_termini=allow_missing_on_termini,
allow_mutants=allow_mutants,
allow_deletions=allow_deletions,
allow_insertions=allow_insertions,
allow_unresolved=allow_unresolved,
skip_large_structures=skip_large_structures,
clean=clean,
force_rerun=force_rerun)
log.info('{}/{}: number of genes with a representative structure'.format(len(self.genes_with_a_representative_structure),
len(self.genes)))
log.info('See the "df_representative_structures" attribute for a summary dataframe.')
def set_representative_structure_parallelize(self, sc, seq_outdir=None, struct_outdir=None, pdb_file_type=None,
engine='needle', always_use_homology=False, rez_cutoff=0.0,
seq_ident_cutoff=0.5, allow_missing_on_termini=0.2,
allow_mutants=True, allow_deletions=False,
allow_insertions=False, allow_unresolved=True, skip_large_structures=False,
clean=True, force_rerun=False):
def set_repstruct(g, seq_outdir=seq_outdir, struct_outdir=struct_outdir,
pdb_file_type=pdb_file_type, engine=engine,
rez_cutoff=rez_cutoff, seq_ident_cutoff=seq_ident_cutoff,
always_use_homology=always_use_homology,
allow_missing_on_termini=allow_missing_on_termini,
allow_mutants=allow_mutants, allow_deletions=allow_deletions,
allow_insertions=allow_insertions, allow_unresolved=allow_unresolved,
skip_large_structures=skip_large_structures,
clean=clean, force_rerun=force_rerun):
g.protein.set_representative_structure(seq_outdir=seq_outdir, struct_outdir=struct_outdir,
pdb_file_type=pdb_file_type, engine=engine,
rez_cutoff=rez_cutoff, seq_ident_cutoff=seq_ident_cutoff,
always_use_homology=always_use_homology,
allow_missing_on_termini=allow_missing_on_termini,
allow_mutants=allow_mutants, allow_deletions=allow_deletions,
allow_insertions=allow_insertions, allow_unresolved=allow_unresolved,
skip_large_structures=skip_large_structures,
clean=clean, force_rerun=force_rerun)
return g
genes_rdd = sc.parallelize(self.genes)
result = genes_rdd.map(set_repstruct).collect()
# Copy the results over to the GEM-PRO object's genes using the GenePro function "copy_modified_gene"
# Also count how many genes mapped to KEGG
for modified_g in result:
original_gene = self.genes.get_by_id(modified_g.id)
original_gene.copy_modified_gene(modified_g)
log.info('{}/{}: number of genes with a representative structure'.format(len(self.genes_with_a_representative_structure),
len(self.genes)))
log.info('See the "df_representative_structures" attribute for a summary dataframe.')
@property
def df_representative_structures(self):
"""DataFrame: Get a dataframe of representative protein structure information."""
rep_struct_pre_df = []
df_cols = ['gene', 'id', 'is_experimental', 'file_type', 'structure_file']
for g in self.genes_with_a_representative_structure:
repdict = g.protein.representative_structure.get_dict(df_format=True, only_attributes=df_cols)
repdict['gene'] = g.id
rep_struct_pre_df.append(repdict)
df = pd.DataFrame.from_records(rep_struct_pre_df, columns=df_cols).set_index('gene')
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df)
@property
def missing_representative_structure(self):
"""list: List of genes with no mapping to a representative structure."""
return [x.id for x in self.genes if not self.genes_with_a_representative_structure.has_id(x.id)]
[docs] def prep_itasser_modeling(self, itasser_installation, itlib_folder, runtype, create_in_dir=None,
execute_from_dir=None, all_genes=False, print_exec=False, **kwargs):
"""Prepare to run I-TASSER homology modeling for genes without structures, or all genes.
Args:
itasser_installation (str): Path to I-TASSER folder, i.e. ``~/software/I-TASSER4.4``
itlib_folder (str): Path to ITLIB folder, i.e. ``~/software/ITLIB``
runtype: How you will be running I-TASSER - local, slurm, or torque
create_in_dir (str): Local directory where folders will be created
execute_from_dir (str): Optional path to execution directory - use this if you are copying the homology
models to another location such as a supercomputer for running
all_genes (bool): If all genes should be prepped, or only those without any mapped structures
print_exec (bool): If the execution statement should be printed to run modelling
Todo:
* Document kwargs - extra options for I-TASSER, SLURM or Torque execution
* Allow modeling of any sequence in sequences attribute, select by ID or provide SeqProp?
"""
if not create_in_dir:
if not self.data_dir:
raise ValueError('Output directory must be specified')
self.homology_models_dir = op.join(self.data_dir, 'homology_models')
else:
self.homology_models_dir = create_in_dir
ssbio.utils.make_dir(self.homology_models_dir)
if not execute_from_dir:
execute_from_dir = self.homology_models_dir
counter = 0
for g in self.genes_with_a_representative_sequence:
repstruct = g.protein.representative_structure
if repstruct and not all_genes:
log.debug('{}: representative structure set, skipping homology modeling'.format(g.id))
continue
g.protein.prep_itasser_modeling(itasser_installation=itasser_installation,
itlib_folder=itlib_folder, runtype=runtype,
create_in_dir=self.homology_models_dir,
execute_from_dir=execute_from_dir,
print_exec=print_exec, **kwargs)
counter += 1
log.info('Prepared I-TASSER modeling folders for {} genes in folder {}'.format(counter,
self.homology_models_dir))
log.info('Saved {} structures total'.format(counter))
def download_all_pdbs(self, outdir=None, pdb_file_type=None, load_metadata=False, force_rerun=False):
if not pdb_file_type:
pdb_file_type = self.pdb_file_type
all_structures = []
for g in tqdm(self.genes):
pdbs = g.protein.download_all_pdbs(outdir=outdir, pdb_file_type=pdb_file_type,
load_metadata=load_metadata, force_rerun=force_rerun)
all_structures.extend(pdbs)
return list(set(all_structures))
@property
def df_pdb_metadata(self):
"""DataFrame: Get a dataframe of PDB metadata (PDBs have to be downloaded first)."""
df = pd.DataFrame()
for g in self.genes_with_experimental_structures:
# Get per protein DataFrame
protein_df = g.protein.df_pdb_metadata.copy().reset_index()
protein_df['gene'] = g.id
df = df.append(protein_df)
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df.set_index('gene'))
@property
def df_proteins(self):
"""DataFrame: Get a summary dataframe of all proteins in the project."""
pre_df = []
df_cols = ['gene', 'id', 'sequences', 'num_sequences', 'representative_sequence',
'repseq_gene_name', 'repseq_uniprot', 'repseq_description',
'num_structures', 'experimental_structures', 'num_experimental_structures',
'homology_models', 'num_homology_models',
'representative_structure', 'representative_chain', 'representative_chain_seq_coverage',
'repstruct_description', 'repstruct_resolution',
'num_sequence_alignments', 'num_structure_alignments']
for g in self.genes:
# Get per protein DataFrame
protein_dict = g.protein.protein_statistics
protein_dict['gene'] = g.id
pre_df.append(protein_dict)
df = pd.DataFrame.from_records(pre_df, columns=df_cols).set_index('gene')
if df.empty:
log.warning('Empty dataframe')
return df
else:
return ssbio.utils.clean_df(df)
[docs] def get_dssp_annotations(self, representatives_only=True, force_rerun=False):
"""Run DSSP on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-dssp']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
for g in tqdm(self.genes):
g.protein.get_dssp_annotations(representative_only=representatives_only, force_rerun=force_rerun)
[docs] def get_dssp_annotations_parallelize(self, sc, representatives_only=True, force_rerun=False):
"""Run DSSP on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-dssp']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
genes_rdd = sc.parallelize(self.genes)
def get_dssp_annotation(g):
g.protein.get_dssp_annotations(representative_only=representatives_only, force_rerun=force_rerun)
return g
result = genes_rdd.map(get_dssp_annotation).collect()
for modified_g in result:
original_gene = self.genes.get_by_id(modified_g.id)
original_gene.copy_modified_gene(modified_g)
[docs] def get_msms_annotations(self, representatives_only=True, force_rerun=False):
"""Run MSMS on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-msms']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
for g in tqdm(self.genes):
g.protein.get_msms_annotations(representative_only=representatives_only, force_rerun=force_rerun)
[docs] def get_msms_annotations_parallelize(self, sc, representatives_only=True, force_rerun=False):
"""Run MSMS on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-msms']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
genes_rdd = sc.parallelize(self.genes)
def get_msms_annotation(g):
g.protein.get_msms_annotations(representative_only=representatives_only, force_rerun=force_rerun)
return g
result = genes_rdd.map(get_msms_annotation).collect()
for modified_g in result:
original_gene = self.genes.get_by_id(modified_g.id)
original_gene.copy_modified_gene(modified_g)
[docs] def get_freesasa_annotations(self, include_hetatms=False, representatives_only=True, force_rerun=False):
"""Run freesasa on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-freesasa']``
Args:
include_hetatms (bool): If HETATMs should be included in calculations. Defaults to ``False``.
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
for g in tqdm(self.genes):
g.protein.get_freesasa_annotations(include_hetatms=include_hetatms,
representative_only=representatives_only,
force_rerun=force_rerun)
[docs] def get_freesasa_annotations_parallelize(self, sc, include_hetatms=False,
representatives_only=True, force_rerun=False):
"""Run freesasa on structures and store calculations.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.letter_annotations['*-freesasa']``
Args:
include_hetatms (bool): If HETATMs should be included in calculations. Defaults to ``False``.
representative_only (bool): If analysis should only be run on the representative structure
force_rerun (bool): If calculations should be rerun even if an output file exists
"""
genes_rdd = sc.parallelize(self.genes)
def get_freesasa_annotation(g):
g.protein.get_freesasa_annotations(include_hetatms=include_hetatms,
representative_only=representatives_only,
force_rerun=force_rerun)
return g
result = genes_rdd.map(get_freesasa_annotation).collect()
for modified_g in result:
original_gene = self.genes.get_by_id(modified_g.id)
original_gene.copy_modified_gene(modified_g)
[docs] def find_disulfide_bridges(self, representatives_only=True):
"""Run Biopython's disulfide bridge finder and store found bridges.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.annotations['SSBOND-biopython']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
"""
for g in tqdm(self.genes):
g.protein.find_disulfide_bridges(representative_only=representatives_only)
[docs] def find_disulfide_bridges_parallelize(self, sc, representatives_only=True):
"""Run Biopython's disulfide bridge finder and store found bridges.
Annotations are stored in the protein structure's chain sequence at:
``<chain_prop>.seq_record.annotations['SSBOND-biopython']``
Args:
representative_only (bool): If analysis should only be run on the representative structure
"""
genes_rdd = sc.parallelize(self.genes)
def find_disulfide_bridges(g):
g.protein.find_disulfide_bridges(representative_only=representatives_only)
return g
result = genes_rdd.map(find_disulfide_bridges).collect()
for modified_g in result:
original_gene = self.genes.get_by_id(modified_g.id)
original_gene.copy_modified_gene(modified_g)
### END STRUCTURE RELATED METHODS ###
####################################################################################################################
def __json_encode__(self):
to_return = {}
# Don't save properties, methods in the JSON
for x in [a for a in dir(self) if not a.startswith('__') and not isinstance(getattr(type(self), a, None), property) and not callable(getattr(self,a))]:
if self.model and x == 'genes':
continue
to_return.update({x: getattr(self, x)})
return to_return
def __json_decode__(self, **attrs):
for k, v in attrs.items():
setattr(self, k, v)
if not self.model:
self.genes = DictList(self.genes)
else:
self.genes = self.model.genes