Source code for ssbio.pipeline.gempro

"""
GEMPRO
======
"""

import logging
import os
import os.path as op
import shutil
from copy import copy

import pandas as pd
from Bio import SeqIO
from bioservices import KEGG
from bioservices import UniProt
from cobra.core import DictList
from six.moves.urllib.error import HTTPError
from slugify import Slugify

import ssbio.core.modelpro
import ssbio.databases.kegg
import ssbio.databases.pdb
import ssbio.databases.uniprot
import ssbio.protein.sequence.properties.residues
import ssbio.protein.sequence.properties.tmhmm
import ssbio.protein.sequence.utils.fasta
import ssbio.protein.structure.properties.msms
import ssbio.protein.structure.properties.quality
import ssbio.protein.structure.properties.residues
from ssbio import utils
from ssbio.core.genepro import GenePro
from ssbio.core.modelpro import ModelPro
from ssbio.core.object import Object
from ssbio.databases.kegg import KEGGProp
from ssbio.databases.uniprot import UniProtProp
from ssbio.protein.sequence.properties.scratch import SCRATCH

if utils.is_ipynb():
    from tqdm import tqdm_notebook as tqdm
else:
    from tqdm import tqdm

custom_slugify = Slugify(safe_chars='-_.')
logging.getLogger("requests").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
log = logging.getLogger(__name__)
date = utils.Date()
bs_unip = UniProt()
bs_kegg = KEGG()


[docs]class GEMPRO(Object):

    """Generic class to represent all information for a GEM-PRO project.

    Initialize the GEM-PRO project with a genome-scale model, a list of genes, or a dict of genes and sequences.
    Specify the name of your project, along with the root directory where a folder with that name will be created.

    Main methods provided are:

    #. Automated mapping of sequence IDs

        * With KEGG mapper
        * With UniProt mapper
        * Allowing manual gene ID --> protein sequence entry
        * Allowing manual gene ID --> UniProt ID

    #. Consolidating sequence IDs and setting a representative sequence

        * Currently these are set based on available PDB IDs

    #. Mapping of representative sequence --> structures

        * With UniProt --> ranking of PDB structures
        * BLAST representative sequence --> PDB database

    #. Preparation of files for homology modeling (currently for I-TASSER)

        * Mapping to existing models
        * Preparation for running I-TASSER
        * Parsing I-TASSER runs

    #. Running QC/QA on structures and setting a representative structure

        * Various cutoffs (mutations, insertions, deletions) can be set to filter structures

    #. Automation of protein sequence and structure property calculation

    #. Creation of Pandas DataFrame summaries directly from downloaded metadata

    Args:
        gem_name (str): The name of your GEM or just your project in general. This will be the name of the main folder
            that is created in root_dir.
        root_dir (str): Path to where the folder named after ``gem_name`` will be created. If not provided, directories
            will not be created and output directories need to be specified for some steps.
        pdb_file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
        gem (Model): COBRApy Model object
        gem_file_path (str): Path to GEM file
        gem_file_type (str): GEM model type - ``sbml`` (or ``xml``), ``mat``, or ``json`` formats
        genes_list (list): List of gene IDs that you want to map
        genes_and_sequences (dict): Dictionary of gene IDs and their amino acid sequence strings
        genome_path (str): FASTA file of all protein sequences
        write_protein_fasta_files (bool): If individual protein FASTA files should be written out
        description (str): Description string of your project
        custom_spont_id (str): ID of spontaneous genes in a COBRA model which will be ignored for analysis

    """

    def __init__(self, gem_name, root_dir=None, pdb_file_type='mmtf',
                 gem=None, gem_file_path=None, gem_file_type=None,
                 genes_list=None, genes_and_sequences=None, genome_path=None,
                 write_protein_fasta_files=True,
                 description=None, custom_spont_id=None):
        Object.__init__(self, id=gem_name, description=description)

        self.genes = DictList()
        """DictList: All protein-coding genes in this GEM-PRO project"""
        self.custom_spont_id = custom_spont_id
        """str: ID of spontaneous genes in a COBRA model which will be ignored for analysis"""
        self.pdb_file_type = pdb_file_type
        """str: ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB"""
        self.genome_path = genome_path
        """str: Simple link to the filepath of the FASTA file containing all protein sequences"""
        self.model = None
        """Model: COBRApy model object"""

        # Create directories
        self._root_dir = None
        if root_dir:
            self.root_dir = root_dir

        # TODO: add some checks for multiple inputs (only allow one!)

        # Load a Model object
        if gem:
            self.load_cobra_model(gem)

        # Or, load a GEM file
        elif gem_file_path and gem_file_type:
            gem = ssbio.core.modelpro.model_loader(gem_file_path=gem_file_path,
                                                   gem_file_type=gem_file_type)
            self.load_cobra_model(gem)

        # Or, load a list of gene IDs
        elif genes_list:
            self.add_gene_ids(genes_list)

        # Or, load a dictionary of genes and their sequences
        elif genes_and_sequences:
            self.add_gene_ids(list(genes_and_sequences.keys()))
            self.manual_seq_mapping(genes_and_sequences, write_fasta_files=write_protein_fasta_files)

        # Or, load the provided FASTA file
        elif genome_path:
            genes_and_sequences = ssbio.protein.sequence.utils.fasta.load_fasta_file_as_dict_of_seqs(genome_path)
            self.add_gene_ids(list(genes_and_sequences.keys()))
            self.manual_seq_mapping(genes_and_sequences, write_fasta_files=write_protein_fasta_files)

        # If neither a model or genes are input, you can still add IDs with method add_gene_ids later
        else:
            log.warning('No model or genes input')

        log.info('{}: number of genes'.format(len(self.genes)))

    @property
    def root_dir(self):
        """str: Directory where GEM-PRO project folder named after the attribute ``base_dir`` is located."""
        return self._root_dir

    @root_dir.setter
    def root_dir(self, path):
        if not path:
            raise ValueError('No path specified')

        if not op.exists(path):
            raise ValueError('{}: folder does not exist'.format(path))

        if self._root_dir:
            log.info('Changing root directory of GEM-PRO project "{}" from {} to {}'.format(self.id, self.root_dir, path))

            if not op.exists(op.join(path, self.id)):
                raise IOError('GEM-PRO project "{}" does not exist in folder {}'.format(self.id, path))
        else:
            log.info('Creating GEM-PRO project directory in folder {}'.format(path))

        self._root_dir = path

        for d in [self.base_dir, self.model_dir, self.data_dir, self.genes_dir, self.structures_dir]:
            ssbio.utils.make_dir(d)

        log.info('{}: GEM-PRO project location'.format(self.base_dir))

        # Propagate changes to gene
        if hasattr(self, 'genes'):
            for g in self.genes:
                g.root_dir = self.genes_dir

    @property
    def base_dir(self):
        """str: GEM-PRO project folder."""
        if self.root_dir:
            return op.join(self.root_dir, self.id)
        else:
            return None

    @property
    def model_dir(self):
        """str: Directory where original GEMs and GEM-related files are stored."""
        if self.base_dir:
            return op.join(self.base_dir, 'model')
        else:
            return None

    @property
    def data_dir(self):
        """str: Directory where all data are stored."""
        if self.base_dir:
            return op.join(self.base_dir, 'data')
        else:
            return None

    @property
    def genes_dir(self):
        """str: Directory where all gene specific information is stored."""
        if self.base_dir:
            return op.join(self.base_dir, 'genes')
        else:
            return None

    @property
    def structures_dir(self):
        """str: Directory where all structures are stored."""
        # XTODO: replace storage of structures in individual protein directories with this to reduce redundancy
        if self.base_dir:
            return op.join(self.base_dir, 'structures')
        else:
            return None

[docs]    def load_cobra_model(self, model):
        """Load a COBRApy Model object into the GEM-PRO project.

        Args:
            model (Model): COBRApy ``Model`` object

        """
        self.model = ModelPro(model)
        for g in self.model.genes:
            if self.genes_dir:
                g.root_dir = self.genes_dir
            g.protein.pdb_file_type = self.pdb_file_type
        self.genes = self.model.genes

        log.info('{}: loaded model'.format(model.id))
        log.info('{}: number of reactions'.format(len(self.model.reactions)))
        log.info('{}: number of reactions linked to a gene'.format(ssbio.core.modelpro.true_num_reactions(self.model)))
        log.info('{}: number of genes (excluding spontaneous)'.format(ssbio.core.modelpro.true_num_genes(self.model,
                                                                                                         custom_spont_id=self.custom_spont_id)))
        log.info('{}: number of metabolites'.format(len(self.model.metabolites)))
        log.warning('IMPORTANT: All Gene objects have been transformed into GenePro '
                    'objects, and will be for any new ones')

    @property
    def genes_with_structures(self):
        """DictList: All genes with any mapped protein structures."""
        return DictList(x for x in self.genes if x.protein.num_structures > 0)

    @property
    def genes_with_experimental_structures(self):
        """DictList: All genes that have at least one experimental structure."""
        return DictList(x for x in self.genes_with_structures if x.protein.num_structures_experimental > 0)

    @property
    def genes_with_homology_models(self):
        """DictList: All genes that have at least one homology model."""
        return DictList(x for x in self.genes_with_structures if x.protein.num_structures_homology > 0)

    @property
    def genes_with_a_representative_sequence(self):
        """DictList: All genes with a representative sequence."""
        return DictList(x for x in self.genes if x.protein.representative_sequence)
        # return DictList(y for y in tmp if y.protein.representative_sequence.seq)

    @property
    def genes_with_a_representative_structure(self):
        """DictList: All genes with a representative protein structure."""
        tmp = DictList(x for x in self.genes if x.protein.representative_structure)
        return DictList(y for y in tmp if y.protein.representative_structure.structure_file)

    @property
    def functional_genes(self):
        """DictList: All genes with a representative protein structure."""
        return DictList(x for x in self.genes if x.functional)

    # @property
    # def genes(self):
    #     """DictList: All genes excluding spontaneous ones."""
    #     return ssbio.core.modelpro.filter_out_spontaneous_genes(self._genes, custom_spont_id=self.custom_spont_id)

    # @genes.setter
    # def genes(self, genes_list):
    #     """Set the genes attribute to be a DictList of GenePro objects.
    #
    #     A "protein" attribute will be added to each Gene.
    #
    #     Args:
    #         genes_list: DictList of COBRApy Gene objects, or list of gene IDs
    #
    #     """
    #
    #     if not isinstance(genes_list, DictList):
    #         tmp_list = []
    #         for x in list(set(genes_list)):
    #             x = str(x)
    #             new_gene = GenePro(id=x, pdb_file_type=self.pdb_file_type, root_dir=self.genes_dir)
    #             tmp_list.append(new_gene)
    #         self._genes = DictList(tmp_list)
    #     else:
    #         self._genes = genes_list

[docs]    def add_gene_ids(self, genes_list):
        """Add gene IDs manually into the GEM-PRO project.

        Args:
            genes_list (list): List of gene IDs as strings.

        """
        orig_num_genes = len(self.genes)

        for g in list(set(genes_list)):
            if not self.genes.has_id(g):
                new_gene = GenePro(id=g, pdb_file_type=self.pdb_file_type, root_dir=self.genes_dir)
                if self.model:
                    self.model.genes.append(new_gene)
                else:
                    self.genes.append(new_gene)

        log.info('Added {} genes to GEM-PRO project'.format(len(self.genes)-orig_num_genes))

    ####################################################################################################################
    ### SEQUENCE RELATED METHODS ###
[docs]    def kegg_mapping_and_metadata(self, kegg_organism_code, custom_gene_mapping=None, outdir=None,
                                  set_as_representative=False, force_rerun=False):
        """Map all genes in the model to KEGG IDs using the KEGG service.

        Steps:
            1. Download all metadata and sequence files in the sequences directory
            2. Creates a KEGGProp object in the protein.sequences attribute
            3. Returns a Pandas DataFrame of mapping results

        Args:
            kegg_organism_code (str): The three letter KEGG code of your organism
            custom_gene_mapping (dict): If your model genes differ from the gene IDs you want to map,
                custom_gene_mapping allows you to input a dictionary which maps model gene IDs to new ones.
                Dictionary keys must match model gene IDs.
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            set_as_representative (bool): If mapped KEGG IDs should be set as representative sequences
            force_rerun (bool): If you want to overwrite any existing mappings and files

        """

        # First map all of the organism's KEGG genes to UniProt
        kegg_to_uniprot = ssbio.databases.kegg.map_kegg_all_genes(organism_code=kegg_organism_code, target_db='uniprot')

        successfully_mapped_counter = 0

        for g in tqdm(self.genes):
            if custom_gene_mapping:
                kegg_g = custom_gene_mapping[g.id]
            else:
                kegg_g = g.id

            if kegg_g not in kegg_to_uniprot:
                log.debug('{}: unable to map to KEGG'.format(g.id))
                continue

            # Download both FASTA and KEGG metadata files
            kegg_prop = g.protein.load_kegg(kegg_id=kegg_g, kegg_organism_code=kegg_organism_code,
                                            download=True, outdir=outdir, set_as_representative=set_as_representative,
                                            force_rerun=force_rerun)

            # Update potentially old UniProt ID
            if kegg_g in kegg_to_uniprot.keys():
                kegg_prop.uniprot = kegg_to_uniprot[kegg_g]
                if g.protein.representative_sequence:
                    if g.protein.representative_sequence.kegg == kegg_prop.kegg:
                        g.protein.representative_sequence.uniprot = kegg_to_uniprot[kegg_g]

            # Keep track of missing mappings - missing is defined by no available sequence
            if kegg_prop.sequence_file:
                successfully_mapped_counter += 1

            log.debug('{}: loaded KEGG information for gene'.format(g.id))

        log.info('{}/{}: number of genes mapped to KEGG'.format(successfully_mapped_counter, len(self.genes)))
        log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')

[docs]    def kegg_mapping_and_metadata_parallelize(self, sc, kegg_organism_code, custom_gene_mapping=None, outdir=None,
                                              set_as_representative=False, force_rerun=False):
        """Map all genes in the model to KEGG IDs using the KEGG service.

        Steps:
            1. Download all metadata and sequence files in the sequences directory
            2. Creates a KEGGProp object in the protein.sequences attribute
            3. Returns a Pandas DataFrame of mapping results

        Args:
            sc (SparkContext): Spark Context to parallelize this function
            kegg_organism_code (str): The three letter KEGG code of your organism
            custom_gene_mapping (dict): If your model genes differ from the gene IDs you want to map,
                custom_gene_mapping allows you to input a dictionary which maps model gene IDs to new ones.
                Dictionary keys must match model gene IDs.
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            set_as_representative (bool): If mapped KEGG IDs should be set as representative sequences
            force_rerun (bool): If you want to overwrite any existing mappings and files

        """

        # First map all of the organism's KEGG genes to UniProt
        kegg_to_uniprot = ssbio.databases.kegg.map_kegg_all_genes(organism_code=kegg_organism_code, target_db='uniprot')

        # Parallelize the genes list
        genes_rdd = sc.parallelize(self.genes)

        # Write a sub-function to carry out the bulk of the original function
        def gp_kegg_sc(g):
            if custom_gene_mapping:
                kegg_g = custom_gene_mapping[g.id]
            else:
                kegg_g = g.id

            # Download both FASTA and KEGG metadata files
            kegg_prop = g.protein.load_kegg(kegg_id=kegg_g, kegg_organism_code=kegg_organism_code,
                                            download=True, outdir=outdir,
                                            set_as_representative=set_as_representative,
                                            force_rerun=force_rerun)

            # Update potentially old UniProt ID
            if kegg_g in kegg_to_uniprot.keys():
                kegg_prop.uniprot = kegg_to_uniprot[kegg_g]
                if g.protein.representative_sequence:
                    if g.protein.representative_sequence.kegg == kegg_prop.kegg:
                        g.protein.representative_sequence.uniprot = kegg_to_uniprot[kegg_g]

            # Tracker for if it mapped successfully to KEGG
            if kegg_prop.sequence_file:
                success = True
            else:
                success = False

            return g, success

        # Run a map operation to execute the function on all items in the RDD
        result = genes_rdd.map(gp_kegg_sc).collect()

        # Copy the results over to the GEM-PRO object's genes using the GenePro function "copy_modified_gene"
        # Also count how many genes mapped to KEGG
        successfully_mapped_counter = 0
        for modified_g, success in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)
            if success:
                successfully_mapped_counter += 1

        log.info('{}/{}: number of genes mapped to KEGG'.format(successfully_mapped_counter, len(self.genes)))
        log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')

    @property
    def df_kegg_metadata(self):
        """DataFrame: Pandas DataFrame of KEGG metadata per protein."""
        kegg_pre_df = []
        df_cols = ['gene', 'kegg', 'refseq', 'uniprot', 'num_pdbs', 'pdbs', 'seq_len', 'sequence_file', 'metadata_file']

        for g in self.genes:
            kegg_mappings = g.protein.filter_sequences(KEGGProp)
            for kegg_prop in kegg_mappings:
                kegg_dict = kegg_prop.get_dict(df_format=True, only_attributes=df_cols)
                kegg_dict['gene'] = g.id
                kegg_pre_df.append(kegg_dict)

        # Save a dataframe of the file mapping info
        df = pd.DataFrame.from_records(kegg_pre_df, columns=df_cols).set_index('gene')
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df)

    @property
    def missing_kegg_mapping(self):
        """list: List of genes with no mapping to KEGG."""
        kegg_missing = []
        for g in self.genes:
            keggs = g.protein.filter_sequences(KEGGProp)
            no_sequence_file_available = True
            for k in keggs:
                if k.sequence_file:
                    no_sequence_file_available = False
                    break
            if no_sequence_file_available:
                kegg_missing.append(g.id)
        return list(set(kegg_missing))

[docs]    def uniprot_mapping_and_metadata(self, model_gene_source, custom_gene_mapping=None, outdir=None,
                                     set_as_representative=False, force_rerun=False):
        """Map all genes in the model to UniProt IDs using the UniProt mapping service.
        Also download all metadata and sequences.

        Args:
            model_gene_source (str): the database source of your model gene IDs.
                See: http://www.uniprot.org/help/api_idmapping
                Common model gene sources are:

                * Ensembl Genomes - ``ENSEMBLGENOME_ID`` (i.e. E. coli b-numbers)
                * Entrez Gene (GeneID) - ``P_ENTREZGENEID``
                * RefSeq Protein - ``P_REFSEQ_AC``

            custom_gene_mapping (dict): If your model genes differ from the gene IDs you want to map,
                custom_gene_mapping allows you to input a dictionary which maps model gene IDs to new ones.
                Dictionary keys must match model genes.
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            set_as_representative (bool): If mapped UniProt IDs should be set as representative sequences
            force_rerun (bool): If you want to overwrite any existing mappings and files

        """

        # Allow model gene --> custom ID mapping ({'TM_1012':'TM1012'})
        if custom_gene_mapping:
            genes_to_map = list(custom_gene_mapping.values())
        else:
            genes_to_map = [x.id for x in self.genes]

        # Map all IDs first to available UniProts
        genes_to_uniprots = bs_unip.mapping(fr=model_gene_source, to='ACC', query=genes_to_map)

        successfully_mapped_counter = 0
        for g in tqdm(self.genes):
            if custom_gene_mapping and g.id in custom_gene_mapping.keys():
                uniprot_gene = custom_gene_mapping[g.id]
            else:
                uniprot_gene = g.id

            if uniprot_gene not in genes_to_uniprots:
                log.debug('{}: unable to map to UniProt'.format(g.id))
                continue

            for mapped_uniprot in genes_to_uniprots[uniprot_gene]:
                try:
                    uniprot_prop = g.protein.load_uniprot(uniprot_id=mapped_uniprot, download=True, outdir=outdir,
                                                          set_as_representative=set_as_representative,
                                                          force_rerun=force_rerun)
                except HTTPError as e:
                    log.error('{}, {}: unable to complete web request'.format(g.id, mapped_uniprot))
                    print(e)
                    continue

                if uniprot_prop.sequence_file or uniprot_prop.metadata_file:
                    successfully_mapped_counter += 1

        log.info('{}/{}: number of genes mapped to UniProt'.format(successfully_mapped_counter, len(self.genes)))
        log.info('Completed ID mapping --> UniProt. See the "df_uniprot_metadata" attribute for a summary dataframe.')

[docs]    def manual_uniprot_mapping(self, gene_to_uniprot_dict, outdir=None, set_as_representative=True):
        """Read a manual dictionary of model gene IDs --> UniProt IDs. By default sets them as representative.

        This allows for mapping of the missing genes, or overriding of automatic mappings.

        Input a dictionary of::

            {
                <gene_id1>: <uniprot_id1>,
                <gene_id2>: <uniprot_id2>,
            }

        Args:
            gene_to_uniprot_dict: Dictionary of mappings as shown above
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            set_as_representative (bool): If mapped UniProt IDs should be set as representative sequences

        """
        for g, u in tqdm(gene_to_uniprot_dict.items()):
            g = str(g)
            gene = self.genes.get_by_id(g)

            try:
                uniprot_prop = gene.protein.load_uniprot(uniprot_id=u,
                                                         outdir=outdir, download=True,
                                                         set_as_representative=set_as_representative)
            except HTTPError as e:
                log.error('{}, {}: unable to complete web request'.format(g, u))
                print(e)
                continue

        log.info('Completed manual ID mapping --> UniProt. See the "df_uniprot_metadata" attribute for a summary dataframe.')

    @property
    def df_uniprot_metadata(self):
        """DataFrame: Pandas DataFrame of UniProt metadata per protein."""
        uniprot_pre_df = []
        df_cols = ['gene', 'uniprot', 'reviewed', 'gene_name', 'kegg', 'refseq', 'num_pdbs', 'pdbs', 'ec_number',
                   'pfam', 'seq_len', 'description', 'entry_date', 'entry_version', 'seq_date', 'seq_version',
                   'sequence_file', 'metadata_file']

        for g in self.genes:
            uniprot_mappings = g.protein.filter_sequences(UniProtProp)
            for uniprot_prop in uniprot_mappings:
                uniprot_dict = uniprot_prop.get_dict(df_format=True, only_attributes=df_cols)
                uniprot_dict['gene'] = g.id
                uniprot_pre_df.append(uniprot_dict)

        df = pd.DataFrame.from_records(uniprot_pre_df, columns=df_cols).set_index('gene')
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df)

    @property
    def missing_uniprot_mapping(self):
        """list: List of genes with no mapping to UniProt."""
        uniprot_missing = []
        for g in self.genes:
            ups = g.protein.filter_sequences(UniProtProp)

            no_sequence_file_available = True
            for u in ups:
                if u.sequence_file or u.metadata_file:
                    no_sequence_file_available = False
                    break
            if no_sequence_file_available:
                uniprot_missing.append(g.id)

        return list(set(uniprot_missing))

    # TODO: should also have a seq --> uniprot id function (has to be 100% match) (also needs organism)
[docs]    def manual_seq_mapping(self, gene_to_seq_dict, outdir=None, write_fasta_files=True, set_as_representative=True):
        """Read a manual input dictionary of model gene IDs --> protein sequences. By default sets them as representative.

        Args:
            gene_to_seq_dict (dict): Mapping of gene IDs to their protein sequence strings
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            write_fasta_files (bool): If individual protein FASTA files should be written out
            set_as_representative (bool): If mapped sequences should be set as representative

        """
        if outdir:
            outdir_set = True
        else:
            outdir_set = False

        # Save the sequence information in individual FASTA files
        for g, s in gene_to_seq_dict.items():
            gene = self.genes.get_by_id(str(g))

            if not outdir_set and write_fasta_files:
                outdir = gene.protein.sequence_dir
                if not outdir:
                    raise ValueError('Output directory must be specified')

            manual_info = gene.protein.load_manual_sequence(ident=g, seq=s, outdir=outdir,
                                                            write_fasta_file=write_fasta_files,
                                                            set_as_representative=set_as_representative)
            log.debug('{}: loaded manually defined sequence information'.format(g))

        log.info('Loaded in {} sequences'.format(len(gene_to_seq_dict)))

[docs]    def set_representative_sequence(self, force_rerun=False):
        """Automatically consolidate loaded sequences (manual, UniProt, or KEGG) and set a single representative sequence.

        Manually set representative sequences override all existing mappings. UniProt mappings override KEGG mappings
        except when KEGG mappings have PDBs associated with them and UniProt doesn't.

        Args:
            force_rerun (bool): Set to True to recheck stored sequences

        """

        # TODO: rethink use of multiple database sources - may lead to inconsistency with genome sources

        successfully_mapped_counter = 0
        for g in tqdm(self.genes):
            repseq = g.protein.set_representative_sequence(force_rerun=force_rerun)

            if repseq:
                if repseq.sequence_file:
                    successfully_mapped_counter += 1

        log.info('{}/{}: number of genes with a representative sequence'.format(len(self.genes_with_a_representative_sequence),
                                                                                len(self.genes)))
        log.info('See the "df_representative_sequences" attribute for a summary dataframe.')

    @property
    def df_representative_sequences(self):
        """DataFrame: Pandas DataFrame of representative sequence information per protein."""

        seq_mapping_pre_df = []
        df_cols = ['gene', 'uniprot', 'kegg', 'num_pdbs', 'pdbs', 'seq_len', 'sequence_file', 'metadata_file']

        for g in self.genes_with_a_representative_sequence:
            gene_dict = g.protein.representative_sequence.get_dict(df_format=True, only_attributes=df_cols)
            gene_dict['gene'] = g.id
            seq_mapping_pre_df.append(gene_dict)

        df = pd.DataFrame.from_records(seq_mapping_pre_df, columns=df_cols).set_index('gene')
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df)

    @property
    def missing_representative_sequence(self):
        """list: List of genes with no mapping to a representative sequence."""
        return [x.id for x in self.genes if not self.genes_with_a_representative_sequence.has_id(x.id)]

[docs]    def write_representative_sequences_file(self, outname, outdir=None, set_ids_from_model=True):
        """Write all the model's sequences as a single FASTA file. By default, sets IDs to model gene IDs.

        Args:
            outname (str): Name of the output FASTA file without the extension
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            set_ids_from_model (bool): If the gene ID source should be the model gene IDs, not the original sequence ID

        """

        if not outdir:
            outdir = self.data_dir
            if not outdir:
                raise ValueError('Output directory must be specified')

        outfile = op.join(outdir, outname + '.faa')

        tmp = []
        for x in self.genes_with_a_representative_sequence:
            repseq = x.protein.representative_sequence
            copied_seq_record = copy(repseq)
            if set_ids_from_model:
                copied_seq_record.id = x.id
            tmp.append(copied_seq_record)

        SeqIO.write(tmp, outfile, "fasta")

        log.info('{}: wrote all representative sequences to file'.format(outfile))
        self.genome_path = outfile
        return self.genome_path

[docs]    def get_sequence_properties(self, representatives_only=True):
        """Run Biopython ProteinAnalysis and EMBOSS pepstats to summarize basic statistics of all protein sequences.
        Results are stored in the protein's respective SeqProp objects at ``.annotations``

        Args:
            representative_only (bool): If analysis should only be run on the representative sequences

        """
        for g in tqdm(self.genes):
            g.protein.get_sequence_properties(representative_only=representatives_only)

[docs]    def get_scratch_predictions(self, path_to_scratch, results_dir, scratch_basename='scratch', num_cores=1,
                                exposed_buried_cutoff=25, custom_gene_mapping=None):
        """Run and parse ``SCRATCH`` results to predict secondary structure and solvent accessibility.
        Annotations are stored in the protein's representative sequence at:

            * ``.annotations``
            * ``.letter_annotations``

        Args:
            path_to_scratch (str): Path to SCRATCH executable
            results_dir (str): Path to SCRATCH results folder, which will have the files (scratch.ss, scratch.ss8,
                scratch.acc, scratch.acc20)
            scratch_basename (str): Basename of the SCRATCH results ('scratch' is default)
            num_cores (int): Number of cores to use to parallelize SCRATCH run
            exposed_buried_cutoff (int): Cutoff of exposed/buried for the acc20 predictions
            custom_gene_mapping (dict): Default parsing of SCRATCH output files is to look for the model gene IDs. If
                your output files contain IDs which differ from the model gene IDs, use this dictionary to map model
                gene IDs to result file IDs. Dictionary keys must match model genes.

        """
        if not self.genome_path:
            # Write all sequences as one file
            all_seqs = self.write_representative_sequences_file(outname=self.id)

        # Runs SCRATCH or loads existing results in results_dir
        scratch = SCRATCH(project_name=scratch_basename, seq_file=self.genome_path)
        scratch.run_scratch(path_to_scratch=path_to_scratch, num_cores=num_cores, outdir=results_dir)
        sspro_summary = scratch.sspro_summary()
        sspro8_summary = scratch.sspro8_summary()
        sspro_results = scratch.sspro_results()
        sspro8_results = scratch.sspro8_results()
        accpro_summary = scratch.accpro_summary()
        accpro20_summary = scratch.accpro20_summary(exposed_buried_cutoff)
        accpro_results = scratch.accpro_results()
        accpro20_results = scratch.accpro20_results()
        counter = 0

        # Adding the scratch annotations to the representative_sequences letter_annotations
        for g in tqdm(self.genes_with_a_representative_sequence):
            if custom_gene_mapping:
                g_id = custom_gene_mapping[g.id]
            else:
                g_id = g.id

            if g_id in sspro_summary:
                # Secondary structure
                g.protein.representative_sequence.annotations.update(sspro_summary[g_id])
                g.protein.representative_sequence.annotations.update(sspro8_summary[g_id])
                g.protein.representative_sequence.letter_annotations['SS-sspro'] = sspro_results[g_id]
                g.protein.representative_sequence.letter_annotations['SS-sspro8'] = sspro8_results[g_id]

                # Solvent accessibility
                g.protein.representative_sequence.annotations.update(accpro_summary[g_id])
                g.protein.representative_sequence.annotations.update(accpro20_summary[g_id])
                g.protein.representative_sequence.letter_annotations['RSA-accpro'] = accpro_results[g_id]
                g.protein.representative_sequence.letter_annotations['RSA-accpro20'] = accpro20_results[g_id]

                counter += 1
            else:
                log.error('{}: missing SCRATCH results'.format(g.id))

        log.info('{}/{}: number of genes with SCRATCH predictions loaded'.format(counter, len(self.genes)))

[docs]    def get_tmhmm_predictions(self, tmhmm_results, custom_gene_mapping=None):
        """Parse TMHMM results and store in the representative sequences.

        This is a basic function to parse pre-run TMHMM results. Run TMHMM from the
        web service (http://www.cbs.dtu.dk/services/TMHMM/) by doing the following:

            1. Write all representative sequences in the GEM-PRO using the function ``write_representative_sequences_file``
            2. Upload the file to http://www.cbs.dtu.dk/services/TMHMM/ and choose "Extensive, no graphics" as the output
            3. Copy and paste the results (ignoring the top header and above "HELP with output formats") into a file and save it
            4. Run this function on that file

        Args:
            tmhmm_results (str): Path to TMHMM results (long format)
            custom_gene_mapping (dict): Default parsing of TMHMM output is to look for the model gene IDs. If
                your output file contains IDs which differ from the model gene IDs, use this dictionary to map model
                gene IDs to result file IDs. Dictionary keys must match model genes.

        """
        # TODO: refactor to Protein class?
        tmhmm_dict = ssbio.protein.sequence.properties.tmhmm.parse_tmhmm_long(tmhmm_results)

        counter = 0
        for g in tqdm(self.genes_with_a_representative_sequence):
            if custom_gene_mapping:
                g_id = custom_gene_mapping[g.id]
            else:
                g_id = g.id

            if g_id in tmhmm_dict:
                log.debug('{}: loading TMHMM results'.format(g.id))
                if not tmhmm_dict[g_id]:
                    log.error("{}: missing TMHMM results".format(g.id))
                g.protein.representative_sequence.annotations['num_tm_helix-tmhmm'] = tmhmm_dict[g_id]['num_tm_helices']
                g.protein.representative_sequence.letter_annotations['TM-tmhmm'] = tmhmm_dict[g_id]['sequence']
                counter += 1
            else:
                log.error("{}: missing TMHMM results".format(g.id))

        log.info('{}/{}: number of genes with TMHMM predictions loaded'.format(counter, len(self.genes)))

    ### END SEQUENCE RELATED METHODS ###
    ####################################################################################################################

    ####################################################################################################################
    ### STRUCTURE RELATED METHODS ###
[docs]    def blast_seqs_to_pdb(self, seq_ident_cutoff=0, evalue=0.0001, all_genes=False, display_link=False,
                          outdir=None, force_rerun=False):
        """BLAST each representative protein sequence to the PDB. Saves raw BLAST results (XML files).

        Args:
            seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
            evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal,
                0.0001 is stringent (default).
            all_genes (bool): If all genes should be BLASTed, or only those without any structures currently mapped
            display_link (bool, optional): Set to True if links to the HTML results should be displayed
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False

        """
        counter = 0

        for g in tqdm(self.genes_with_a_representative_sequence):
            # If all_genes=False, BLAST only genes without a uniprot -> pdb mapping
            if g.protein.num_structures_experimental > 0 and not all_genes and not force_rerun:
                log.debug('{}: skipping BLAST, {} experimental structures already mapped '
                          'and all_genes flag is False'.format(g.id,
                                                               g.protein.num_structures_experimental))
                continue

            # BLAST the sequence to the PDB
            new_pdbs = g.protein.blast_representative_sequence_to_pdb(seq_ident_cutoff=seq_ident_cutoff,
                                                                   evalue=evalue,
                                                                   display_link=display_link,
                                                                   outdir=outdir,
                                                                   force_rerun=force_rerun)

            if new_pdbs:
                counter += 1
                log.debug('{}: {} PDBs BLASTed'.format(g.id, len(new_pdbs)))
            else:
                log.debug('{}: no BLAST results'.format(g.id))

        log.info('Completed sequence --> PDB BLAST. See the "df_pdb_blast" attribute for a summary dataframe.')
        log.info('{}: number of genes with additional structures added from BLAST'.format(counter))

    @property
    def df_pdb_blast(self):
        """DataFrame: Get a dataframe of PDB BLAST results"""

        df = pd.DataFrame()
        for g in self.genes_with_experimental_structures:
            protein_df = g.protein.df_pdb_blast.copy().reset_index()
            if not protein_df.empty:
                protein_df['gene'] = g.id
                df = df.append(protein_df)
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df.set_index('gene'))

[docs]    def map_uniprot_to_pdb(self, seq_ident_cutoff=0.0, outdir=None, force_rerun=False):
        """Map all representative sequences' UniProt ID to PDB IDs using the PDBe "Best Structures" API.
        Will save a JSON file of the results to each protein's ``sequences`` folder.

        The "Best structures" API is available at https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
        The list of PDB structures mapping to a UniProt accession sorted by coverage of the protein and,
        if the same, resolution.

        Args:
            seq_ident_cutoff (float): Sequence identity cutoff in decimal form
            outdir (str): Output directory to cache JSON results of search
            force_rerun (bool): Force re-downloading of JSON results if they already exist

        Returns:
            list: A rank-ordered list of PDBProp objects that map to the UniProt ID

        """

        # First get all UniProt IDs and check if they have PDBs
        all_representative_uniprots = []
        for g in self.genes_with_a_representative_sequence:
            uniprot_id = g.protein.representative_sequence.uniprot
            if uniprot_id:
                # TODO: add warning or something for isoform ids?
                if '-' in uniprot_id:
                    uniprot_id = uniprot_id.split('-')[0]
                all_representative_uniprots.append(uniprot_id)
        log.info('Mapping UniProt IDs --> PDB IDs...')
        uniprots_to_pdbs = bs_unip.mapping(fr='ACC', to='PDB_ID', query=all_representative_uniprots)

        counter = 0
        # Now run the best_structures API for all genes
        for g in tqdm(self.genes_with_a_representative_sequence):
            uniprot_id = g.protein.representative_sequence.uniprot
            if uniprot_id:
                if '-' in uniprot_id:
                    uniprot_id = uniprot_id.split('-')[0]
                if uniprot_id in uniprots_to_pdbs:
                    best_structures = g.protein.map_uniprot_to_pdb(seq_ident_cutoff=seq_ident_cutoff, outdir=outdir, force_rerun=force_rerun)
                    if best_structures:
                        counter += 1
                        log.debug('{}: {} PDBs mapped'.format(g.id, len(best_structures)))
                else:
                    log.debug('{}, {}: no PDBs available'.format(g.id, uniprot_id))

        log.info('{}/{}: number of genes with at least one experimental structure'.format(len(self.genes_with_experimental_structures),
                                                                                          len(self.genes)))
        log.info('Completed UniProt --> best PDB mapping. See the "df_pdb_ranking" attribute for a summary dataframe.')

    @property
    def df_pdb_ranking(self):
        """DataFrame: Get a dataframe of UniProt -> best structure in PDB results"""
        df = pd.DataFrame()
        for g in self.genes_with_experimental_structures:
            protein_df = g.protein.df_pdb_ranking.copy().reset_index()
            if not protein_df.empty:
                protein_df['gene'] = g.id
                df = df.append(protein_df)
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df.set_index('gene'))

    @property
    def missing_pdb_structures(self):
        """list: List of genes with no mapping to any experimental PDB structure."""
        return [x.id for x in self.genes if not self.genes_with_experimental_structures.has_id(x.id)]

[docs]    def get_manual_homology_models(self, input_dict, outdir=None, clean=True, force_rerun=False):
        """Copy homology models to the GEM-PRO project.

        Requires an input of a dictionary formatted like so::

            {
                model_gene: {
                                homology_model_id1: {
                                                        'model_file': '/path/to/homology/model.pdb',
                                                        'file_type': 'pdb'
                                                        'additional_info': info_value
                                                    },
                                homology_model_id2: {
                                                        'model_file': '/path/to/homology/model.pdb'
                                                        'file_type': 'pdb'
                                                    }
                            }
            }

        Args:
            input_dict (dict): Dictionary of dictionaries of gene names to homology model IDs and other information
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            clean (bool): If homology files should be cleaned and saved as a new PDB file
            force_rerun (bool): If homology files should be copied again even if they exist in the GEM-PRO directory

        """
        if outdir:
            outdir_set = True
        else:
            outdir_set = False

        counter = 0
        for g in tqdm(self.genes):
            if g.id not in input_dict:
                continue

            if not outdir_set:
                outdir = g.protein.structure_dir
                if not outdir:
                    raise ValueError('Output directory must be specified')

            for hid, hdict in input_dict[g.id].items():
                if 'model_file' not in hdict or 'file_type' not in hdict:
                    raise KeyError('"model_file" and "file_type" must be keys in the manual input dictionary.')

                new_homology = g.protein.load_pdb(pdb_id=hid, pdb_file=hdict['model_file'],
                                                  file_type=hdict['file_type'], is_experimental=False)

                if clean:
                    new_homology.load_structure_path(new_homology.clean_structure(outdir=outdir, force_rerun=force_rerun),
                                                     hdict['file_type'])
                else:
                    copy_to = op.join(outdir, op.basename(hdict['model_file']))
                    if ssbio.utils.force_rerun(force_rerun, copy_to):
                        # Just copy the file to the structure directory and store the file name
                        log.debug('{}: copying model from original directory to GEM-PRO directory'.format(op.basename(hdict['model_file'])))
                        shutil.copy2(hdict['model_file'], outdir)
                        new_homology.load_structure_path(copy_to, hdict['file_type'])
                    else:
                        log.debug('{}: homology model already copied to directory'.format(copy_to))
                        new_homology.load_structure_path(copy_to, hdict['file_type'])

                # TODO: need to better handle other info in the provided dictionary, if any
                new_homology.update(hdict)

                log.debug('{}: updated homology model information and copied model file.'.format(g.id))
            counter += 1

        log.info('Updated homology model information for {} genes.'.format(counter))

[docs]    def get_itasser_models(self, homology_raw_dir, custom_itasser_name_mapping=None, outdir=None, force_rerun=False):
        """Copy generated I-TASSER models from a directory to the GEM-PRO directory.

        Args:
            homology_raw_dir (str): Root directory of I-TASSER folders.
            custom_itasser_name_mapping (dict): Use this if your I-TASSER folder names differ from your model gene names.
                Input a dict of {model_gene: ITASSER_folder}.
            outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
                were not created initially
            force_rerun (bool): If homology files should be copied again even if they exist in the GEM-PRO directory

        """
        counter = 0

        for g in tqdm(self.genes):
            if custom_itasser_name_mapping and g.id in custom_itasser_name_mapping:
                hom_id = custom_itasser_name_mapping[g.id]
                if not op.exists(op.join(homology_raw_dir, hom_id)):
                    hom_id = g.id
            else:
                hom_id = g.id

            # The name of the actual pdb file will be $GENEID_model1.pdb
            new_itasser_name = hom_id + '_model1'
            orig_itasser_dir = op.join(homology_raw_dir, hom_id)

            try:
                itasser_prop = g.protein.load_itasser_folder(ident=hom_id, itasser_folder=orig_itasser_dir,
                                                             organize=True, outdir=outdir,
                                                             organize_name=new_itasser_name,
                                                             force_rerun=force_rerun)
            except OSError:
                log.debug('{}: homology model folder unavailable'.format(g.id))
                continue
            except IOError:
                log.debug('{}: homology model unavailable'.format(g.id))
                continue

            if itasser_prop.structure_file:
                counter += 1
            else:
                log.debug('{}: homology model file unavailable, perhaps modelling did not finish'.format(g.id))

        log.info('Completed copying of {} I-TASSER models to GEM-PRO directory. See the "df_homology_models" attribute for a summary dataframe.'.format(counter))

    @property
    def df_homology_models(self):
        """DataFrame: Get a dataframe of I-TASSER homology model results"""
        df = pd.DataFrame()
        for g in self.genes_with_homology_models:
            protein_df = g.protein.df_homology_models.copy().reset_index()
            if not protein_df.empty:
                protein_df['gene'] = g.id
                df = df.append(protein_df)
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df.set_index('gene'))

    @property
    def missing_homology_models(self):
        """list: List of genes with no mapping to any homology models."""
        return [x.id for x in self.genes if not self.genes_with_homology_models.has_id(x.id)]

[docs]    def set_representative_structure(self, seq_outdir=None, struct_outdir=None, pdb_file_type=None,
                                     engine='needle', always_use_homology=False, rez_cutoff=0.0,
                                     seq_ident_cutoff=0.5, allow_missing_on_termini=0.2,
                                     allow_mutants=True, allow_deletions=False,
                                     allow_insertions=False, allow_unresolved=True, skip_large_structures=False,
                                     clean=True, force_rerun=False):
        """Set all representative structure for proteins from a structure in the structures attribute.

        Each gene can have a combination of the following, which will be analyzed to set a representative structure.

            * Homology model(s)
            * Ranked PDBs
            * BLASTed PDBs

        If the ``always_use_homology`` flag is true, homology models are always set as representative when they exist.
        If there are multiple homology models, we rank by the percent sequence coverage.

        Args:
            seq_outdir (str): Path to output directory of sequence alignment files, must be set if GEM-PRO directories
                were not created initially
            struct_outdir (str): Path to output directory of structure files, must be set if GEM-PRO directories
                were not created initially
            pdb_file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
            engine (str): ``biopython`` or ``needle`` - which pairwise alignment program to use.
                ``needle`` is the standard EMBOSS tool to run pairwise alignments.
                ``biopython`` is Biopython's implementation of needle. Results can differ!
            always_use_homology (bool): If homology models should always be set as the representative structure
            rez_cutoff (float): Resolution cutoff, in Angstroms (only if experimental structure)
            seq_ident_cutoff (float): Percent sequence identity cutoff, in decimal form
            allow_missing_on_termini (float): Percentage of the total length of the reference sequence which will be ignored
                when checking for modifications. Example: if 0.1, and reference sequence is 100 AA, then only residues
                5 to 95 will be checked for modifications.
            allow_mutants (bool): If mutations should be allowed or checked for
            allow_deletions (bool): If deletions should be allowed or checked for
            allow_insertions (bool): If insertions should be allowed or checked for
            allow_unresolved (bool): If unresolved residues should be allowed or checked for
            skip_large_structures (bool): Default False -- currently, large structures can't be saved as a PDB file even
                if you just want to save a single chain, so Biopython will throw an error when trying to do so. As an
                alternative, if a large structure is selected as representative, the pipeline will currently point to it
                and not clean it. If you don't want this to happen, set this to true.
            clean (bool): If structures should be cleaned
            force_rerun (bool): If sequence to structure alignment should be rerun

        Todo:
            - Remedy large structure representative setting

        """
        for g in tqdm(self.genes):
            repstruct = g.protein.set_representative_structure(seq_outdir=seq_outdir,
                                                               struct_outdir=struct_outdir,
                                                               pdb_file_type=pdb_file_type,
                                                               engine=engine,
                                                               rez_cutoff=rez_cutoff,
                                                               seq_ident_cutoff=seq_ident_cutoff,
                                                               always_use_homology=always_use_homology,
                                                               allow_missing_on_termini=allow_missing_on_termini,
                                                               allow_mutants=allow_mutants,
                                                               allow_deletions=allow_deletions,
                                                               allow_insertions=allow_insertions,
                                                               allow_unresolved=allow_unresolved,
                                                               skip_large_structures=skip_large_structures,
                                                               clean=clean,
                                                               force_rerun=force_rerun)

        log.info('{}/{}: number of genes with a representative structure'.format(len(self.genes_with_a_representative_structure),
                                                                                 len(self.genes)))
        log.info('See the "df_representative_structures" attribute for a summary dataframe.')

    def set_representative_structure_parallelize(self, sc, seq_outdir=None, struct_outdir=None, pdb_file_type=None,
                                     engine='needle', always_use_homology=False, rez_cutoff=0.0,
                                     seq_ident_cutoff=0.5, allow_missing_on_termini=0.2,
                                     allow_mutants=True, allow_deletions=False,
                                     allow_insertions=False, allow_unresolved=True, skip_large_structures=False,
                                     clean=True, force_rerun=False):
        def set_repstruct(g, seq_outdir=seq_outdir, struct_outdir=struct_outdir,
                          pdb_file_type=pdb_file_type, engine=engine,
                          rez_cutoff=rez_cutoff, seq_ident_cutoff=seq_ident_cutoff,
                          always_use_homology=always_use_homology,
                          allow_missing_on_termini=allow_missing_on_termini,
                          allow_mutants=allow_mutants, allow_deletions=allow_deletions,
                          allow_insertions=allow_insertions, allow_unresolved=allow_unresolved,
                          skip_large_structures=skip_large_structures,
                          clean=clean, force_rerun=force_rerun):
            g.protein.set_representative_structure(seq_outdir=seq_outdir, struct_outdir=struct_outdir,
                          pdb_file_type=pdb_file_type, engine=engine,
                          rez_cutoff=rez_cutoff, seq_ident_cutoff=seq_ident_cutoff,
                          always_use_homology=always_use_homology,
                          allow_missing_on_termini=allow_missing_on_termini,
                          allow_mutants=allow_mutants, allow_deletions=allow_deletions,
                          allow_insertions=allow_insertions, allow_unresolved=allow_unresolved,
                          skip_large_structures=skip_large_structures,
                          clean=clean, force_rerun=force_rerun)
            return g

        genes_rdd = sc.parallelize(self.genes)
        result = genes_rdd.map(set_repstruct).collect()

        # Copy the results over to the GEM-PRO object's genes using the GenePro function "copy_modified_gene"
        # Also count how many genes mapped to KEGG
        for modified_g in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)

        log.info('{}/{}: number of genes with a representative structure'.format(len(self.genes_with_a_representative_structure),
                                                                                 len(self.genes)))
        log.info('See the "df_representative_structures" attribute for a summary dataframe.')

    @property
    def df_representative_structures(self):
        """DataFrame: Get a dataframe of representative protein structure information."""
        rep_struct_pre_df = []
        df_cols = ['gene', 'id', 'is_experimental', 'file_type', 'structure_file']

        for g in self.genes_with_a_representative_structure:
            repdict = g.protein.representative_structure.get_dict(df_format=True, only_attributes=df_cols)
            repdict['gene'] = g.id
            rep_struct_pre_df.append(repdict)

        df = pd.DataFrame.from_records(rep_struct_pre_df, columns=df_cols).set_index('gene')
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df)

    @property
    def missing_representative_structure(self):
        """list: List of genes with no mapping to a representative structure."""
        return [x.id for x in self.genes if not self.genes_with_a_representative_structure.has_id(x.id)]

[docs]    def prep_itasser_modeling(self, itasser_installation, itlib_folder, runtype, create_in_dir=None,
                            execute_from_dir=None, all_genes=False, print_exec=False, **kwargs):
        """Prepare to run I-TASSER homology modeling for genes without structures, or all genes.

        Args:
            itasser_installation (str): Path to I-TASSER folder, i.e. ``~/software/I-TASSER4.4``
            itlib_folder (str): Path to ITLIB folder, i.e. ``~/software/ITLIB``
            runtype: How you will be running I-TASSER - local, slurm, or torque
            create_in_dir (str): Local directory where folders will be created
            execute_from_dir (str): Optional path to execution directory - use this if you are copying the homology
                models to another location such as a supercomputer for running
            all_genes (bool): If all genes should be prepped, or only those without any mapped structures
            print_exec (bool): If the execution statement should be printed to run modelling

        Todo:
            * Document kwargs - extra options for I-TASSER, SLURM or Torque execution
            * Allow modeling of any sequence in sequences attribute, select by ID or provide SeqProp?

        """
        if not create_in_dir:
            if not self.data_dir:
                raise ValueError('Output directory must be specified')
            self.homology_models_dir = op.join(self.data_dir, 'homology_models')
        else:
            self.homology_models_dir = create_in_dir

        ssbio.utils.make_dir(self.homology_models_dir)

        if not execute_from_dir:
            execute_from_dir = self.homology_models_dir

        counter = 0
        for g in self.genes_with_a_representative_sequence:
            repstruct = g.protein.representative_structure
            if repstruct and not all_genes:
                log.debug('{}: representative structure set, skipping homology modeling'.format(g.id))
                continue

            g.protein.prep_itasser_modeling(itasser_installation=itasser_installation,
                                            itlib_folder=itlib_folder, runtype=runtype,
                                            create_in_dir=self.homology_models_dir,
                                            execute_from_dir=execute_from_dir,
                                            print_exec=print_exec, **kwargs)
            counter += 1

        log.info('Prepared I-TASSER modeling folders for {} genes in folder {}'.format(counter,
                                                                                       self.homology_models_dir))

[docs]    def pdb_downloader_and_metadata(self, outdir=None, pdb_file_type=None, force_rerun=False):
        """Download ALL mapped experimental structures to each protein's structures directory.

        Args:
            outdir (str): Path to output directory, if GEM-PRO directories were not set or other output directory is
                desired
            pdb_file_type (str): Type of PDB file to download, if not already set or other format is desired
            force_rerun (bool): If files should be re-downloaded if they already exist

        """

        if not pdb_file_type:
            pdb_file_type = self.pdb_file_type

        counter = 0
        for g in tqdm(self.genes):
            pdbs = g.protein.pdb_downloader_and_metadata(outdir=outdir, pdb_file_type=pdb_file_type, force_rerun=force_rerun)

            if pdbs:
                counter += len(pdbs)

        log.info('Updated PDB metadata dataframe. See the "df_pdb_metadata" attribute for a summary dataframe.')
        log.info('Saved {} structures total'.format(counter))

    def download_all_pdbs(self, outdir=None, pdb_file_type=None, load_metadata=False, force_rerun=False):
        if not pdb_file_type:
            pdb_file_type = self.pdb_file_type

        all_structures = []
        for g in tqdm(self.genes):
            pdbs = g.protein.download_all_pdbs(outdir=outdir, pdb_file_type=pdb_file_type,
                                               load_metadata=load_metadata, force_rerun=force_rerun)
            all_structures.extend(pdbs)
        return list(set(all_structures))

    @property
    def df_pdb_metadata(self):
        """DataFrame: Get a dataframe of PDB metadata (PDBs have to be downloaded first)."""
        df = pd.DataFrame()
        for g in self.genes_with_experimental_structures:
            # Get per protein DataFrame
            protein_df = g.protein.df_pdb_metadata.copy().reset_index()
            protein_df['gene'] = g.id
            df = df.append(protein_df)
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df.set_index('gene'))

    @property
    def df_proteins(self):
        """DataFrame: Get a summary dataframe of all proteins in the project."""
        pre_df = []
        df_cols = ['gene', 'id', 'sequences', 'num_sequences', 'representative_sequence',
                   'repseq_gene_name', 'repseq_uniprot', 'repseq_description',
                   'num_structures', 'experimental_structures', 'num_experimental_structures',
                   'homology_models', 'num_homology_models',
                   'representative_structure', 'representative_chain', 'representative_chain_seq_coverage',
                   'repstruct_description', 'repstruct_resolution',
                   'num_sequence_alignments', 'num_structure_alignments']

        for g in self.genes:
            # Get per protein DataFrame
            protein_dict = g.protein.protein_statistics
            protein_dict['gene'] = g.id
            pre_df.append(protein_dict)

        df = pd.DataFrame.from_records(pre_df, columns=df_cols).set_index('gene')
        if df.empty:
            log.warning('Empty dataframe')
            return df
        else:
            return ssbio.utils.clean_df(df)

[docs]    def get_dssp_annotations(self, representatives_only=True, force_rerun=False):
        """Run DSSP on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-dssp']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        for g in tqdm(self.genes):
            g.protein.get_dssp_annotations(representative_only=representatives_only, force_rerun=force_rerun)

[docs]    def get_dssp_annotations_parallelize(self, sc, representatives_only=True, force_rerun=False):
        """Run DSSP on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-dssp']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        genes_rdd = sc.parallelize(self.genes)

        def get_dssp_annotation(g):
            g.protein.get_dssp_annotations(representative_only=representatives_only, force_rerun=force_rerun)
            return g

        result = genes_rdd.map(get_dssp_annotation).collect()
        for modified_g in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)

[docs]    def get_msms_annotations(self, representatives_only=True, force_rerun=False):
        """Run MSMS on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-msms']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        for g in tqdm(self.genes):
            g.protein.get_msms_annotations(representative_only=representatives_only, force_rerun=force_rerun)

[docs]    def get_msms_annotations_parallelize(self, sc, representatives_only=True, force_rerun=False):
        """Run MSMS on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-msms']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        genes_rdd = sc.parallelize(self.genes)

        def get_msms_annotation(g):
            g.protein.get_msms_annotations(representative_only=representatives_only, force_rerun=force_rerun)
            return g

        result = genes_rdd.map(get_msms_annotation).collect()
        for modified_g in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)

[docs]    def get_freesasa_annotations(self, include_hetatms=False, representatives_only=True, force_rerun=False):
        """Run freesasa on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-freesasa']``

        Args:
            include_hetatms (bool): If HETATMs should be included in calculations. Defaults to ``False``.
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        for g in tqdm(self.genes):
            g.protein.get_freesasa_annotations(include_hetatms=include_hetatms,
                                               representative_only=representatives_only,
                                               force_rerun=force_rerun)

[docs]    def get_freesasa_annotations_parallelize(self, sc, include_hetatms=False,
                                             representatives_only=True, force_rerun=False):
        """Run freesasa on structures and store calculations.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.letter_annotations['*-freesasa']``

        Args:
            include_hetatms (bool): If HETATMs should be included in calculations. Defaults to ``False``.
            representative_only (bool): If analysis should only be run on the representative structure
            force_rerun (bool): If calculations should be rerun even if an output file exists

        """
        genes_rdd = sc.parallelize(self.genes)

        def get_freesasa_annotation(g):
            g.protein.get_freesasa_annotations(include_hetatms=include_hetatms,
                                               representative_only=representatives_only,
                                               force_rerun=force_rerun)
            return g

        result = genes_rdd.map(get_freesasa_annotation).collect()
        for modified_g in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)

[docs]    def find_disulfide_bridges(self, representatives_only=True):
        """Run Biopython's disulfide bridge finder and store found bridges.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.annotations['SSBOND-biopython']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure

        """
        for g in tqdm(self.genes):
            g.protein.find_disulfide_bridges(representative_only=representatives_only)

[docs]    def find_disulfide_bridges_parallelize(self, sc, representatives_only=True):
        """Run Biopython's disulfide bridge finder and store found bridges.

        Annotations are stored in the protein structure's chain sequence at:
        ``<chain_prop>.seq_record.annotations['SSBOND-biopython']``

        Args:
            representative_only (bool): If analysis should only be run on the representative structure

        """
        genes_rdd = sc.parallelize(self.genes)

        def find_disulfide_bridges(g):
            g.protein.find_disulfide_bridges(representative_only=representatives_only)
            return g

        result = genes_rdd.map(find_disulfide_bridges).collect()
        for modified_g in result:
            original_gene = self.genes.get_by_id(modified_g.id)
            original_gene.copy_modified_gene(modified_g)

    ### END STRUCTURE RELATED METHODS ###
    ####################################################################################################################

    def __json_encode__(self):
        to_return = {}
        # Don't save properties, methods in the JSON
        for x in [a for a in dir(self) if not a.startswith('__') and not isinstance(getattr(type(self), a, None), property) and not callable(getattr(self,a))]:
            if self.model and x == 'genes':
                continue
            to_return.update({x: getattr(self, x)})
        return to_return

    def __json_decode__(self, **attrs):
        for k, v in attrs.items():
            setattr(self, k, v)
        if not self.model:
            self.genes = DictList(self.genes)
        else:
            self.genes = self.model.genes