Source code for ample.util.sequence_util

#!/usr/bin/env ccp4-python
from collections import OrderedDict
import logging
import os

import iotbx.pdb

from ample.util import ample_util, exit_util


[docs]def sequence(pdbin):
    return _sequence(iotbx.pdb.pdb_input(pdbin).construct_hierarchy())


def _sequence(hierarchy):
    """Extract the sequence of residues from a pdb file."""
    chain2data = _sequence_data(hierarchy)
    return dict((k, chain2data[k][0]) for k in chain2data.keys())


def _sequence1(hierarchy):
    """Return sequence of the first chain"""
    d = _sequence(hierarchy)
    return d[sorted(d.keys())[0]]


[docs]def sequence_data(pdbin):
    return _sequence_data(iotbx.pdb.pdb_input(pdbin).construct_hierarchy())


def _sequence_data(hierarchy):
    """Extract the sequence of residues and resseqs from a pdb file."""
    chain2data = OrderedDict()
    for chain in hierarchy.models()[0].chains():  # only the first model
        if not chain.is_protein():
            continue
        seq, resseq = chain_data(chain)
        chain2data[chain.id] = (seq, resseq)
    return chain2data


[docs]def chain_data(chain):
    seq = ""
    resseq = []
    for residue in chain.conformers()[0].residues():  # Just look at the first conformer
        # See if any of the atoms are non-hetero - if so we add this residue
        if any([not atom.hetero for atom in residue.atoms()]):
            seq += ample_util.three2one[residue.resname]
            resseq.append(residue.resseq_as_int())
    return seq, resseq


[docs]def chain_sequence(chain):
    if not chain.is_protein():
        return None
    return chain_data(chain)[0]


[docs]class Sequence(object):
    """A class to handle a fasta file"""

    def __init__(self, fasta=None, pdb=None, canonicalise=False):
        """Initialise the object"""

        self.MAXWIDTH = 80  # maximum width of any line
        self.name = None  # identifier for this sequence object
        self.headers = []  # title lines
        self.sequences = []  # The fasta sequences (just AA) as a single string
        self.resseqs = []  # The fasta sequences (just AA) as a single string
        self.pdbs = []  # pdb files that sequences were derived from
        self.chains = []  # The chain that the sequence belongs too (if applicable)
        self.fasta_files = []  # The fasta files the sequence were read from

        if fasta:
            self.from_fasta(fasta, canonicalise=canonicalise)
        elif pdb:
            self.from_pdb(pdbin=pdb)
        return

[docs]    def add_pdb_data(self, pdbin):
        """Add the resseq information from a pdb to ourselves when we already have the sequence information from a fasta file - such as from an alignment
        We assume that there will be gaps (-) in the sequence and the letters may be upper or lower case
        Currently this only supports adding data for single-chain pdbs
        """
        if not os.path.isfile(pdbin):
            raise RuntimeError("Cannot find provided pdb file: {}".format(pdbin))

        if len(self.headers) < 1 and len(self.sequences) < 1:
            raise ValueError("Require at least one one sequence.")

        fname = os.path.basename(pdbin)
        name, ext = os.path.splitext(fname)
        if ext != ".pdb":
            raise TypeError("PDB files must have extension .pdb")

        got = False
        for idx, h in enumerate(self.headers):
            n = h[1:].split('.')[0]
            if n == name:
                got = True
                break

        if not got:
            raise RuntimeError("Could not find matching pdb name for {0} in headers {1}".format(fname, self.headers))
        seqd = sequence_data(pdbin)
        if len(seqd) > 1:
            raise RuntimeError("Currently only support adding data for single chain pdbs")

        chain = list(seqd.keys())[0]

        self.pdbs[idx] = fname
        self.chains[idx] = chain

        self.resseqs[idx] = []
        sequence = seqd[chain][0]
        resseqs = seqd[chain][1]

        needle = 0
        GAP = '-'
        for res1 in self.sequences[idx]:
            if res1 == GAP:
                self.resseqs[idx].append(None)
            else:
                assert res1.upper() == sequence[needle]
                self.resseqs[idx].append(resseqs[needle])
                needle += 1
        return True

[docs]    def fasta_str(self, pdbname=False):
        if len(self.sequences) < 1:
            raise RuntimeError("No sequences have been read!")
        headers = []
        for i, header in enumerate(self.headers):
            if pdbname:
                header = ">{0}".format(os.path.basename(self.pdbs[i]))
            headers.append(header)
        return self._fasta_str(headers, self.sequences)

    def _fasta_str(self, headers, sequences):
        s = ""
        for i, seq in enumerate(sequences):
            s += headers[i] + os.linesep
            for chunk in range(0, len(seq), self.MAXWIDTH):
                s += seq[chunk : chunk + self.MAXWIDTH] + os.linesep
            s += os.linesep
        return s

[docs]    def from_fasta(self, fasta_file, canonicalise=True, resseq=True):
        self.name = os.path.splitext(os.path.basename(fasta_file))[0]
        with open(fasta_file, "r") as f:
            self._parse_fasta(f, fasta_file=fasta_file, canonicalise=canonicalise)
        if resseq:
            for i, seq in enumerate(self.sequences):
                if len(self.resseqs) >= i + 1 and self.resseqs[i] is None:
                    self.resseqs[i] = []
                    for j in range(len(seq)):
                        self.resseqs[i].append(j + 1)

[docs]    def from_pdb(self, pdbin):
        pdbin_name = os.path.basename(pdbin)
        self.name = os.path.splitext(pdbin_name)[0]
        chain2data = sequence_data(pdbin)
        if len(chain2data) < 1:
            raise RuntimeError("Could not read sequence from pdb: {0}".format(pdbin))
        self.headers = []
        self.sequences = []
        self.pdbs = []
        self.resseqs = []
        self.fasta_files = []
        for chain in sorted(chain2data.keys()):
            seq = chain2data[chain][0]
            resseq = chain2data[chain][1]
            self.headers.append(">From pdb: {0} chain={1} length={2}".format(pdbin_name, chain, len(seq)))
            self.sequences.append(seq)
            self.resseqs.append(resseq)
            self.pdbs.append(pdbin_name)
            self.chains.append(chain)
            self.fasta_files.append(None)

[docs]    def canonicalise(self):
        """Reformat the fasta file
        
        Description
        -----------
        Needed because Rosetta has problems reading fastas. For it to be read, it has to have no spaces in the sequence,
        a name that is 4 characters and an underscore (ABCD_), everything has to be uppercase, and there has to be a
        return carriage at the end - this has to be linux formatted as when I make a fasta in windows, it doesnt recognize
        the return carriage.

        Rosetta has a lot of problems with fastas so we put in this script to deal with it.

        """
        aa = list("ACDEFGHIKLMNPQRSTVWY")
        for i, seq in enumerate(self.sequences):
            cs = ""
            for char in seq:
                char = char.upper()
                if char in aa:
                    cs += char
                else:
                    msg = "Non-standard amino acid in your sequence: '{}'. Please format to standard amino acids!"
                    raise RuntimeError(msg.format(char))
            self.sequences[i] = cs

[docs]    def length(self, seq_no=0):
        return len(self.sequences[seq_no])

[docs]    def mutate_residue(self, from_aa, res_seq, to_aa, seq_id=0):
        """Change residue type from_aa at position res_seq to be of type to_aa
        
        Note: res_seq positions start from 1 not zero!

        Parameters
        ----------
        from_aa : str
           Single-letter amino acid
        res_seq : int
           Residue sequence number
        to_aa : str
           Single-letter amino acid
        seq_id : int
           The index of the sequence to operate on (counting from zero)
        
        """
        seq = self.sequences[seq_id]
        assert seq[res_seq - 1] == from_aa, "Amino acid at position {0} is {1} not {2}".format(
            res_seq, seq[res_seq - 1], from_aa
        )
        self.sequences[seq_id] = seq[: res_seq - 1] + to_aa + seq[res_seq:]
        return

[docs]    def numSequences(self):
        return len(self.sequences)

    def _parse_fasta(self, fasta, fasta_file=None, canonicalise=True):
        """Parse the fasta file int our data structures & check for consistency"""
        headers = []
        sequences = []
        sequence = ""
        header = None
        for line in fasta:
            line = line.strip().rstrip(os.linesep)
            if not line:
                continue

            if header is None:
                if line.startswith(">"):
                    header = line
                    continue
                else:
                    raise RuntimeError("FASTA sequencs must be prefixed with a > character: {}".format(line))

            if header and line.startswith(">"):
                headers.append(header)
                sequences.append(sequence)
                header = line
                sequence = ""
            else:
                sequence += line

        headers.append(header)
        sequences.append(sequence)

        logging.debug("Stripping whitespace characters from sequence")
        logging.debug("Stripping '*' character of end of sequence")
        for i in range(len(sequences)):
            seq = sequences[i].replace(" ", "")
            if seq.endswith('*'):
                seq = seq[:-1]
            sequences[i] = seq

        self.headers, self.sequences, self.resseqs, self.fasta_files, self.pdbs, self.chains = [], [], [], [], [], []
        for h, s in zip(headers, sequences):
            self.headers.append(h)
            self.sequences.append(s)
            self.fasta_files.append(fasta_file)
            self.resseqs.append(None)
            self.pdbs.append(None)
            self.chains.append(None)

        if canonicalise:
            self.canonicalise()

[docs]    def pirStr(self, seqNo=0):
        """Return a canonical MAXWIDTH PIR representation of the file as a line-separated string"""
        pirStr = [self.title + os.linesep + os.linesep]
        for chunk in range(0, len(self.sequances[seqNo]), self.MAXWIDTH):
            pirStr.append(self.sequences[seqNo][chunk : chunk + self.MAXWIDTH] + os.linesep)
        pirStr.append(os.linesep)
        return pirStr

[docs]    def sequence(self, seq_no=0):
        return self.sequences[seq_no]

[docs]    def toPir(self, input_fasta, output_pir=None):
        """Take a fasta file and output the corresponding PIR file"""
        self.parseFasta(input_fasta)
        if not output_pir:
            dir, ffile = os.path.split(input_fasta)
            fname = os.path.splitext(ffile)[0]
            output_pir = os.path.join(dir, fname + ".pir")
        with open(output_pir, "w") as pirout:
            for line in self.pirStr():
                pirout.write(line)
        return

[docs]    def write_fasta(self, fasta_file, pdbname=False):
        with open(fasta_file, 'w') as f:
            for s in self.fasta_str(pdbname=pdbname):
                f.write(s)
        return

    def __add__(self, other):
        self.headers += other.headers
        self.sequences += other.sequences
        self.resseqs += other.resseqs
        self.pdbs += other.pdbs
        self.chains += other.chains
        self.fasta_files += other.fasta_files
        return self

    def __str__(self):
        return self.__repr__() + os.linesep + self.fasta_str()


[docs]def process_fasta(amoptd, canonicalise=False):
    # Check we can find the input fasta
    if not os.path.exists(str(amoptd['fasta'])):
        msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
        exit_util.exit_error(msg)

    # Reformat to what we need
    logging.debug('Parsing FASTA file')
    try:
        fp = Sequence(fasta=amoptd['fasta'], canonicalise=canonicalise)
    except Exception as e:
        msg = "Error parsing FASTA file: {0}\n\n{1}".format(amoptd['fasta'], e.message)
        exit_util.exit_error(msg)
    if fp.numSequences() != 1:
        msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(amoptd['fasta'])
        exit_util.exit_error(msg)

    # Length checks
    amoptd['fasta_length'] = fp.length()
    logging.info("Fasta is {0} amino acids long".format(amoptd['fasta_length']))

    # Check we have a decent length
    if amoptd['fasta_length'] < 9:
        msg = "ERROR! Fasta is of length {0}. This is much too short!".format(amoptd['fasta_length'])
        exit_util.exit_error(msg)

    # Check we will be able to truncate at this level
    if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
        msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(
            amoptd['fasta_length'], amoptd['percent']
        )
        exit_util.exit_error(msg)

    # Check that the sequence doesn't have a his-tag in it
    if not amoptd['allow_his_tag']:
        his_tag = 'HHHHHH'
        i = fp.sequence().find(his_tag)
        l = fp.length()
        if (0 <= i <= 20) or (l - 20 <= i <= l):
            msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(
                his_tag, i
            )
            exit_util.exit_error(msg)

    # Fasta is ok, so write out a canonical fasta in the work directory
    outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
    fp.write_fasta(outfasta)
    amoptd['fasta'] = outfasta
    amoptd['sequence'] = fp.sequence()

    return