Source code for ample.util.residue_map

"""Useful manipulations on PDB files"""

import types

from ample.util import ample_util, pdb_model


[docs]class residueSequenceMap(object):
    """Class for handling mapping between model and native residue indices."""

    def __init__(self, refPdb=None, targetPdb=None):

        self.refResSeq = []
        self.refSequence = None
        self.refCAlphaMask = []
        self.refBbMask = []
        self.refOffset = None
        self._refIncomparable = None  # List of atoms in the model that cannot be compared to the model

        self.targetResSeq = []
        self.targetSequence = None
        self.targetCAlphaMask = []
        self.targetBbMask = []
        self.targetOffset = None  # Where the matched part of the sequences starts in the model
        self._targetIncomparable = None  # List of atoms in the model that cannot be compared to the native

        self.lenMatch = None

        # The window of AA we used to check for a match
        self.probeLen = 10

        # maxInset is the max number of AA into the sequence that we will go searching for a match - i.e. if more
        # then maxInset AA at the start are non-matching, we won't find the match
        self.maxInset = 50

        # Like this just for testing
        if refPdb and targetPdb:
            self.calc_map(refPdb, targetPdb)

        return

[docs]    def ref2target(self, refResSeq):
        """Return the target resSeq for the given reference resSeq.
        This will calculate a resSeq in the target if there isn't one.
        """

        # Work out how many residues from the start of the matching region this residue is in the target
        indent = refResSeq - self.refResSeq[self.refOffset]

        # calculate the corresponding index in the reference
        targetResSeq = self.targetResSeq[self.targetOffset] + indent

        ## paranoid check
        # if 0 < self.targetOffset + indent < len( self.targetResSeq ):
        #    assert targetResSeq == self.targetResSeq[ self.targetOffset + indent ]

        return targetResSeq

[docs]    def target2ref(self, targetResSeq):
        """Return the referece resSeq for the given target resSeq.
        This will calculate a resSeq in the reference if there isn't one.
        """

        # Work out how many residues from the start of the matching region this residue is in the target
        indent = targetResSeq - self.targetResSeq[self.targetOffset]

        refResSeq = self.refResSeq[self.refOffset] + indent

        ## paranoid check
        # if 0 < self.refOffset + indent < len( self.refResSeq ):
        #    assert refResSeq == self.refResSeq[ self.refOffset + indent ]

        return refResSeq

[docs]    def targetIncomparable(self, cAlphaMask=True, bbMask=False):
        """Return a list of the resSeq in the target that cannot be compared to the reference.
        This includes any where there isn't a corresponding residue in the reference, or there isn't a c-alpha
        or backbone atom in either (if cAlphaMask or bbMask is set)
        """

        if self._targetIncomparable == None:

            self._targetIncomparable = []
            for i, resSeq in enumerate(self.targetResSeq):

                # Before the start of the matching region
                if i < self.targetOffset:
                    self._targetIncomparable.append(resSeq)
                    continue

                # After end of matching region
                if i > self.lenMatch + 1:
                    self._targetIncomparable.append(resSeq)
                    continue

                # In matching region but no C-alpha
                if cAlphaMask:
                    if self.targetCAlphaMask[i]:
                        self._targetIncomparable.append(resSeq)
                        continue

                # In matching region but no complete bbatoms
                if bbMask:
                    if self.targetBbMask[i]:
                        self._targetIncomparable.append(resSeq)
                        continue

                # Matching residues in reference
                refResSeq = self.target2ref(resSeq)
                try:
                    j = self.refResSeq.index(refResSeq)
                except ValueError:
                    # A residue that isn't actually in the reference
                    self._targetIncomparable.append(resSeq)
                    continue

                # No C-Alpha
                if cAlphaMask:
                    if self.refCAlphaMask[j]:
                        self._targetIncomparable.append(resSeq)
                        continue

                # No bbMask
                if bbMask:
                    if self.refBbMask[j]:
                        self._targetIncomparable.append(resSeq)
                        continue

        return self._targetIncomparable

[docs]    def refIncomparable(self, cAlphaMask=True, bbMask=False):
        """Return a list of the resSeq in the reference that cannot be compared to the target.
        This includes any where there isn't a corresponding residue in the target, or there isn't a c-alpha
        or backbone atom in either (if cAlphaMask or bbMask is set)
        """

        if self._refIncomparable == None:
            self._refIncomparable = []
            for i, resSeq in enumerate(self.refResSeq):

                # Before the start of the matching region
                if i < self.refOffset:
                    self._refIncomparable.append(resSeq)
                    continue

                # After end of matching region
                if i > self.lenMatch + 1:
                    self._refIncomparable.append(resSeq)
                    continue

                # In matching region but no C-alpha
                if cAlphaMask:
                    if self.refCAlphaMask[i]:
                        self._refIncomparable.append(resSeq)
                        continue

                # In matching region but no complete bbatoms
                if bbMask:
                    if self.refBbMask[i]:
                        self._refIncomparable.append(resSeq)
                        continue

                # Matching residues in reference
                targetResSeq = self.ref2target(resSeq)
                try:
                    j = self.targetResSeq.index(targetResSeq)
                except ValueError:
                    # A residue that isn't actually in the reference
                    self._refIncomparable.append(resSeq)
                    continue

                # No C-Alpha
                if cAlphaMask:
                    if self.targetCAlphaMask[j]:
                        self._refIncomparable.append(resSeq)
                        continue

                # No bbMask
                if bbMask:
                    if self.targetBbMask[j]:
                        self._refIncomparable.append(resSeq)
                        continue

        return self._refIncomparable

    def __str__(self):
        me = {}
        for slot in dir(self):
            attr = getattr(self, slot)
            if not slot.startswith("__") and not (
                isinstance(attr, types.MethodType) or isinstance(attr, types.FunctionType)
            ):
                me[slot] = attr

        s = self.__repr__() + "\n"
        for k in sorted(me.keys()):
            s += "{0}: {1}\n".format(k, me[k])
        return s

[docs]    def calc_map(self, nativePdb, modelPdb):

        self.refSequence, self.refResSeq, self.refCAlphaMask = self.read_pdb(nativePdb)
        self.targetSequence, self.targetResSeq, self.targetCAlphaMask = self.read_pdb(modelPdb)

        self._calc_map()

        return

    def _calc_map(self):
        """Return a ResSeqMap mapping the index of a residue in the model to the corresponding residue in the native.
        Only works if 1 chain in either file and with standard residues
        """
        self.refOffset, self.targetOffset = self._calcOffset(self.refSequence, self.targetSequence)
        self.lenMatch = self._lenMatch()
        self._checkContiguous()

    def _checkContiguous(self):
        """
        Check that there is congiuous resSeq numbering as otherwise the map in current form won't work
        Can only check from the start while residues are matching - won't work after the first gap
        """
        p_refResSeq = p_targetResSeq = None
        for count in range(self.lenMatch):

            refIdx = self.refOffset + count
            refAA = self.refSequence[refIdx]
            refResSeq = self.refResSeq[refIdx]

            targetIdx = self.targetOffset + count
            targetAA = self.targetSequence[targetIdx]
            targetResSeq = self.targetResSeq[targetIdx]

            if count != 0:

                # Need to stop after the first gap
                if refAA != targetAA:
                    break

                # Complain if the residues match but the numbering doesn't
                if refResSeq != p_refResSeq + 1 or targetResSeq != p_targetResSeq + 1:
                    raise RuntimeError(
                        "Non-contiguous residue numbering: {}->{} and {}->{}".format(
                            p_refResSeq, refResSeq, p_targetResSeq, targetResSeq
                        )
                    )
            p_refResSeq = refResSeq
            p_targetResSeq = targetResSeq

    def _calcOffset(self, refSequence, targetSequence, reverse=False):

        # Probe length dependant on protein size - is length of shortest protein or the default
        shortest = min(len(targetSequence), len(refSequence))
        probeLen = min(self.probeLen, shortest)

        # Work out how far we can probe into each sequence
        if len(targetSequence) - probeLen >= self.maxInset:
            targetMaxInset = self.maxInset
        else:
            targetMaxInset = len(targetSequence) - probeLen

        if len(refSequence) - probeLen >= self.maxInset:
            refMaxInset = self.maxInset
        else:
            refMaxInset = len(refSequence) - probeLen

        # If checking from the end, reverse the strings
        if reverse:
            refSequence = refSequence[::-1]
            targetSequence = targetSequence[::-1]

        got = False
        for targetOffset in range(targetMaxInset + 1):
            probe = targetSequence[targetOffset : targetOffset + probeLen]
            for refOffset in range(refMaxInset + 1):
                if refSequence[refOffset : refOffset + probeLen] == probe:
                    got = True
                    break

            if got:
                break

        if not got:
            raise RuntimeError("Could not calculate map for:\n{}\n{}".format(refSequence, targetSequence))

        return refOffset, targetOffset

    def _lenMatch(self):
        refBackOffset, targetBackOffset = self._calcOffset(self.refSequence, self.targetSequence, reverse=True)
        # Calculate match from the residue numbers - use reference for now
        length = self.refResSeq[len(self.refSequence) - 1 - refBackOffset] - self.refResSeq[self.refOffset] + 1

        if length > len(self.refResSeq) and length > len(self.targetResSeq):
            raise RuntimeError("Match of {} is longer than both of the sequences!".format(length))

        return length

[docs]    def fromInfo(self, refInfo=None, refChainID=None, targetInfo=None, targetChainID=None, modelIdx=0):
        """Create a map from 2 info objects"""

        # Determine index of chain so we know where to get the data from
        nativeIdx = refInfo.models[modelIdx].chains.index(refChainID)

        self.refResSeq = refInfo.models[modelIdx].resSeqs[nativeIdx]
        self.refSequence = refInfo.models[modelIdx].sequences[nativeIdx]
        self.refCAlphaMask = refInfo.models[modelIdx].caMask[nativeIdx]
        self.refBbMask = refInfo.models[modelIdx].bbMask[nativeIdx]
        self.refOffset = None
        self._refIncomparable = None

        targetIdx = targetInfo.models[modelIdx].chains.index(targetChainID)
        self.targetResSeq = targetInfo.models[modelIdx].resSeqs[targetIdx]
        self.targetSequence = targetInfo.models[modelIdx].sequences[targetIdx]
        self.targetCAlphaMask = targetInfo.models[modelIdx].caMask[targetIdx]
        self.targetBbMask = targetInfo.models[modelIdx].bbMask[targetIdx]
        self.targetOffset = None
        self._targetIncomparable = None

        self._calc_map()

[docs]    def read_pdb(self, pdb):
        """Get sequence as string of 1AA
        get list of matching resSeq
        """

        atomTypes = []

        resSeq = []
        resName = []
        _atomTypes = []
        atomTypesList = []

        chain = None
        readingResSeq = None
        readingResName = None
        for line in open(pdb):

            if line.startswith("MODEL"):
                raise RuntimeError("FOUND MULTI_MODEL FILE!")

            if line.startswith("TER"):
                break

            if line.startswith("ATOM"):

                atom = pdb_model.PdbAtom(line)

                if not chain:
                    chain = atom.chainID

                if atom.chainID != chain:
                    raise RuntimeError("FOUND ADDITIONAL CHAIN")

                # First atom in first residue
                if readingResSeq == None:
                    readingResSeq = atom.resSeq
                    readingResName = atom.resName
                    _atomTypes.append(atom.name.strip())
                    continue

                if readingResSeq != atom.resSeq:
                    resName.append(readingResName)
                    resSeq.append(readingResSeq)
                    atomTypesList.append(_atomTypes)

                    readingResSeq = atom.resSeq
                    readingResName = atom.resName
                    _atomTypes = [atom.name.strip()]
                else:
                    if atom.name not in _atomTypes:
                        _atomTypes.append(atom.name.strip())

        resName.append(readingResName)
        resSeq.append(readingResSeq)
        atomTypesList.append(_atomTypes)

        sequence = ""
        for n in resName:
            sequence += ample_util.three2one[n]

        cAlphaMask = ['CA' not in atomTypes for atomTypes in atomTypesList]
        return sequence, resSeq, cAlphaMask

[docs]    def resSeqMatch(self):
        """Return true if the residue numbering between the model and native over the aligned region is the same"""
        return (
            self.targetResSeq[self.targetOffset : self.targetOffset + self.lenMatch]
            == self.refResSeq[self.refOffset : self.refOffset + self.lenMatch]
        )