Source code for ample.util.residue_map

'''
Useful manipulations on PDB files
'''

import types

import pdb_edit
import pdb_model

[docs]class residueSequenceMap( object ):
    """Class for handling mapping between model and native residue indices.
    
     """
    
    def __init__( self, refPdb=None, targetPdb=None ):
        
        
        self.refResSeq = []
        self.refSequence = None
        self.refCAlphaMask = []
        self.refBbMask = []
        self.refOffset = None
        self._refIncomparable = None # List of atoms in the model that cannot be compared to the model
        
        self.targetResSeq = []
        self.targetSequence = None
        self.targetCAlphaMask = []
        self.targetBbMask = []
        self.targetOffset = None # Where the matched part of the sequences starts in the model
        self._targetIncomparable = None # List of atoms in the model that cannot be compared to the native
        
        self.lenMatch = None

        # The window of AA we used to check for a match    
        self.probeLen = 10
        
        # maxInset is the max number of AA into the sequence that we will go searching for a match - i.e. if more
        # then maxInset AA at the start are non-matching, we won't find the match 
        self.maxInset = 50
        
        # Like this just for testing
        if refPdb and targetPdb:
            self.calc_map(refPdb, targetPdb)
        
        return
    
[docs]    def ref2target( self, refResSeq ):
        """Return the target resSeq for the given reference resSeq.
        This will calculate a resSeq in the target if there isn't one.
        """
        
        # Work out how many residues from the start of the matching region this residue is in the target
        indent = refResSeq - self.refResSeq[ self.refOffset ]
        
        # calculate the corresponding index in the reference
        targetResSeq = self.targetResSeq[ self.targetOffset ] + indent
        
        ## paranoid check
        #if 0 < self.targetOffset + indent < len( self.targetResSeq ):
        #    assert targetResSeq == self.targetResSeq[ self.targetOffset + indent ]
        
        return targetResSeq
         
[docs]    def target2ref( self, targetResSeq ):
        """Return the referece resSeq for the given target resSeq.
        This will calculate a resSeq in the reference if there isn't one.
        """
        
        # Work out how many residues from the start of the matching region this residue is in the target
        indent = targetResSeq - self.targetResSeq[ self.targetOffset ]
        
        refResSeq = self.refResSeq[ self.refOffset ] + indent
        
        ## paranoid check
        #if 0 < self.refOffset + indent < len( self.refResSeq ):
        #    assert refResSeq == self.refResSeq[ self.refOffset + indent ]
        
        return refResSeq

[docs]    def targetIncomparable( self, cAlphaMask=True, bbMask=False ):
        """Return a list of the resSeq in the target that cannot be compared to the reference.
        This includes any where there isn't a corresponding residue in the reference, or there isn't a c-alpha
        or backbone atom in either (if cAlphaMask or bbMask is set)
        """
        
        if self._targetIncomparable == None:
            
            self._targetIncomparable = []
            for i, resSeq in enumerate( self.targetResSeq ):
                
                # Before the start of the matching region
                if i < self.targetOffset:
                    self._targetIncomparable.append( resSeq )
                    continue
                
                # After end of matching region
                if i > self.lenMatch + 1:
                    self._targetIncomparable.append( resSeq )
                    continue
                
                # In matching region but no C-alpha
                if cAlphaMask:
                    if self.targetCAlphaMask[ i ]:
                        self._targetIncomparable.append( resSeq )
                        continue
                    
                # In matching region but no complete bbatoms
                if bbMask:
                    if self.targetBbMask[ i ]:
                        self._targetIncomparable.append( resSeq )
                        continue
                
                # Matching residues in reference
                refResSeq = self.target2ref( resSeq )
                try:
                    j = self.refResSeq.index( refResSeq )
                except ValueError:
                    # A residue that isn't actually in the reference
                    self._targetIncomparable.append( resSeq )
                    continue
                
                # No C-Alpha
                if cAlphaMask:
                    if self.refCAlphaMask[ j ]:
                        self._targetIncomparable.append( resSeq )
                        continue
                    
                # No bbMask
                if bbMask:
                    if self.refBbMask[ j ]:
                        self._targetIncomparable.append( resSeq )
                        continue
            
        return self._targetIncomparable

[docs]    def refIncomparable( self, cAlphaMask=True, bbMask=False ):
        """Return a list of the resSeq in the reference that cannot be compared to the target.
        This includes any where there isn't a corresponding residue in the target, or there isn't a c-alpha
        or backbone atom in either (if cAlphaMask or bbMask is set)
        """
        
        if self._refIncomparable == None:
            self._refIncomparable = []
            for i, resSeq in enumerate( self.refResSeq ):
                
                # Before the start of the matching region
                if i < self.refOffset:
                    self._refIncomparable.append( resSeq )
                    continue

                # After end of matching region
                if i > self.lenMatch + 1:
                    self._refIncomparable.append( resSeq )
                    continue

                # In matching region but no C-alpha
                if cAlphaMask:
                    if self.refCAlphaMask[ i ]:
                        self._refIncomparable.append( resSeq )
                        continue
                    
                # In matching region but no complete bbatoms
                if bbMask:
                    if self.refBbMask[ i ]:
                        self._refIncomparable.append( resSeq )
                        continue
                
                # Matching residues in reference
                targetResSeq = self.ref2target( resSeq )
                try:
                    j = self.targetResSeq.index( targetResSeq )
                except ValueError:
                    # A residue that isn't actually in the reference
                    self._refIncomparable.append( resSeq )
                    continue
                
                # No C-Alpha
                if cAlphaMask:
                    if self.targetCAlphaMask[ j ]:
                        self._refIncomparable.append( resSeq )
                        continue
                    
                # No bbMask
                if bbMask:
                    if self.targetBbMask[ j ]:
                        self._refIncomparable.append( resSeq )
                        continue
            
        return self._refIncomparable

    def __str__(self):
        me = {}
        for slot in dir(self):
            attr = getattr(self, slot)
            if not slot.startswith("__") and not ( isinstance(attr, types.MethodType) or
              isinstance(attr, types.FunctionType) ):
                me[slot] = attr
        
        s = self.__repr__() + "\n"
        for k in sorted( me.keys() ):
            s += "{0}: {1}\n".format( k, me[k]  )
        return s
    
[docs]    def calc_map( self, nativePdb, modelPdb ):
        
        self.refSequence, self.refResSeq, self.refCAlphaMask = self.read_pdb( nativePdb )
        self.targetSequence, self.targetResSeq, self.targetCAlphaMask = self.read_pdb( modelPdb )
        
        self._calc_map()
        
        return
    
    def _calc_map( self ):
        """Return a ResSeqMap mapping the index of a residue in the model to the corresponding residue in the native.
        Only works if 1 chain in either file and with standard residues
        """
        
        #if len(self.refSequence) < self.probeLen or len(self.targetSequence) < self.probeLen:
        #    raise RuntimeError,"Very short sequences - this will not work: {0}  : {1}".format( self.refSequence, self.targetSequence )

        if False:        
            print "Calculating map for:\n{0}\n{1}\n{2}\n{3}\n".format( self.refSequence,
                                                                      self.targetSequence,
                                                                      self.refResSeq,
                                                                      self.targetResSeq,
                                                                     )
        
        # Find where they match at the start
        self.refOffset, self.targetOffset = self._calcOffset( self.refSequence, self.targetSequence )
        #print "Got refOffset, ", self.refOffset
        #print "Got refResSeq ",self.refResSeq[ self.refOffset ]
        #print "Got targetOffset ", self.targetOffset
        #print "Got targetResSeq ",self.targetResSeq[ self.targetOffset ]
        
        self.lenMatch = self._lenMatch()
        
        self._checkContiguous()
        
        return
    
    def _XcheckContiguous(self):
        """UNUSED"""
        #
        # Check that there is congiuous resSeq numbering as otherwise the map in current form won't work
        #
        # For reference
        previous=self.refResSeq[ self.refOffset ]
        for i in range( self.refOffset + 1, self.refOffset + self.lenMatch ):
            
            if self.refResSeq[ i ] != previous + 1:
                raise RuntimeError,"Non-contiguous residue numbering in refSequence for {0} to {1}".format( previous, self.refResSeq[ i ] )
            
            previous = self.refResSeq[ i ]
            #print "GOT i {0}, refResSeq {1}, refSequence {2}".format(  i, self.refResSeq[ i ], self.refSequence[ i ]  )
        
        # Now for target
        previous=self.targetResSeq[ self.targetOffset ]
        for i in range( self.targetOffset + 1, self.targetOffset + self.lenMatch ):
            
            if self.targetResSeq[ i ] != previous + 1:
                raise RuntimeError,"Non-contiguous residue numbering in refSequence for {0} to {1}".format( previous, self.targetResSeq[ i ] )
            
            previous = self.targetResSeq[ i ]
            #print "GOT i {0}, targetResSeq {1}, targetSequence {2}".format(  i, self.targetResSeq[ i ], self.targetSequence[ i ]  )
         
        return
    
    def _checkContiguous(self):
        """
        Check that there is congiuous resSeq numbering as otherwise the map in current form won't work
        Can only check from the start while residues are matching - won't work after the first gap
        """
        p_refResSeq = p_targetResSeq = None
        for count in range( self.lenMatch ):
            
            refIdx = self.refOffset + count
            refAA = self.refSequence[ refIdx ]
            refResSeq = self.refResSeq[ refIdx ]
            
            targetIdx = self.targetOffset + count
            targetAA = self.targetSequence[ targetIdx ]
            targetResSeq = self.targetResSeq[ targetIdx ]
            
            if count != 0:
                
                # Need to stop after the first gap
                if refAA != targetAA:
                    break
                
                # Complain if the residues match but the numbering doesn't
                if refResSeq != p_refResSeq + 1 or targetResSeq !=  p_targetResSeq + 1:
                    raise RuntimeError,"Non-contiguous residue numbering: {0}->{1} and {2}->{3}".format( p_refResSeq, refResSeq, p_targetResSeq, targetResSeq  )
            
            p_refResSeq = refResSeq
            p_targetResSeq = targetResSeq
            
        return
     
    
    def _calcOffset(self, refSequence, targetSequence, reverse=False ):
        
        # Probe length dependant on protein size - is length of shortest protein or the default
        shortest = min( len(targetSequence), len(refSequence) )
        probeLen = min( self.probeLen, shortest )
        
        # Work out how far we can probe into each sequence
        if len( targetSequence ) - probeLen >= self.maxInset:
            targetMaxInset = self.maxInset
        else:
            targetMaxInset =  len( targetSequence ) - probeLen 
        
        if len( refSequence ) - probeLen >= self.maxInset:
            refMaxInset = self.maxInset
        else:
            refMaxInset =  len( refSequence ) - probeLen 

        #print "GOT targetMaxInset, refMaxInset, probeLen ",targetMaxInset, refMaxInset, probeLen

        # If checking from the end, reverse the strings
        if reverse:
            refSequence = refSequence[::-1]
            targetSequence = targetSequence[::-1]
        
        got=False
        for targetOffset in range( targetMaxInset + 1 ):
            probe = targetSequence[ targetOffset : targetOffset + probeLen ]
            #print "PROBE ",probe
            for refOffset in range( refMaxInset + 1 ):
                #print "TEST ",refSequence[ refOffset:refOffset + probeLen ]
                if refSequence[ refOffset:refOffset + probeLen ] == probe:
                    got=True
                    break
            
            if got:
#                 print "GOT MODEL MATCH AT i,j ",targetOffset,refOffset
                break
            
        if not got:
            raise RuntimeError,"Could not calculate map for:\n{0}\n{1}".format( refSequence, targetSequence )
        
        return ( refOffset, targetOffset )
    
    def _lenMatch(self):
        refBackOffset, targetBackOffset = self._calcOffset( self.refSequence, self.targetSequence, reverse=True )
        #print "Got refBackOffset, ", refBackOffset
        #print "Got targetBackOffset ", targetBackOffset
        # Calculate match from the residue numbers - use reference for now
        length = self.refResSeq[ len(self.refSequence) - 1 - refBackOffset ] - self.refResSeq[ self.refOffset ] + 1
        
        if length > len(self.refResSeq) and length > len(self.targetResSeq):
            raise RuntimeError, "Match of {0} is longer than both of the sequences!".format( length )
        
        return length

[docs]    def fromInfo(self, refInfo=None, refChainID=None, targetInfo=None, targetChainID=None, modelIdx=0):
        """Create a map from 2 info objects"""
        
        # Determine index of chain so we know where to get the data from
        nativeIdx = refInfo.models[ modelIdx ].chains.index( refChainID )

        self.refResSeq = refInfo.models[ modelIdx ].resSeqs[ nativeIdx ]
        self.refSequence = refInfo.models[ modelIdx ].sequences[ nativeIdx ]
        self.refCAlphaMask = refInfo.models[ modelIdx ].caMask[ nativeIdx ]
        self.refBbMask = refInfo.models[ modelIdx ].bbMask[ nativeIdx ]
        self.refOffset = None
        self._refIncomparable = None

        targetIdx = targetInfo.models[ modelIdx ].chains.index( targetChainID )
        self.targetResSeq = targetInfo.models[ modelIdx ].resSeqs[ targetIdx ]
        self.targetSequence = targetInfo.models[ modelIdx ].sequences[ targetIdx ]
        self.targetCAlphaMask = targetInfo.models[ modelIdx ].caMask[ targetIdx ]
        self.targetBbMask =  targetInfo.models[ modelIdx ].bbMask[ targetIdx ]
        self.targetOffset = None
        self._targetIncomparable = None
        
        #print "refSequence    ",self.refSequence
        #print "targetSequence   ", self.targetSequence
        
        self._calc_map()
        
        return
    
[docs]    def read_pdb( self, pdb ):
        """Get sequence as string of 1AA
        get list of matching resSeq
        """
         
        atomTypes = [] # For checking we have all required atom types
     
        resSeq = []
        resName = []
        _atomTypes = []
        atomTypesList = []
         
        chain=None
        readingResSeq=None
        readingResName=None
        for line in open( pdb ):
             
            if line.startswith("MODEL"):
                raise RuntimeError,"FOUND MULTI_MODEL FILE!"
             
            if line.startswith("TER"):
                break
             
            if line.startswith("ATOM"):
                 
                atom = pdb_model.PdbAtom( line )
                 
                if not chain:
                    chain = atom.chainID
                 
                if atom.chainID != chain:
                    raise RuntimeError," FOUND ADDITIONAL CHAIN"
                    break
                     
                # First atom in first residue
                if readingResSeq == None:
                    readingResSeq = atom.resSeq
                    readingResName = atom.resName
                    _atomTypes.append( atom.name.strip() )
                    continue
                 
                if readingResSeq != atom.resSeq:
                    # Adding a new residue
                     
                    # Add the atom we've just finished reading
                    resName.append( readingResName )
                    resSeq.append( readingResSeq )
                    atomTypesList.append( _atomTypes )
                     
                    # Reset
                    readingResSeq = atom.resSeq
                    readingResName = atom.resName
                    _atomTypes = [ atom.name.strip() ]
                else:
                    if atom.name not in _atomTypes:
                        _atomTypes.append( atom.name.strip() )
                         
        # End reading loop
 
        # Add the atom we've just finished reading
        resName.append( readingResName )
        resSeq.append( readingResSeq )
        atomTypesList.append( _atomTypes )
         
        sequence = ""
        # Build up the sequence
        for n in resName:
            sequence += pdb_edit.three2one[ n ]
         
        # Build up the mask
        cAlphaMask = []
        for atomTypes in atomTypesList:
            if 'CA' not in atomTypes:
                cAlphaMask.append( True )
            else:
                cAlphaMask.append( False )
         
        return ( sequence, resSeq, cAlphaMask )
    
[docs]    def resSeqMatch(self):
        """Return true if the residue numbering between the model and native over the aligned region is the same"""
        
        #print self.targetResSeq[ self.targetOffset : self.targetOffset + self.lenMatch ]
        #print self.refResSeq[ self.refOffset : self.refOffset + self.lenMatch ]
        return self.targetResSeq[ self.targetOffset : self.targetOffset + self.lenMatch ] == self.refResSeq[ self.refOffset : self.refOffset + self.lenMatch ]