Source code for ample.util.residue_map

'''
Useful manipulations on PDB files
'''

import types

import pdb_edit
import pdb_model

[docs]class residueSequenceMap( object ): """Class for handling mapping between model and native residue indices. """ def __init__( self, refPdb=None, targetPdb=None ): self.refResSeq = [] self.refSequence = None self.refCAlphaMask = [] self.refBbMask = [] self.refOffset = None self._refIncomparable = None # List of atoms in the model that cannot be compared to the model self.targetResSeq = [] self.targetSequence = None self.targetCAlphaMask = [] self.targetBbMask = [] self.targetOffset = None # Where the matched part of the sequences starts in the model self._targetIncomparable = None # List of atoms in the model that cannot be compared to the native self.lenMatch = None # The window of AA we used to check for a match self.probeLen = 10 # maxInset is the max number of AA into the sequence that we will go searching for a match - i.e. if more # then maxInset AA at the start are non-matching, we won't find the match self.maxInset = 50 # Like this just for testing if refPdb and targetPdb: self.calc_map(refPdb, targetPdb) return
[docs] def ref2target( self, refResSeq ): """Return the target resSeq for the given reference resSeq. This will calculate a resSeq in the target if there isn't one. """ # Work out how many residues from the start of the matching region this residue is in the target indent = refResSeq - self.refResSeq[ self.refOffset ] # calculate the corresponding index in the reference targetResSeq = self.targetResSeq[ self.targetOffset ] + indent ## paranoid check #if 0 < self.targetOffset + indent < len( self.targetResSeq ): # assert targetResSeq == self.targetResSeq[ self.targetOffset + indent ] return targetResSeq
[docs] def target2ref( self, targetResSeq ): """Return the referece resSeq for the given target resSeq. This will calculate a resSeq in the reference if there isn't one. """ # Work out how many residues from the start of the matching region this residue is in the target indent = targetResSeq - self.targetResSeq[ self.targetOffset ] refResSeq = self.refResSeq[ self.refOffset ] + indent ## paranoid check #if 0 < self.refOffset + indent < len( self.refResSeq ): # assert refResSeq == self.refResSeq[ self.refOffset + indent ] return refResSeq
[docs] def targetIncomparable( self, cAlphaMask=True, bbMask=False ): """Return a list of the resSeq in the target that cannot be compared to the reference. This includes any where there isn't a corresponding residue in the reference, or there isn't a c-alpha or backbone atom in either (if cAlphaMask or bbMask is set) """ if self._targetIncomparable == None: self._targetIncomparable = [] for i, resSeq in enumerate( self.targetResSeq ): # Before the start of the matching region if i < self.targetOffset: self._targetIncomparable.append( resSeq ) continue # After end of matching region if i > self.lenMatch + 1: self._targetIncomparable.append( resSeq ) continue # In matching region but no C-alpha if cAlphaMask: if self.targetCAlphaMask[ i ]: self._targetIncomparable.append( resSeq ) continue # In matching region but no complete bbatoms if bbMask: if self.targetBbMask[ i ]: self._targetIncomparable.append( resSeq ) continue # Matching residues in reference refResSeq = self.target2ref( resSeq ) try: j = self.refResSeq.index( refResSeq ) except ValueError: # A residue that isn't actually in the reference self._targetIncomparable.append( resSeq ) continue # No C-Alpha if cAlphaMask: if self.refCAlphaMask[ j ]: self._targetIncomparable.append( resSeq ) continue # No bbMask if bbMask: if self.refBbMask[ j ]: self._targetIncomparable.append( resSeq ) continue return self._targetIncomparable
[docs] def refIncomparable( self, cAlphaMask=True, bbMask=False ): """Return a list of the resSeq in the reference that cannot be compared to the target. This includes any where there isn't a corresponding residue in the target, or there isn't a c-alpha or backbone atom in either (if cAlphaMask or bbMask is set) """ if self._refIncomparable == None: self._refIncomparable = [] for i, resSeq in enumerate( self.refResSeq ): # Before the start of the matching region if i < self.refOffset: self._refIncomparable.append( resSeq ) continue # After end of matching region if i > self.lenMatch + 1: self._refIncomparable.append( resSeq ) continue # In matching region but no C-alpha if cAlphaMask: if self.refCAlphaMask[ i ]: self._refIncomparable.append( resSeq ) continue # In matching region but no complete bbatoms if bbMask: if self.refBbMask[ i ]: self._refIncomparable.append( resSeq ) continue # Matching residues in reference targetResSeq = self.ref2target( resSeq ) try: j = self.targetResSeq.index( targetResSeq ) except ValueError: # A residue that isn't actually in the reference self._refIncomparable.append( resSeq ) continue # No C-Alpha if cAlphaMask: if self.targetCAlphaMask[ j ]: self._refIncomparable.append( resSeq ) continue # No bbMask if bbMask: if self.targetBbMask[ j ]: self._refIncomparable.append( resSeq ) continue return self._refIncomparable
def __str__(self): me = {} for slot in dir(self): attr = getattr(self, slot) if not slot.startswith("__") and not ( isinstance(attr, types.MethodType) or isinstance(attr, types.FunctionType) ): me[slot] = attr s = self.__repr__() + "\n" for k in sorted( me.keys() ): s += "{0}: {1}\n".format( k, me[k] ) return s
[docs] def calc_map( self, nativePdb, modelPdb ): self.refSequence, self.refResSeq, self.refCAlphaMask = self.read_pdb( nativePdb ) self.targetSequence, self.targetResSeq, self.targetCAlphaMask = self.read_pdb( modelPdb ) self._calc_map() return
def _calc_map( self ): """Return a ResSeqMap mapping the index of a residue in the model to the corresponding residue in the native. Only works if 1 chain in either file and with standard residues """ #if len(self.refSequence) < self.probeLen or len(self.targetSequence) < self.probeLen: # raise RuntimeError,"Very short sequences - this will not work: {0} : {1}".format( self.refSequence, self.targetSequence ) if False: print "Calculating map for:\n{0}\n{1}\n{2}\n{3}\n".format( self.refSequence, self.targetSequence, self.refResSeq, self.targetResSeq, ) # Find where they match at the start self.refOffset, self.targetOffset = self._calcOffset( self.refSequence, self.targetSequence ) #print "Got refOffset, ", self.refOffset #print "Got refResSeq ",self.refResSeq[ self.refOffset ] #print "Got targetOffset ", self.targetOffset #print "Got targetResSeq ",self.targetResSeq[ self.targetOffset ] self.lenMatch = self._lenMatch() self._checkContiguous() return def _XcheckContiguous(self): """UNUSED""" # # Check that there is congiuous resSeq numbering as otherwise the map in current form won't work # # For reference previous=self.refResSeq[ self.refOffset ] for i in range( self.refOffset + 1, self.refOffset + self.lenMatch ): if self.refResSeq[ i ] != previous + 1: raise RuntimeError,"Non-contiguous residue numbering in refSequence for {0} to {1}".format( previous, self.refResSeq[ i ] ) previous = self.refResSeq[ i ] #print "GOT i {0}, refResSeq {1}, refSequence {2}".format( i, self.refResSeq[ i ], self.refSequence[ i ] ) # Now for target previous=self.targetResSeq[ self.targetOffset ] for i in range( self.targetOffset + 1, self.targetOffset + self.lenMatch ): if self.targetResSeq[ i ] != previous + 1: raise RuntimeError,"Non-contiguous residue numbering in refSequence for {0} to {1}".format( previous, self.targetResSeq[ i ] ) previous = self.targetResSeq[ i ] #print "GOT i {0}, targetResSeq {1}, targetSequence {2}".format( i, self.targetResSeq[ i ], self.targetSequence[ i ] ) return def _checkContiguous(self): """ Check that there is congiuous resSeq numbering as otherwise the map in current form won't work Can only check from the start while residues are matching - won't work after the first gap """ p_refResSeq = p_targetResSeq = None for count in range( self.lenMatch ): refIdx = self.refOffset + count refAA = self.refSequence[ refIdx ] refResSeq = self.refResSeq[ refIdx ] targetIdx = self.targetOffset + count targetAA = self.targetSequence[ targetIdx ] targetResSeq = self.targetResSeq[ targetIdx ] if count != 0: # Need to stop after the first gap if refAA != targetAA: break # Complain if the residues match but the numbering doesn't if refResSeq != p_refResSeq + 1 or targetResSeq != p_targetResSeq + 1: raise RuntimeError,"Non-contiguous residue numbering: {0}->{1} and {2}->{3}".format( p_refResSeq, refResSeq, p_targetResSeq, targetResSeq ) p_refResSeq = refResSeq p_targetResSeq = targetResSeq return def _calcOffset(self, refSequence, targetSequence, reverse=False ): # Probe length dependant on protein size - is length of shortest protein or the default shortest = min( len(targetSequence), len(refSequence) ) probeLen = min( self.probeLen, shortest ) # Work out how far we can probe into each sequence if len( targetSequence ) - probeLen >= self.maxInset: targetMaxInset = self.maxInset else: targetMaxInset = len( targetSequence ) - probeLen if len( refSequence ) - probeLen >= self.maxInset: refMaxInset = self.maxInset else: refMaxInset = len( refSequence ) - probeLen #print "GOT targetMaxInset, refMaxInset, probeLen ",targetMaxInset, refMaxInset, probeLen # If checking from the end, reverse the strings if reverse: refSequence = refSequence[::-1] targetSequence = targetSequence[::-1] got=False for targetOffset in range( targetMaxInset + 1 ): probe = targetSequence[ targetOffset : targetOffset + probeLen ] #print "PROBE ",probe for refOffset in range( refMaxInset + 1 ): #print "TEST ",refSequence[ refOffset:refOffset + probeLen ] if refSequence[ refOffset:refOffset + probeLen ] == probe: got=True break if got: # print "GOT MODEL MATCH AT i,j ",targetOffset,refOffset break if not got: raise RuntimeError,"Could not calculate map for:\n{0}\n{1}".format( refSequence, targetSequence ) return ( refOffset, targetOffset ) def _lenMatch(self): refBackOffset, targetBackOffset = self._calcOffset( self.refSequence, self.targetSequence, reverse=True ) #print "Got refBackOffset, ", refBackOffset #print "Got targetBackOffset ", targetBackOffset # Calculate match from the residue numbers - use reference for now length = self.refResSeq[ len(self.refSequence) - 1 - refBackOffset ] - self.refResSeq[ self.refOffset ] + 1 if length > len(self.refResSeq) and length > len(self.targetResSeq): raise RuntimeError, "Match of {0} is longer than both of the sequences!".format( length ) return length
[docs] def fromInfo(self, refInfo=None, refChainID=None, targetInfo=None, targetChainID=None, modelIdx=0): """Create a map from 2 info objects""" # Determine index of chain so we know where to get the data from nativeIdx = refInfo.models[ modelIdx ].chains.index( refChainID ) self.refResSeq = refInfo.models[ modelIdx ].resSeqs[ nativeIdx ] self.refSequence = refInfo.models[ modelIdx ].sequences[ nativeIdx ] self.refCAlphaMask = refInfo.models[ modelIdx ].caMask[ nativeIdx ] self.refBbMask = refInfo.models[ modelIdx ].bbMask[ nativeIdx ] self.refOffset = None self._refIncomparable = None targetIdx = targetInfo.models[ modelIdx ].chains.index( targetChainID ) self.targetResSeq = targetInfo.models[ modelIdx ].resSeqs[ targetIdx ] self.targetSequence = targetInfo.models[ modelIdx ].sequences[ targetIdx ] self.targetCAlphaMask = targetInfo.models[ modelIdx ].caMask[ targetIdx ] self.targetBbMask = targetInfo.models[ modelIdx ].bbMask[ targetIdx ] self.targetOffset = None self._targetIncomparable = None #print "refSequence ",self.refSequence #print "targetSequence ", self.targetSequence self._calc_map() return
[docs] def read_pdb( self, pdb ): """Get sequence as string of 1AA get list of matching resSeq """ atomTypes = [] # For checking we have all required atom types resSeq = [] resName = [] _atomTypes = [] atomTypesList = [] chain=None readingResSeq=None readingResName=None for line in open( pdb ): if line.startswith("MODEL"): raise RuntimeError,"FOUND MULTI_MODEL FILE!" if line.startswith("TER"): break if line.startswith("ATOM"): atom = pdb_model.PdbAtom( line ) if not chain: chain = atom.chainID if atom.chainID != chain: raise RuntimeError," FOUND ADDITIONAL CHAIN" break # First atom in first residue if readingResSeq == None: readingResSeq = atom.resSeq readingResName = atom.resName _atomTypes.append( atom.name.strip() ) continue if readingResSeq != atom.resSeq: # Adding a new residue # Add the atom we've just finished reading resName.append( readingResName ) resSeq.append( readingResSeq ) atomTypesList.append( _atomTypes ) # Reset readingResSeq = atom.resSeq readingResName = atom.resName _atomTypes = [ atom.name.strip() ] else: if atom.name not in _atomTypes: _atomTypes.append( atom.name.strip() ) # End reading loop # Add the atom we've just finished reading resName.append( readingResName ) resSeq.append( readingResSeq ) atomTypesList.append( _atomTypes ) sequence = "" # Build up the sequence for n in resName: sequence += pdb_edit.three2one[ n ] # Build up the mask cAlphaMask = [] for atomTypes in atomTypesList: if 'CA' not in atomTypes: cAlphaMask.append( True ) else: cAlphaMask.append( False ) return ( sequence, resSeq, cAlphaMask )
[docs] def resSeqMatch(self): """Return true if the residue numbering between the model and native over the aligned region is the same""" #print self.targetResSeq[ self.targetOffset : self.targetOffset + self.lenMatch ] #print self.refResSeq[ self.refOffset : self.refOffset + self.lenMatch ] return self.targetResSeq[ self.targetOffset : self.targetOffset + self.lenMatch ] == self.refResSeq[ self.refOffset : self.refOffset + self.lenMatch ]