Source code for ample.util.residue_map

"""Useful manipulations on PDB files"""

import types

from ample.util import ample_util, pdb_model


[docs]class residueSequenceMap(object): """Class for handling mapping between model and native residue indices.""" def __init__(self, refPdb=None, targetPdb=None): self.refResSeq = [] self.refSequence = None self.refCAlphaMask = [] self.refBbMask = [] self.refOffset = None self._refIncomparable = None # List of atoms in the model that cannot be compared to the model self.targetResSeq = [] self.targetSequence = None self.targetCAlphaMask = [] self.targetBbMask = [] self.targetOffset = None # Where the matched part of the sequences starts in the model self._targetIncomparable = None # List of atoms in the model that cannot be compared to the native self.lenMatch = None # The window of AA we used to check for a match self.probeLen = 10 # maxInset is the max number of AA into the sequence that we will go searching for a match - i.e. if more # then maxInset AA at the start are non-matching, we won't find the match self.maxInset = 50 # Like this just for testing if refPdb and targetPdb: self.calc_map(refPdb, targetPdb) return
[docs] def ref2target(self, refResSeq): """Return the target resSeq for the given reference resSeq. This will calculate a resSeq in the target if there isn't one. """ # Work out how many residues from the start of the matching region this residue is in the target indent = refResSeq - self.refResSeq[self.refOffset] # calculate the corresponding index in the reference targetResSeq = self.targetResSeq[self.targetOffset] + indent ## paranoid check # if 0 < self.targetOffset + indent < len( self.targetResSeq ): # assert targetResSeq == self.targetResSeq[ self.targetOffset + indent ] return targetResSeq
[docs] def target2ref(self, targetResSeq): """Return the referece resSeq for the given target resSeq. This will calculate a resSeq in the reference if there isn't one. """ # Work out how many residues from the start of the matching region this residue is in the target indent = targetResSeq - self.targetResSeq[self.targetOffset] refResSeq = self.refResSeq[self.refOffset] + indent ## paranoid check # if 0 < self.refOffset + indent < len( self.refResSeq ): # assert refResSeq == self.refResSeq[ self.refOffset + indent ] return refResSeq
[docs] def targetIncomparable(self, cAlphaMask=True, bbMask=False): """Return a list of the resSeq in the target that cannot be compared to the reference. This includes any where there isn't a corresponding residue in the reference, or there isn't a c-alpha or backbone atom in either (if cAlphaMask or bbMask is set) """ if self._targetIncomparable == None: self._targetIncomparable = [] for i, resSeq in enumerate(self.targetResSeq): # Before the start of the matching region if i < self.targetOffset: self._targetIncomparable.append(resSeq) continue # After end of matching region if i > self.lenMatch + 1: self._targetIncomparable.append(resSeq) continue # In matching region but no C-alpha if cAlphaMask: if self.targetCAlphaMask[i]: self._targetIncomparable.append(resSeq) continue # In matching region but no complete bbatoms if bbMask: if self.targetBbMask[i]: self._targetIncomparable.append(resSeq) continue # Matching residues in reference refResSeq = self.target2ref(resSeq) try: j = self.refResSeq.index(refResSeq) except ValueError: # A residue that isn't actually in the reference self._targetIncomparable.append(resSeq) continue # No C-Alpha if cAlphaMask: if self.refCAlphaMask[j]: self._targetIncomparable.append(resSeq) continue # No bbMask if bbMask: if self.refBbMask[j]: self._targetIncomparable.append(resSeq) continue return self._targetIncomparable
[docs] def refIncomparable(self, cAlphaMask=True, bbMask=False): """Return a list of the resSeq in the reference that cannot be compared to the target. This includes any where there isn't a corresponding residue in the target, or there isn't a c-alpha or backbone atom in either (if cAlphaMask or bbMask is set) """ if self._refIncomparable == None: self._refIncomparable = [] for i, resSeq in enumerate(self.refResSeq): # Before the start of the matching region if i < self.refOffset: self._refIncomparable.append(resSeq) continue # After end of matching region if i > self.lenMatch + 1: self._refIncomparable.append(resSeq) continue # In matching region but no C-alpha if cAlphaMask: if self.refCAlphaMask[i]: self._refIncomparable.append(resSeq) continue # In matching region but no complete bbatoms if bbMask: if self.refBbMask[i]: self._refIncomparable.append(resSeq) continue # Matching residues in reference targetResSeq = self.ref2target(resSeq) try: j = self.targetResSeq.index(targetResSeq) except ValueError: # A residue that isn't actually in the reference self._refIncomparable.append(resSeq) continue # No C-Alpha if cAlphaMask: if self.targetCAlphaMask[j]: self._refIncomparable.append(resSeq) continue # No bbMask if bbMask: if self.targetBbMask[j]: self._refIncomparable.append(resSeq) continue return self._refIncomparable
def __str__(self): me = {} for slot in dir(self): attr = getattr(self, slot) if not slot.startswith("__") and not ( isinstance(attr, types.MethodType) or isinstance(attr, types.FunctionType) ): me[slot] = attr s = self.__repr__() + "\n" for k in sorted(me.keys()): s += "{0}: {1}\n".format(k, me[k]) return s
[docs] def calc_map(self, nativePdb, modelPdb): self.refSequence, self.refResSeq, self.refCAlphaMask = self.read_pdb(nativePdb) self.targetSequence, self.targetResSeq, self.targetCAlphaMask = self.read_pdb(modelPdb) self._calc_map() return
def _calc_map(self): """Return a ResSeqMap mapping the index of a residue in the model to the corresponding residue in the native. Only works if 1 chain in either file and with standard residues """ self.refOffset, self.targetOffset = self._calcOffset(self.refSequence, self.targetSequence) self.lenMatch = self._lenMatch() self._checkContiguous() def _checkContiguous(self): """ Check that there is congiuous resSeq numbering as otherwise the map in current form won't work Can only check from the start while residues are matching - won't work after the first gap """ p_refResSeq = p_targetResSeq = None for count in range(self.lenMatch): refIdx = self.refOffset + count refAA = self.refSequence[refIdx] refResSeq = self.refResSeq[refIdx] targetIdx = self.targetOffset + count targetAA = self.targetSequence[targetIdx] targetResSeq = self.targetResSeq[targetIdx] if count != 0: # Need to stop after the first gap if refAA != targetAA: break # Complain if the residues match but the numbering doesn't if refResSeq != p_refResSeq + 1 or targetResSeq != p_targetResSeq + 1: raise RuntimeError( "Non-contiguous residue numbering: {}->{} and {}->{}".format( p_refResSeq, refResSeq, p_targetResSeq, targetResSeq ) ) p_refResSeq = refResSeq p_targetResSeq = targetResSeq def _calcOffset(self, refSequence, targetSequence, reverse=False): # Probe length dependant on protein size - is length of shortest protein or the default shortest = min(len(targetSequence), len(refSequence)) probeLen = min(self.probeLen, shortest) # Work out how far we can probe into each sequence if len(targetSequence) - probeLen >= self.maxInset: targetMaxInset = self.maxInset else: targetMaxInset = len(targetSequence) - probeLen if len(refSequence) - probeLen >= self.maxInset: refMaxInset = self.maxInset else: refMaxInset = len(refSequence) - probeLen # If checking from the end, reverse the strings if reverse: refSequence = refSequence[::-1] targetSequence = targetSequence[::-1] got = False for targetOffset in range(targetMaxInset + 1): probe = targetSequence[targetOffset : targetOffset + probeLen] for refOffset in range(refMaxInset + 1): if refSequence[refOffset : refOffset + probeLen] == probe: got = True break if got: break if not got: raise RuntimeError("Could not calculate map for:\n{}\n{}".format(refSequence, targetSequence)) return refOffset, targetOffset def _lenMatch(self): refBackOffset, targetBackOffset = self._calcOffset(self.refSequence, self.targetSequence, reverse=True) # Calculate match from the residue numbers - use reference for now length = self.refResSeq[len(self.refSequence) - 1 - refBackOffset] - self.refResSeq[self.refOffset] + 1 if length > len(self.refResSeq) and length > len(self.targetResSeq): raise RuntimeError("Match of {} is longer than both of the sequences!".format(length)) return length
[docs] def fromInfo(self, refInfo=None, refChainID=None, targetInfo=None, targetChainID=None, modelIdx=0): """Create a map from 2 info objects""" # Determine index of chain so we know where to get the data from nativeIdx = refInfo.models[modelIdx].chains.index(refChainID) self.refResSeq = refInfo.models[modelIdx].resSeqs[nativeIdx] self.refSequence = refInfo.models[modelIdx].sequences[nativeIdx] self.refCAlphaMask = refInfo.models[modelIdx].caMask[nativeIdx] self.refBbMask = refInfo.models[modelIdx].bbMask[nativeIdx] self.refOffset = None self._refIncomparable = None targetIdx = targetInfo.models[modelIdx].chains.index(targetChainID) self.targetResSeq = targetInfo.models[modelIdx].resSeqs[targetIdx] self.targetSequence = targetInfo.models[modelIdx].sequences[targetIdx] self.targetCAlphaMask = targetInfo.models[modelIdx].caMask[targetIdx] self.targetBbMask = targetInfo.models[modelIdx].bbMask[targetIdx] self.targetOffset = None self._targetIncomparable = None self._calc_map()
[docs] def read_pdb(self, pdb): """Get sequence as string of 1AA get list of matching resSeq """ atomTypes = [] resSeq = [] resName = [] _atomTypes = [] atomTypesList = [] chain = None readingResSeq = None readingResName = None for line in open(pdb): if line.startswith("MODEL"): raise RuntimeError("FOUND MULTI_MODEL FILE!") if line.startswith("TER"): break if line.startswith("ATOM"): atom = pdb_model.PdbAtom(line) if not chain: chain = atom.chainID if atom.chainID != chain: raise RuntimeError("FOUND ADDITIONAL CHAIN") # First atom in first residue if readingResSeq == None: readingResSeq = atom.resSeq readingResName = atom.resName _atomTypes.append(atom.name.strip()) continue if readingResSeq != atom.resSeq: resName.append(readingResName) resSeq.append(readingResSeq) atomTypesList.append(_atomTypes) readingResSeq = atom.resSeq readingResName = atom.resName _atomTypes = [atom.name.strip()] else: if atom.name not in _atomTypes: _atomTypes.append(atom.name.strip()) resName.append(readingResName) resSeq.append(readingResSeq) atomTypesList.append(_atomTypes) sequence = "" for n in resName: sequence += ample_util.three2one[n] cAlphaMask = ['CA' not in atomTypes for atomTypes in atomTypesList] return sequence, resSeq, cAlphaMask
[docs] def resSeqMatch(self): """Return true if the residue numbering between the model and native over the aligned region is the same""" return ( self.targetResSeq[self.targetOffset : self.targetOffset + self.lenMatch] == self.refResSeq[self.refOffset : self.refOffset + self.lenMatch] )