#!/usr/bin/env ccp4-python
import logging
import os
import exit_util
import pdb_edit
[docs]class Sequence(object):
"""A class to handle a fasta file"""
def __init__(self, fasta=None, pdb=None, canonicalise=False):
"""Initialise the object"""
self.MAXWIDTH=80 # maximum width of any line
self.name = None # identifier for this sequence object
self.headers = [] # title lines
self.sequences = [] # The fasta sequences (just AA) as a single string
self.resseqs = [] # The fasta sequences (just AA) as a single string
self.pdbs = [] # pdb files that sequences were derived from
self.chains = [] # The chain that the sequence belongs too (if applicable)
self.fasta_files = [] # The fasta files the sequence were read from
if fasta:
self.from_fasta(fasta, canonicalise=canonicalise)
elif pdb:
self.from_pdb(pdbin=pdb)
return
[docs] def add_pdb_data(self, pdbin):
"""Add the resseq information from a pdb to ourselves when we already have the sequence information from a fasta file - such as from an alignment
We assume that there will be gaps (-) in the sequence and the letters may be upper or lower case
Currently this only supports adding data for single-chain pdbs
"""
assert len(self.headers) and len(self.sequences)
assert os.path.isfile(pdbin),"Cannot find pdb file: {0}".format(pdbin)
# Assume the name of the pdb is the first part of the header
fname = os.path.basename(pdbin)
name, ext = os.path.splitext(fname)
assert ext == '.pdb', "PDB files must have extension .pdb"
# Find where in the list of data this sequence is
got=False
for idx, h in enumerate(self.headers):
n = h[1:].split('.')[0]
if n == name:
got=True
break
if not got: raise RuntimeError,"Could not find matching pdb name for {0} in headers {1}".format(fname,self.headers)
seqd = pdb_edit.sequence_data(pdbin)
assert len(seqd) == 1,'Currently only support adding data for single chain pdbs'
chain = seqd.keys()[0]
# Add the pdb and chain data
self.pdbs[idx] = fname
self.chains[idx] = chain
self.resseqs[idx] = [] # Clear out any existing resseqs
sequence = seqd[chain][0]
resseqs = seqd[chain][1]
# Loop through both sequences
needle = 0
GAP='-'
for res1 in self.sequences[idx]:
if res1 == GAP:
self.resseqs[idx].append(None)
else:
assert res1.upper() == sequence[needle]
self.resseqs[idx].append( resseqs[needle] )
needle += 1
return True
[docs] def fasta_str(self,pdbname=False):
if not len(self.sequences): raise RuntimeError,"No sequences have been read!"
headers=[]
for i, header in enumerate(self.headers):
#if pdbname: header = os.path.basename(self.pdbs[i])
if pdbname: header = ">{0}".format(os.path.basename(self.pdbs[i]))
headers.append(header)
return self._fasta_str(headers, self.sequences)
def _fasta_str(self,headers,sequences):
s=""
for i, seq in enumerate(sequences):
s += headers[i] + os.linesep
for chunk in range(0, len(seq), self.MAXWIDTH):
s += seq[chunk:chunk+self.MAXWIDTH] + os.linesep
s += os.linesep
return s
[docs] def from_fasta(self, fasta_file, canonicalise=True, resseq=True):
self.name = os.path.splitext(os.path.basename(fasta_file))[0]
with open(fasta_file, "r") as f:
self._parse_fasta(f, fasta_file=fasta_file, canonicalise=canonicalise)
if resseq:
# Add automatically calculated ressegs starting from 1
#assert len(self.resseqs) == 0,"Altering existing resseqs!"
for i, seq in enumerate(self.sequences):
if len(self.resseqs) >= i+1 and self.resseqs[i] is None:
self.resseqs[i] = []
for j in range(len(seq)):
self.resseqs[i].append(j+1)
return
[docs] def from_pdb(self, pdbin):
pdbin_name = os.path.basename(pdbin)
self.name = os.path.splitext(pdbin_name)[0]
chain2data = pdb_edit.sequence_data(pdbin)
assert len(chain2data),"Could not read sequence from pdb: {0}".format(pdbin)
self.headers = []
self.sequences = []
self.pdbs = []
self.resseqs = []
self.fasta_files = []
for chain in sorted(chain2data.keys()):
seq = chain2data[chain][0]
resseq = chain2data[chain][1]
self.headers.append(">From pdb: {0} chain={1} length={2}".format(pdbin_name,chain,len(seq)))
self.sequences.append(seq)
self.resseqs.append(resseq)
self.pdbs.append(pdbin_name)
self.chains.append(chain)
self.fasta_files.append(None) # Need to make sure pdb abd fasta arrays have same length
return
[docs] def canonicalise(self):
"""Reformat the fasta file
Description
-----------
Needed because Rosetta has problems reading fastas. For it to be read, it has to have no spaces in the sequence,
a name that is 4 characters and an underscore (ABCD_), everything has to be uppercase, and there has to be a
return carriage at the end - this has to be linux formatted as when I make a fasta in windows, it doesnt recognize
the return carriage.
Rosetta has a lot of problems with fastas so we put in this script to deal with it.
"""
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']
for i,seq in enumerate(self.sequences):
cs=""
for char in seq:
char = char.upper()
if char not in aa:
raise RuntimeError,"There appear to be non-standard AA in your sequence: '{0}'\nPlease format to only use standard AA.".format(char)
cs+=char
self.sequences[i]=cs
return
[docs] def length(self,seq_no=0):
return len(self.sequences[seq_no])
[docs] def mutate_residue(self, from_aa, res_seq, to_aa, seq_id=0):
"""Change residue type from_aa at position res_seq to be of type to_aa
Note: res_seq positions start from 1 not zero!
Parameters
----------
from_aa : str
Single-letter amino acid
res_seq : int
Residue sequence number
to_aa : str
Single-letter amino acid
seq_id : int
The index of the sequence to operate on (counting from zero)
"""
seq = self.sequences[seq_id]
assert seq[res_seq - 1] == from_aa, "Amino acid at position {0} is {1} not {2}".format(res_seq, seq[res_seq - 1], from_aa)
self.sequences[seq_id] = seq[:res_seq - 1] + to_aa + seq[res_seq:]
return
[docs] def numSequences(self):
return len(self.sequences)
def _parse_fasta(self, fasta, fasta_file=None, canonicalise=True):
"""Parse the fasta file int our data structures & check for consistency"""
headers = []
sequences = []
sequence = ""
header = None
for line in fasta:
line = line.strip().rstrip(os.linesep)
if not line: continue # skip blank lines
if not header:
if not line.startswith(">"):
raise RuntimeError,"FASTA sequencs must be prefixed with a > character: {0}".format(line)
header = line
continue
if header and line.startswith(">"):
headers.append(header)
sequences.append(sequence)
header = line
sequence = ""
else:
sequence += line
# Add the last header and sequence
headers.append(header)
sequences.append(sequence)
# Now add all the collected data
self.headers = []
self.sequences = []
self.resseqs = []
self.fasta_files = []
self.pdbs = []
self.chains = []
for h, s in zip(headers,sequences):
self.headers.append(h)
self.sequences.append(s)
self.fasta_files.append(fasta_file)
self.resseqs.append(None)
self.pdbs.append(None)
self.chains.append(None)
if canonicalise: self.canonicalise()
return
[docs] def pirStr(self,seqNo=0):
"""Return a canonical MAXWIDTH PIR representation of the file as a line-separated string"""
pirStr = [ self.title + os.linesep + os.linesep ] # Title plus the obligatory blank line
# Now reformat to MAXWIDTH chars
for chunk in range(0, len(self.sequances[seqNo]), self.MAXWIDTH):
pirStr.append(self.sequences[seqNo][ chunk:chunk+self.MAXWIDTH ]+os.linesep)
# Add last newline
pirStr.append(os.linesep)
return pirStr
[docs] def sequence(self,seq_no=0):
return self.sequences[seq_no]
[docs] def toPir(self, input_fasta, output_pir=None ):
"""Take a fasta file and output the corresponding PIR file"""
self.parseFasta(input_fasta)
if not output_pir:
dir, ffile = os.path.split(input_fasta)
fname = os.path.splitext(ffile)[0]
output_pir = os.path.join( dir, fname+".pir" )
with open( output_pir, "w") as pirout:
for line in self.pirStr():
pirout.write(line)
return
[docs] def write_fasta(self,fasta_file,pdbname=False):
with open(fasta_file,'w') as f:
for s in self.fasta_str(pdbname=pdbname):
f.write(s)
return
def __add__(self,other):
self.headers += other.headers
self.sequences += other.sequences
self.resseqs += other.resseqs
self.pdbs += other.pdbs
self.chains += other.chains
self.fasta_files += other.fasta_files
return self
def __str__(self):
return self.__repr__() + os.linesep + self.fasta_str()
[docs]def process_fasta(amoptd, canonicalise=False):
# Check we can find the input fasta
if not os.path.exists(str(amoptd['fasta'])):
msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
exit_util.exit_error(msg)
# Reformat to what we need
logging.debug('Parsing FASTA file')
try: fp = Sequence(fasta=amoptd['fasta'], canonicalise=canonicalise)
except Exception as e:
msg = "Error parsing FASTA file: {0}\n\n{1}".format(amoptd['fasta'],e.message)
exit_util.exit_error(msg)
if fp.numSequences() != 1:
msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(amoptd['fasta'])
exit_util.exit_error(msg)
# Length checks
amoptd['fasta_length'] = fp.length()
logging.info("Fasta is {0} amino acids long".format(amoptd['fasta_length']))
# Check we have a decent length
if amoptd['fasta_length'] < 9:
msg = "ERROR! Fasta is of length {0}. This is much too short!".format(amoptd['fasta_length'])
exit_util.exit_error(msg)
# Check we will be able to truncate at this level
if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(amoptd['fasta_length'], amoptd['percent'])
exit_util.exit_error(msg)
# Check that the sequence doesn't have a his-tag in it
if not amoptd['allow_his_tag']:
his_tag = 'HHHHHH'
i = fp.sequence().find(his_tag)
l = fp.length()
if (0 <= i <= 20) or (l-20 <= i <= l):
msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(his_tag,i)
exit_util.exit_error(msg)
# Fasta is ok, so write out a canonical fasta in the work directory
outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
fp.write_fasta(outfasta)
amoptd['fasta'] = outfasta
amoptd['sequence'] = fp.sequence()
return