Source code for ample.modelling.octopus_predict
"""
Created on 18 Feb 2013
@author: jmht
Query the octopus server http://octopus.cbr.su.se to get transmembrane predictions
"""
import logging
import os
import sys
import urllib
if sys.version_info.major < 3:
from HTMLParser import HTMLParser
from urllib2 import urlopen, HTTPError
else:
from html.parser import HTMLParser
from urllib.error import HTTPError
from urllib.request import urlopen
# Network timeout handed to the urlopen() calls made by OctopusPredict
TIMEOUT = 10.0 # in seconds
class ParseFileUrl(HTMLParser):
    """
    Scan the page returned by an octopus search for the links to the result files

    After feed() has been called, the attributes hold the href values found
    (or None when no matching link was present):
      topo  -- href of the last link seen ending in ".topo"
      nnprf -- href of the last link seen ending in ".nnprf"
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.topo = None
        self.nnprf = None

    def handle_starttag(self, tag, attrs):
        """Record the href of any anchor tag that points at a .topo or .nnprf file"""
        # Only anchors that carry attributes can hold a link
        if tag != 'a' or not attrs:
            return
        for key, val in attrs:
            if key != "href":
                continue
            link = str(val)
            if link.endswith(".topo"):
                self.topo = link
            elif link.endswith(".nnprf"):
                self.nnprf = link
# End ParseFileUrl
class OctopusPredict(object):
    """
    Query the octopus server http://octopus.cbr.su.se to get transmembrane predictions

    Typical use is getFasta() to read a sequence followed by getPredict(),
    which queries the server and writes the .topo and .nnprf files to disk.
    """

    def __init__(self):
        self.logger = logging.getLogger()
        self.octopus_url = "http://octopus.cbr.su.se/"
        # The fasta sequence to query with
        self.fasta = None
        # path to the topo file
        self.topo = None
        # path to the nnprf file
        self.nnprf = None
        # url of topo & nnprf files on server
        self.topo_url = None
        self.nnprf_url = None

    def getPredict(self, name, fasta, directory=None):
        """
        Get the octopus prediction for the given fasta sequence string

        Args:
        name -- name for the files
        fasta -- fasta sequence string
        directory -- directory to write files to (defaults to the current working directory)

        Sets the topo and nnprf attributes
        """
        if not directory:
            directory = os.getcwd()
        self.octopusFileUrls(fasta)
        self.writeFiles(name, directory)

    def octopusFileUrls(self, fasta):
        """
        Query the server for a prediction for the given fasta sequence.

        Args:
        fasta -- a single fasta sequence as a string

        Sets the urls of the topo and nnprf files. nnprf_url is left as None
        when the result page carries no link to an nnprf file.

        Raises:
        RuntimeError -- if the result page has no link to a topo file; the
                        page is saved as OCTOPUS_ERROR.html for inspection
        """
        # urlencode lives in urllib.parse on Python 3 and in urllib on Python 2
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode

        data = dict(do='Submit OCTOPUS', sequence=fasta)
        # The POST body must be bytes on Python 3; urlencode output is pure ASCII
        edata = urlencode(data).encode("ascii")
        req = urlopen(self.octopus_url, edata, timeout=TIMEOUT)
        try:
            raw = req.read()
        finally:
            req.close()

        # HTMLParser.feed wants text: decode the bytes handed back on Python 3
        html = raw if isinstance(raw, str) else raw.decode("utf-8", "replace")
        m = ParseFileUrl()
        # Calls handle_starttag for every tag in the page
        m.feed(html)

        if not m.topo:
            # Save exactly what the server sent so the failure can be examined
            with open("OCTOPUS_ERROR.html", "wb") as f:
                f.write(raw)
            lines = fasta.splitlines() if fasta else []
            header = lines[0] if lines else ""
            raise RuntimeError(
                "Error getting prediction for fasta:{}\nCheck file: OCTOPUS_ERROR.html".format(header)
            )

        self.topo_url = self.octopus_url + m.topo
        # A page may link the topo file without an nnprf file
        self.nnprf_url = self.octopus_url + m.nnprf if m.nnprf else None

    def writeFiles(self, name, directory):
        """
        Write the files on the server to disk

        Args:
        name -- name for files (with suffix .topo and .nnprf)
        directory -- where to write files

        Sets the topo attribute, and the nnprf attribute when the nnprf file
        could be fetched (otherwise nnprf is set to None and a warning is logged).

        Raises:
        RuntimeError -- if the topo file cannot be accessed
        """
        try:
            topo_req = urlopen(self.topo_url, timeout=TIMEOUT)
        except HTTPError as e:
            raise RuntimeError("Error accessing topo file: {}\n{}".format(self.topo_url, e))

        # A missing nnprf file is only worth a warning, so the topo file is still written
        nnprf_req = None
        if self.nnprf_url:
            try:
                nnprf_req = urlopen(self.nnprf_url, timeout=TIMEOUT)
            except HTTPError as e:
                msg = "Error accessing nnprf file: {}\n{}\nTransmembrane prediction may have failed!".format(
                    self.nnprf_url, e
                )
                self.logger.warning(msg)
        else:
            self.logger.warning("No nnprf file link was found\nTransmembrane prediction may have failed!")

        fname = os.path.join(directory, name + ".topo")
        self._saveResponse(topo_req, fname)
        self.logger.debug("Wrote topo file: {}".format(fname))
        self.topo = fname

        if nnprf_req is None:
            # Do not leave a path from an earlier call lying around
            self.nnprf = None
            return

        fname = os.path.join(directory, name + ".nnprf")
        self._saveResponse(nnprf_req, fname)
        self.logger.debug("Wrote nnprf file: {}".format(fname))
        self.nnprf = fname

    @staticmethod
    def _saveResponse(response, fname):
        """Write the body of an open url response to fname and close the response"""
        try:
            # Binary mode: the response yields bytes on Python 3
            with open(fname, "wb") as f:
                f.write(response.read())
        finally:
            response.close()

    def getFasta(self, fastafile):
        """
        Given a fastafile, extract the first sequence and return it as \\n separated string

        Args:
        fastafile -- path to the fasta file to read

        Returns:
        The header and sequence lines of the first record, stripped of
        surrounding whitespace and joined with newlines, or None if the file
        holds no non-empty lines
        """
        fasta = []
        header = False
        with open(fastafile, "r") as f:
            for line in f:
                line = line.strip()
                # skip empty
                if not line:
                    continue
                # Only read one sequence
                if line.startswith(">"):
                    if header:
                        break
                    header = True
                fasta.append(line)
        if not fasta:
            return None
        return "\n".join(fasta)