Commit 4da4423e authored by Lukas Jelonek
Browse files

Each entry gets a status code. Introduced more status codes

parent 526d86ff
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
import logging
logger = logging.getLogger(__name__)
from dbxref.config import load_providers
providers = load_providers()
# Short status codes. In this view they are still referenced by resolve(),
# check_dbxref_exists() and some lines of check_url_exists().
FOUND='FOUND'
NOT_FOUND='NOT_FOUND'
UNSUPPORTED='UNSUPPORTED'
TIMEOUT='TIMEOUT'
# File-backed HTTP cache kept in the ".web_cache" directory; `sess` is a
# requests session wrapped by CacheControl so that it uses this cache.
# NOTE(review): forever=True presumably keeps cached entries without expiry -
# confirm against the cachecontrol documentation.
cache = FileCache(".web_cache", forever=True)
sess = CacheControl(requests.Session(), cache=cache)
# Descriptive status strings. resolve() stores one of these under the 'status'
# key of each result entry, and the check_* functions return them.
STATUS_EXISTS='found'
STATUS_NOT_EXISTS='not found'
STATUS_UNKNOWN='status unknown'
STATUS_NOT_CHECKED='status not checked'
STATUS_CHECK_NOT_SUPPORTED='check of status not supported'
STATUS_CHECK_TIMEOUT='status check timed out'
STATUS_UNSUPPORTED_DB='database unsupported'
def resolve(strings, check_existence=True):
results = []
for s in strings:
exists = FOUND
status = STATUS_NOT_CHECKED
if check_existence:
exists = check_dbxref_exists(s)
status = check_dbxref_exists(s)
dbxref = convert_string_to_dbxref(s)
if exists == FOUND and dbxref['db'] in providers:
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
locations = {}
for _type in provider['resources']:
......@@ -25,7 +33,9 @@ def resolve(strings, check_existence=True):
for url_template in provider['resources'][_type]:
urls.append(compile_url(url_template, dbxref))
locations[_type] = urls
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': locations})
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': locations, 'status': status})
else:
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'status': STATUS_UNSUPPORTED_DB})
return results
def check_dbxref_exists(string):
......@@ -33,33 +43,33 @@ def check_dbxref_exists(string):
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
urls = []
exists = FOUND
exists = STATUS_NOT_CHECKED
if 'check_existence' in provider:
url = compile_url(provider['check_existence'], dbxref)
logger.debug('Checking existence of dbxref at "%s"', url)
exists = check_url_exists(url)
if exists == NOT_FOUND or exists == TIMEOUT:
logger.info('The dbxref "%s" cannot be found. It will be ignored.', string)
return exists
else:
return UNSUPPORTED
return UNSUPPORTED
return STATUS_CHECK_NOT_SUPPORTED
return STATUS_UNSUPPORTED_DB
def compile_url(template, dbxref):
    """Fill a provider URL template with the parts of a dbxref.

    Every ``%i`` in *template* is replaced by ``dbxref['id']`` and every
    ``%d`` by ``dbxref['db']``.

    Args:
        template: URL template string containing ``%i`` and/or ``%d``.
        dbxref: Mapping with the string entries ``'db'`` and ``'id'``.

    Returns:
        The template with both placeholders expanded.
    """
    # Split on the id placeholder first so that '%d' is expanded only in the
    # template text. Chaining two str.replace calls would also rewrite a
    # literal '%d' that happens to occur inside the id itself.
    pieces = template.split('%i')
    return dbxref['id'].join(piece.replace('%d', dbxref['db']) for piece in pieces)
def check_url_exists(url):
    """Probe *url* with a single HTTP HEAD request and report the outcome.

    Args:
        url: The URL to probe. Redirects are followed.

    Returns:
        STATUS_EXISTS if the server answers with a status code below 400,
        STATUS_CHECK_TIMEOUT if the request times out (1 second limit), and
        STATUS_NOT_EXISTS for any other status code or request failure.
    """
    try:
        # Exactly one request, sent through the caching session; the response
        # is closed right away because only the status code is needed.
        r = sess.head(url, allow_redirects=True, timeout=1)
        r.close()
    except requests.exceptions.Timeout:
        logger.info('Timeout for URL: "%s"', url)
        return STATUS_CHECK_TIMEOUT
    except Exception:
        # Best effort: any other failure (DNS error, refused connection,
        # malformed URL, ...) counts as "not found". Unlike a bare except,
        # this lets KeyboardInterrupt and SystemExit propagate.
        logger.debug('Request for URL "%s" failed', url, exc_info=True)
        return STATUS_NOT_EXISTS
    # 400 (Bad Request) is itself an error status, so the bound is exclusive.
    if r.status_code < 400:
        return STATUS_EXISTS
    logger.debug('The server responded with status code: %s', r.status_code)
    return STATUS_NOT_EXISTS
def convert_string_to_dbxref(string):
"""
......
......@@ -74,7 +74,7 @@
resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
check_existence: "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"
check_existence: "http://purl.obolibrary.org/obo/GO_%i"
- name: HTTP
prefixes: ["http", "https"]
resources:
......
......@@ -22,32 +22,32 @@ class TestDbxrefResolve(unittest.TestCase):
def test_check_dbxref_exists(self):
import logging
from dbxref.resolver import FOUND, NOT_FOUND, UNSUPPORTED
from dbxref.resolver import STATUS_EXISTS, STATUS_NOT_EXISTS, STATUS_UNSUPPORTED_DB, STATUS_UNKNOWN
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.WARNING)
resolver.logger.setLevel(logging.DEBUG)
data = [
# existent ids
('GO:0097281', FOUND),
('EC:1.1.1.1', FOUND),
('UniProtKB/Swiss-Prot:P12345', FOUND),
('UniProtKB/TrEMBL:A2VB99', FOUND),
('taxon:452271', FOUND),
('pubmed:19037750', FOUND),
('PDB:4AJY', FOUND),
('http://www.google.de', FOUND),
('https://www.google.de', FOUND),
('GO:0097281', STATUS_EXISTS),
('EC:1.1.1.1', STATUS_EXISTS),
('UniProtKB/Swiss-Prot:P12345', STATUS_EXISTS),
('UniProtKB/TrEMBL:A2VB99', STATUS_EXISTS),
('taxon:452271', STATUS_EXISTS),
('pubmed:19037750', STATUS_EXISTS),
('PDB:4AJY', STATUS_EXISTS),
('http://www.google.de', STATUS_EXISTS),
('https://www.google.de', STATUS_EXISTS),
# non existent ids
('GO:123', NOT_FOUND),
('EC:hoho', NOT_FOUND),
('UniProtKB/Swiss-Prot:45', NOT_FOUND),
('UniProtKB/TrEMBL:99', NOT_FOUND),
('taxon:hoho', NOT_FOUND),
('pubmed:hoho', NOT_FOUND),
('PDB:hoho', NOT_FOUND),
('http://wurst', NOT_FOUND),
('https://wurst', NOT_FOUND),
('GO:123', STATUS_NOT_EXISTS),
('EC:hoho', STATUS_NOT_EXISTS),
('UniProtKB/Swiss-Prot:45', STATUS_NOT_EXISTS),
('UniProtKB/TrEMBL:99', STATUS_NOT_EXISTS),
('taxon:hoho', STATUS_NOT_EXISTS),
('pubmed:hoho', STATUS_NOT_EXISTS),
('PDB:hoho', STATUS_NOT_EXISTS),
('http://wurst', STATUS_NOT_EXISTS),
('https://wurst', STATUS_NOT_EXISTS),
# currently unsupported
#('GeneID:956582', FOUND),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment