Commit 4da4423e authored by Lukas Jelonek
Browse files

Each entry gets a status code. Introduced more status codes

parent 526d86ff
import requests import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from dbxref.config import load_providers from dbxref.config import load_providers
providers = load_providers() providers = load_providers()
FOUND='FOUND' cache = FileCache(".web_cache", forever=True)
NOT_FOUND='NOT_FOUND' sess = CacheControl(requests.Session(), cache=cache)
UNSUPPORTED='UNSUPPORTED'
TIMEOUT='TIMEOUT' STATUS_EXISTS='found'
STATUS_NOT_EXISTS='not found'
STATUS_UNKNOWN='status unknown'
STATUS_NOT_CHECKED='status not checked'
STATUS_CHECK_NOT_SUPPORTED='check of status not supported'
STATUS_CHECK_TIMEOUT='status check timed out'
STATUS_UNSUPPORTED_DB='database unsupported'
def resolve(strings, check_existence=True): def resolve(strings, check_existence=True):
results = [] results = []
for s in strings: for s in strings:
exists = FOUND status = STATUS_NOT_CHECKED
if check_existence: if check_existence:
exists = check_dbxref_exists(s) status = check_dbxref_exists(s)
dbxref = convert_string_to_dbxref(s) dbxref = convert_string_to_dbxref(s)
if exists == FOUND and dbxref['db'] in providers: if dbxref['db'] in providers:
provider = providers[dbxref['db']] provider = providers[dbxref['db']]
locations = {} locations = {}
for _type in provider['resources']: for _type in provider['resources']:
...@@ -25,7 +33,9 @@ def resolve(strings, check_existence=True): ...@@ -25,7 +33,9 @@ def resolve(strings, check_existence=True):
for url_template in provider['resources'][_type]: for url_template in provider['resources'][_type]:
urls.append(compile_url(url_template, dbxref)) urls.append(compile_url(url_template, dbxref))
locations[_type] = urls locations[_type] = urls
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': locations}) results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': locations, 'status': status})
else:
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'status': STATUS_UNSUPPORTED_DB})
return results return results
def check_dbxref_exists(string): def check_dbxref_exists(string):
...@@ -33,33 +43,33 @@ def check_dbxref_exists(string): ...@@ -33,33 +43,33 @@ def check_dbxref_exists(string):
if dbxref['db'] in providers: if dbxref['db'] in providers:
provider = providers[dbxref['db']] provider = providers[dbxref['db']]
urls = [] urls = []
exists = FOUND exists = STATUS_NOT_CHECKED
if 'check_existence' in provider: if 'check_existence' in provider:
url = compile_url(provider['check_existence'], dbxref) url = compile_url(provider['check_existence'], dbxref)
logger.debug('Checking existence of dbxref at "%s"', url) logger.debug('Checking existence of dbxref at "%s"', url)
exists = check_url_exists(url) exists = check_url_exists(url)
if exists == NOT_FOUND or exists == TIMEOUT:
logger.info('The dbxref "%s" cannot be found. It will be ignored.', string)
return exists return exists
else: else:
return UNSUPPORTED return STATUS_CHECK_NOT_SUPPORTED
return UNSUPPORTED return STATUS_UNSUPPORTED_DB
def compile_url(template, dbxref): def compile_url(template, dbxref):
return template.replace('%i', dbxref['id']).replace('%d', dbxref['db']) return template.replace('%i', dbxref['id']).replace('%d', dbxref['db'])
def check_url_exists(url): def check_url_exists(url):
try: try:
r = requests.head(url, allow_redirects=True, timeout=1) r = sess.head(url, allow_redirects=True, timeout=1)
r.close() r.close()
if r.status_code <= 400: if r.status_code <= 400:
return FOUND return STATUS_EXISTS
else: else:
logger.debug('The server responded with status code: %s', r.status_code) logger.debug('The server responded with status code: %s', r.status_code)
return NOT_FOUND return STATUS_NOT_EXISTS
except requests.exceptions.Timeout as ex: except requests.exceptions.Timeout as ex:
logger.info('Timeout for URL: "%s"', url) logger.info('Timeout for URL: "%s"', url)
return TIMEOUT return STATUS_CHECK_TIMEOUT
except:
return STATUS_NOT_EXISTS
def convert_string_to_dbxref(string): def convert_string_to_dbxref(string):
""" """
......
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
resources: resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"] html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"] xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
check_existence: "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml" check_existence: "http://purl.obolibrary.org/obo/GO_%i"
- name: HTTP - name: HTTP
prefixes: ["http", "https"] prefixes: ["http", "https"]
resources: resources:
......
...@@ -22,32 +22,32 @@ class TestDbxrefResolve(unittest.TestCase): ...@@ -22,32 +22,32 @@ class TestDbxrefResolve(unittest.TestCase):
def test_check_dbxref_exists(self): def test_check_dbxref_exists(self):
import logging import logging
from dbxref.resolver import FOUND, NOT_FOUND, UNSUPPORTED from dbxref.resolver import STATUS_EXISTS, STATUS_NOT_EXISTS, STATUS_UNSUPPORTED_DB, STATUS_UNKNOWN
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.WARNING) logging.getLogger().setLevel(logging.WARNING)
resolver.logger.setLevel(logging.DEBUG) resolver.logger.setLevel(logging.DEBUG)
data = [ data = [
# existent ids # existent ids
('GO:0097281', FOUND), ('GO:0097281', STATUS_EXISTS),
('EC:1.1.1.1', FOUND), ('EC:1.1.1.1', STATUS_EXISTS),
('UniProtKB/Swiss-Prot:P12345', FOUND), ('UniProtKB/Swiss-Prot:P12345', STATUS_EXISTS),
('UniProtKB/TrEMBL:A2VB99', FOUND), ('UniProtKB/TrEMBL:A2VB99', STATUS_EXISTS),
('taxon:452271', FOUND), ('taxon:452271', STATUS_EXISTS),
('pubmed:19037750', FOUND), ('pubmed:19037750', STATUS_EXISTS),
('PDB:4AJY', FOUND), ('PDB:4AJY', STATUS_EXISTS),
('http://www.google.de', FOUND), ('http://www.google.de', STATUS_EXISTS),
('https://www.google.de', FOUND), ('https://www.google.de', STATUS_EXISTS),
# non existent ids # non existent ids
('GO:123', NOT_FOUND), ('GO:123', STATUS_NOT_EXISTS),
('EC:hoho', NOT_FOUND), ('EC:hoho', STATUS_NOT_EXISTS),
('UniProtKB/Swiss-Prot:45', NOT_FOUND), ('UniProtKB/Swiss-Prot:45', STATUS_NOT_EXISTS),
('UniProtKB/TrEMBL:99', NOT_FOUND), ('UniProtKB/TrEMBL:99', STATUS_NOT_EXISTS),
('taxon:hoho', NOT_FOUND), ('taxon:hoho', STATUS_NOT_EXISTS),
('pubmed:hoho', NOT_FOUND), ('pubmed:hoho', STATUS_NOT_EXISTS),
('PDB:hoho', NOT_FOUND), ('PDB:hoho', STATUS_NOT_EXISTS),
('http://wurst', NOT_FOUND), ('http://wurst', STATUS_NOT_EXISTS),
('https://wurst', NOT_FOUND), ('https://wurst', STATUS_NOT_EXISTS),
# currently unsupported # currently unsupported
#('GeneID:956582', FOUND), #('GeneID:956582', FOUND),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment