Commit d81f0580 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Test if a provider can be checked for existence of entries

parent 0618e776
import requests
import logging
logger = logging.getLogger(__name__)
from dbxref.config import load_providers
providers = load_providers()
# Status values reported by the existence checks in this module.
# FOUND / NOT_FOUND are what check_url_exists returns for a probed URL.
# UNSUPPORTED is returned by check_dbxref_exists on the paths where no
# URL was probed (NOTE(review): apparently when the provider is unknown or
# has no 'check_existence' template -- confirm against the full function).
FOUND='FOUND'
NOT_FOUND='NOT_FOUND'
UNSUPPORTED='UNSUPPORTED'
def resolve(strings, check_existence=True):
results = []
for s in strings:
......@@ -27,24 +33,31 @@ def check_dbxref_exists(string):
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
urls = []
exists = True
exists = FOUND
if 'check_existence' in provider:
url = compile_url(provider['check_existence'], dbxref)
logger.debug('Checking existence of dbxref at "%s"', url)
exists = check_url_exists(url)
if not exists:
logging.info('The dbxref "' + string + '" cannot be found. It will be ignored.')
if exists == NOT_FOUND:
logger.info('The dbxref "%s" cannot be found. It will be ignored.', string)
return exists
else:
return False
return False
return UNSUPPORTED
return UNSUPPORTED
def compile_url(template, dbxref):
    """Fill a provider URL template with the parts of a dbxref.

    Every ``%i`` in *template* is replaced by ``dbxref['id']`` and every
    ``%d`` by ``dbxref['db']``.

    The substitution is done in a single pass over the template: text that
    was inserted for one placeholder is never scanned for the other one.
    Chaining two ``str.replace`` calls would corrupt ids that themselves
    contain ``%d`` (for example percent-encoded URLs handled by a
    ``"%d:%i"`` template).

    Args:
        template: URL template containing ``%i`` and/or ``%d`` placeholders.
        dbxref: mapping with the keys ``'id'`` and ``'db'``.

    Returns:
        The template with both placeholders substituted.

    Raises:
        KeyError: if *dbxref* lacks the ``'id'`` or ``'db'`` key.
    """
    identifier = dbxref['id']
    database = dbxref['db']
    # Split on '%d' first so that each remaining segment only needs the
    # '%i' substitution; joining with the db name then fills in the '%d'
    # slots without ever re-reading the inserted id.
    return database.join(
        segment.replace('%i', identifier) for segment in template.split('%d')
    )
def check_url_exists(url, timeout=None):
    """Probe *url* with an HTTP HEAD request and report whether it exists.

    Redirects are followed. Any failure to perform the request (unreachable
    host, malformed URL, timeout, ...) is treated as "not found" rather than
    propagated, because callers only need a found / not-found answer.

    Args:
        url: the URL to probe.
        timeout: optional timeout in seconds passed through to ``requests``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        ``FOUND`` if the final response status code is at most 400,
        otherwise ``NOT_FOUND``.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        response.close()
    except Exception as error:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, and record why
        # the lookup failed instead of discarding the reason silently.
        logger.debug('Request to "%s" failed: %s', url, error)
        return NOT_FOUND
    # NOTE(review): the threshold counts HTTP 400 (Bad Request) as FOUND.
    # It is kept unchanged here; confirm whether ``< 400`` was intended.
    if response.status_code <= 400:
        return FOUND
    return NOT_FOUND
def convert_string_to_dbxref(string):
"""
......
......@@ -33,28 +33,39 @@
resources:
html: ["http://rfam.xfam.org/family/%i"]
xml: ["http://rfam.xfam.org/family/%i?content-type=text%2Fxml"]
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
- name: Pubmed
prefixes: ["pubmed", "Pubmed"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/pubmed/%i"]
check_existence: "http://www.ncbi.nlm.nih.gov/pubmed/%i"
- name: Protein Families
prefixes: ["PFAM"]
resources:
html: ["http://pfam.xfam.org/family/%i"]
xml: ["http://pfam.xfam.org/family/%i?content-type=text%2Fxml"]
# does not work
# check_existence: "http://pfam.xfam.org/family/%i?content-type=text%2Fxml"
- name: PDB
prefixes: ["PDB"]
resources:
html: ["http://www.rcsb.org/pdb/explore/explore.do?structureId=%i"]
xml: ["http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"]
check_existence: "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
- name: InterPro
prefixes: ["InterPro"]
resources:
html: ["http://www.ebi.ac.uk/interpro/entry/%i"]
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
- name: GeneID
prefixes: ["GeneID"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/gene/%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
- name: Gene Ontology
prefixes: ["GO"]
resources:
......@@ -65,3 +76,4 @@
prefixes: ["http", "https"]
resources:
html: ["%d:%i"]
check_existence: "%d:%i"
......@@ -21,15 +21,48 @@ class TestDbxrefResolve(unittest.TestCase):
self.assertNotEqual(resolver.resolve(["EC:1.1.1.1"]), [])
def test_check_dbxref_exists(self):
import logging
from dbxref.resolver import FOUND, NOT_FOUND, UNSUPPORTED
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.WARNING)
resolver.logger.setLevel(logging.DEBUG)
data = [
('GO:1234', False),
('GO:0097281', True),
('SO:1234', False),
('Taxon:hoho', False),
('UniProtKB/Swiss-Prot:abc', False),
('EC:fa', False),
('EC:1.1.1.1', True),
('GI:abc', False),
# existent ids
('GO:0097281', FOUND),
('EC:1.1.1.1', FOUND),
('UniProtKB/Swiss-Prot:P12345', FOUND),
('UniProtKB/TrEMBL:A2VB99', FOUND),
('taxon:452271', FOUND),
('pubmed:19037750', FOUND),
('PDB:4AJY', FOUND),
('http://www.google.de', FOUND),
('https://www.google.de', FOUND),
# non existent ids
('GO:123', NOT_FOUND),
('EC:hoho', NOT_FOUND),
('UniProtKB/Swiss-Prot:45', NOT_FOUND),
('UniProtKB/TrEMBL:99', NOT_FOUND),
('taxon:hoho', NOT_FOUND),
('pubmed:hoho', NOT_FOUND),
('PDB:hoho', NOT_FOUND),
('http://wurst', NOT_FOUND),
('https://wurst', NOT_FOUND),
# currently unsupported
#('GeneID:956582', FOUND),
#('GI:731497', FOUND),
#('PFAM:PF00002', FOUND),
#('RFAM:RF00360', FOUND),
#('InterPro:IPR002928', FOUND),
#('SO:0000704', FOUND),
#('InterPro:hoho', NOT_FOUND),
#('GI:hoho', NOT_FOUND),
#('GeneID:hoho', NOT_FOUND),
#('PFAM:hoho', NOT_FOUND),
#('RFAM:hoho', NOT_FOUND),
#('SO:123', NOT_FOUND),
]
for d in data:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment