Commit 2d5421c4 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Add existence checking for dbxrefs

parent 7b99ebc8
......@@ -13,6 +13,7 @@ def main():
resolve_parser = subparsers.add_parser('resolve')
resolve_parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
resolve_parser.add_argument('--no_check', '-n', action='store_false', default=True, help="Do not check existence of cross reference")
resolve_parser.set_defaults(func=resolve)
retrieve_parser = subparsers.add_parser('retrieve')
......@@ -34,7 +35,7 @@ def info(args, config):
def resolve(args, config):
    """CLI handler for the 'resolve' subcommand.

    Resolves each dbxref given on the command line and prints the
    resulting list of {dbxref, locations} records as JSON on stdout.
    """
    from dbxref import resolver
    import json
    # args.no_check defaults to True and is flipped to False by the
    # --no_check/-n flag (argparse action='store_false'), so it can be
    # passed straight through as check_existence.
    print(json.dumps(resolver.resolve(args.dbxrefs, check_existence=args.no_check)))
def retrieve(args, config):
#TODO implement
......
import requests
import logging
from dbxref.config import load_providers

# Provider definitions (database prefixes, URL templates and optional
# existence-check templates) are loaded once at import time.
providers = load_providers()

def resolve(strings, check_existence=True):
    """Resolve dbxref strings (e.g. "GO:0097281") into location URLs.

    Parameters:
        strings: iterable of dbxref strings of the form "<db>:<id>".
        check_existence: when True (default) each dbxref is verified via
            check_dbxref_exists(); entries that cannot be found resolve
            to an empty location list.

    Returns:
        A list with one dict per input string, each of the form
        {'dbxref': '<db>:<id>', 'locations': [{'type': ..., 'url': ...}, ...]}.
        Unknown database prefixes also yield an empty 'locations' list.
    """
    results = []
    for s in strings:
        exists = True
        if check_existence:
            exists = check_dbxref_exists(s)
        dbxref = convert_string_to_dbxref(s)
        if exists and dbxref['db'] in providers:
            provider = providers[dbxref['db']]
            urls = []
            for resource_type in provider['resources']:
                for url_template in provider['resources'][resource_type]:
                    urls.append({
                        'type': resource_type,
                        'url': compile_url(url_template, dbxref),
                    })
            results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': urls})
        else:
            # Unknown prefix or (when checked) non-existent entry: keep
            # the dbxref in the output, but with no locations.
            results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'locations': []})
    return results
def check_dbxref_exists(string):
    """Return True if the dbxref appears to point to an existing entry.

    Looks up the provider for the dbxref's database prefix. If the
    provider declares a 'check_existence' URL template, a HEAD request is
    issued against the compiled URL; otherwise the entry is assumed to
    exist. Unknown database prefixes return False. A failed check is
    logged as a warning because the dbxref will be ignored by resolve().
    """
    dbxref = convert_string_to_dbxref(string)
    # Guard clause: without a known provider we cannot check anything.
    if dbxref['db'] not in providers:
        return False
    provider = providers[dbxref['db']]
    exists = True
    if 'check_existence' in provider:
        url = compile_url(provider['check_existence'], dbxref)
        exists = check_url_exists(url)
        if not exists:
            logging.warning('The dbxref "' + string + '" cannot be found. It will be ignored.')
    return exists
def compile_url(template, dbxref):
    """Fill a provider URL template from a dbxref dict.

    '%i' placeholders are substituted with the dbxref id first, then
    '%d' placeholders with the database prefix.
    """
    with_id = template.replace('%i', dbxref['id'])
    return with_id.replace('%d', dbxref['db'])
def check_url_exists(url):
    """Return True if a HEAD request against url succeeds.

    Redirects are followed so that providers which redirect to the final
    record still count as existing. Only status codes strictly below 400
    indicate success; the original '<= 400' incorrectly treated
    400 Bad Request as an existing resource.
    """
    r = requests.head(url, allow_redirects=True)
    r.close()
    return r.status_code < 400
def convert_string_to_dbxref(string):
"""
A dbxref is dictionary with two keys: db and id.
......
......@@ -2,25 +2,32 @@
prefixes: [EC, Enzyme]
resources:
html: ["http://enzyme.expasy.org/EC/%i"]
check_existence: "http://enzyme.expasy.org/EC/%i.txt"
- name: Gene Identifier
prefixes: [GI]
resources:
html: ["http://www.ncbi.nlm.nih.gov/protein/GI:%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
- name: Uniprot
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
resources:
html: [ "http://www.uniprot.org/uniprot/%i"]
xml: [ "http://www.uniprot.org/uniprot/%i.xml"]
check_existence: "http://www.uniprot.org/uniprot/%i.xml"
- name: Taxonomy
prefixes: ["Taxon", "taxon", "taxid"]
resources:
html: ["http://www.uniprot.org/taxonomy/%i"]
xml: ["http://www.uniprot.org/taxonomy/%i.rdf"]
check_existence: "http://www.uniprot.org/taxonomy/%i"
- name: SequenceOntology
prefixes: ["SO"]
resources:
html: ["http://www.sequenceontology.org/browser/current_svn/term/SO:%i"]
# does not work
# check_existence: "http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
- name: RFAM
prefixes: ["RFAM"]
resources:
......@@ -53,6 +60,7 @@
resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
check_existence: "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"
- name: HTTP
prefixes: ["http", "https"]
resources:
......
......@@ -19,3 +19,28 @@ class TestDbxrefResolve(unittest.TestCase):
def test_resolve_enzyme(self):
    """A valid EC number should resolve to a non-empty result list."""
    result = resolver.resolve(["EC:1.1.1.1"])
    self.assertNotEqual(result, [])
def test_check_dbxref_exists(self):
    """Existence checks: valid entries report True, bogus ids False.

    NOTE: exercises live provider endpoints over the network.
    """
    cases = [
        ('GO:1234', False),
        ('GO:0097281', True),
        ('SO:1234', False),
        ('Taxon:hoho', False),
        ('UniProtKB/Swiss-Prot:abc', False),
        ('EC:fa', False),
        ('EC:1.1.1.1', True),
        ('GI:abc', False),
    ]
    for dbxref_string, expected in cases:
        with self.subTest(d=(dbxref_string, expected)):
            self.assertEqual(resolver.check_dbxref_exists(dbxref_string), expected)
def test_check_urls(self):
    # NOTE(review): this test is a stub — the call under test is
    # commented out below, so no assertions run. It only proves the
    # fixture is valid JSON.
    import json
    # 'data' is immediately overwritten by the parsed fixture; kept as
    # in the original.
    data = '[]'
    # Fixture: expected resolver.resolve() output for one dbxref of each
    # configured provider type.
    data3 = '[{"locations": [{"type": "xml", "url": "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:1234&format=oboxml"}, {"type": "html", "url": "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:1234"}], "dbxref": "GO:1234"},{"dbxref": "UniProtKB/Swiss-Prot:P12345", "locations": [{"url": "http://www.uniprot.org/uniprot/P12345.xml", "type": "xml"}, {"url": "http://www.uniprot.org/uniprot/P12345", "type": "html"}]}, {"dbxref": "UniProtKB/TrEMBL:A2VB99", "locations": [{"url": "http://www.uniprot.org/uniprot/A2VB99.xml", "type": "xml"}, {"url": "http://www.uniprot.org/uniprot/A2VB99", "type": "html"}]}, {"dbxref": "taxon:452271", "locations": [{"url": "http://www.uniprot.org/taxonomy/452271.rdf", "type": "xml"}, {"url": "http://www.uniprot.org/taxonomy/452271", "type": "html"}]}, {"dbxref": "SO:0000704", "locations": [{"url": "http://www.sequenceontology.org/browser/current_svn/term/SO:0000704", "type": "html"}]}, {"dbxref": "RFAM:RF00360", "locations": [{"url": "http://rfam.xfam.org/family/RF00360?content-type=text%2Fxml", "type": "xml"}, {"url": "http://rfam.xfam.org/family/RF00360", "type": "html"}]}, {"dbxref": "pubmed:19037750", "locations": [{"url": "http://www.ncbi.nlm.nih.gov/pubmed/19037750", "type": "html"}]}, {"dbxref": "PFAM:PF00002", "locations": [{"url": "http://pfam.xfam.org/family/PF00002", "type": "html"}]}, {"dbxref": "PDB:4AJY", "locations": [{"url": "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=4AJY", "type": "xml"}, {"url": "http://www.rcsb.org/pdb/explore/explore.do?structureId=4AJY", "type": "html"}]}, {"dbxref": "InterPro:IPR002928", "locations": [{"url": "http://www.ebi.ac.uk/interpro/entry/IPR002928", "type": "html"}]}, {"dbxref": "http://www.google.de", "locations": [{"url": "http://www.google.de", "type": "html"}]}, {"dbxref": "https://www.google.de", "locations": [{"url": "https://www.google.de", "type": "html"}]}, {"dbxref": "GeneID:956582", "locations": [{"url": "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=956582&retmode=file", "type": "xml"}, {"url": "http://www.ncbi.nlm.nih.gov/gene/956582", "type": "html"}]}, {"dbxref": "GO:0097281", "locations": [{"url": "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0097281&format=oboxml", "type": "xml"}, {"url": "http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0097281", "type": "html"}]}, {"dbxref": "GI:731497", "locations": [{"url": "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=731497&retmode=file", "type": "xml"}, {"url": "http://www.ncbi.nlm.nih.gov/protein/GI:731497", "type": "html"}]}, {"dbxref": "EC:1.1.1.1", "locations": [{"url": "http://enzyme.expasy.org/EC/1.1.1.1", "type": "html"}]}]'
    data = json.loads(data3)
    # TODO(review): enable once resolver.check_urls exists — presumably
    # it HEAD-checks every location URL; verify against the resolver
    # module before uncommenting.
    #resolver.check_urls(data)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment