Commit 8eda5c8f authored by Lukas Jelonek
Browse files

Fix bug: #12 case sensitivity causes problems with retrievers

parent e55f7b34
......@@ -3,11 +3,14 @@ def get_providers_path():
return pkg_resources.resource_filename('dbxref', 'providers.yaml')
def load_providers():
    """Load the provider index from the file named by get_providers_path()."""
    providers_file = get_providers_path()
    return _load_providers(providers_file)
def _load_providers(path):
    """Read the provider definitions stored at *path* and index them.

    The YAML document is parsed, indexed by database prefix with
    index_providers() and finally passed through normalize_index() so that
    all keys of the returned dict are lowercase.

    path -- location of the YAML file to read
    returns -- dict mapping a lowercase prefix to its provider entry
    """
    # Imported here (as in the original) so yaml is only needed when a
    # provider file is actually loaded.
    import yaml

    with open(path) as data_file:
        # safe_load: yaml.load() without an explicit Loader is deprecated
        # since PyYAML 5.1 and raises a TypeError with PyYAML 6. The provider
        # file only holds plain mappings, lists and strings, so the safe
        # loader is sufficient.
        data = yaml.safe_load(data_file)
    return normalize_index(index_providers(data))
def index_providers(providers):
index = {}
......@@ -15,3 +18,16 @@ def index_providers(providers):
for db in p['prefixes']:
index[db] = p
return index
def normalize_index(index):
    """Return a new dict holding the entries of *index* under lowercased keys.

    The given mapping is not modified. If two keys differ only in case, the
    one iterated last wins.
    """
    lowered = {}
    for key, value in index.items():
        lowered[key.lower()] = value
    return lowered
def has_provider(provider):
    """Tell whether the loaded providers contain *provider*, ignoring case."""
    known_providers = load_providers()
    return _has_provider(known_providers, provider)
def _has_provider(providers, provider):
return provider.lower() in providers
def get_provider(provider):
    """Return the provider entry stored under the lowercased *provider* name.

    The lookup is a plain subscript on the loaded provider index, so an
    unknown name is not handled here.
    """
    provider_index = load_providers()
    return provider_index[provider.lower()]
- name: Enzyme
prefixes: ["EC", "ec"]
prefixes: ["EC"]
resources:
html: ["https://enzyme.expasy.org/EC/%i"]
text: ["https://enzyme.expasy.org/EC/%i.txt"]
......@@ -8,14 +8,14 @@
type: 'internal'
location: 'dbxref.retrieve.enzyme'
- name: Gene Identifier
prefixes: ["GI", "gi"]
prefixes: ["GI"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/protein/GI:%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
- name: Uniprot
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot", "uniprotkb/trembl", "uniprotkb/swiss-prot"]
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
resources:
html: [ "http://www.uniprot.org/uniprot/%i"]
xml: [ "http://www.uniprot.org/uniprot/%i.xml"]
......@@ -24,7 +24,7 @@
type: 'internal'
location: 'dbxref.retrieve.uniprot'
- name: Taxonomy
prefixes: ["Taxon", "taxon", "taxid"]
prefixes: ["Taxon", "taxid"]
resources:
html: ["http://www.uniprot.org/taxonomy/%i"]
json: ["https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/%i"]
......@@ -35,7 +35,7 @@
type: 'internal'
location: 'dbxref.retrieve.taxonomy'
- name: SequenceOntology
prefixes: ["SO", "so"]
prefixes: ["SO"]
resources:
html: ["http://www.sequenceontology.org/browser/current_svn/term/SO:%i"]
obo: ["http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"]
......@@ -46,19 +46,19 @@
type: 'internal'
location: 'dbxref.retrieve.sequence_ontology'
- name: RFAM
prefixes: ["RFAM", "rfam"]
prefixes: ["RFAM"]
resources:
html: ["http://rfam.xfam.org/family/%i"]
xml: ["http://rfam.xfam.org/family/%i?content-type=text%2Fxml"]
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
- name: Pubmed
prefixes: ["pubmed", "Pubmed"]
prefixes: ["Pubmed"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/pubmed/%i"]
check_existence: "http://www.ncbi.nlm.nih.gov/pubmed/%i"
- name: Protein Families
prefixes: ["PFAM", "Pfam", "pfam"]
prefixes: ["PFAM"]
resources:
html: ["http://pfam.xfam.org/family/%i"]
xml: ["http://pfam.xfam.org/family/%i?output=xml"]
......@@ -68,26 +68,26 @@
type: 'internal'
location: 'dbxref.retrieve.pfam'
- name: PDB
prefixes: ["PDB", "pdb"]
prefixes: ["PDB"]
resources:
html: ["http://www.rcsb.org/pdb/explore/explore.do?structureId=%i"]
xml: ["http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"]
check_existence: "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
- name: InterPro
prefixes: ["InterPro", "interpro"]
prefixes: ["InterPro"]
resources:
html: ["http://www.ebi.ac.uk/interpro/entry/%i"]
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
- name: GeneID
prefixes: ["GeneID", "geneid"]
prefixes: ["GeneID"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/gene/%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
- name: Gene Ontology
prefixes: ["GO", "go"]
prefixes: ["GO"]
resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
......
......@@ -4,8 +4,7 @@ from cachecontrol.caches.file_cache import FileCache
import logging
logger = logging.getLogger(__name__)
from dbxref.config import load_providers
providers = load_providers()
from dbxref import config
cache = FileCache(".web_cache", forever=True)
sess = CacheControl(requests.Session(), cache=cache)
......@@ -24,8 +23,8 @@ def resolve(dbxrefs, check_existence=True):
status = STATUS_NOT_CHECKED
if check_existence:
status = check_dbxref_exists(dbxref)
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
if config.has_provider(dbxref['db']):
provider = config.get_provider(dbxref['db'])
locations = {}
for _type in provider['resources']:
urls = []
......@@ -42,8 +41,8 @@ def convert_to_dbxrefs(strings):
return list(map(convert_string_to_dbxref, strings))
def check_dbxref_exists(dbxref):
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
if config.has_provider(dbxref['db']):
provider = config.get_provider(dbxref['db'])
urls = []
exists = STATUS_NOT_CHECKED
if 'check_existence' in provider:
......
import logging
logger = logging.getLogger(__name__)
from dbxref.config import load_providers
from dbxref import config
from itertools import groupby
import json
providers = load_providers()
def retrieve(dbxrefs):
sorted(dbxrefs, key=lambda x: x['db'])
sorted(dbxrefs, key=lambda x: x['db'].lower()) # normalize db to lowercase to allow differently cased notations
results = []
for key, dbxrefs in groupby(dbxrefs, lambda x: x['db']):
if key.lower() in providers and 'retriever' in providers[key.lower()]:
provider = providers[key.lower()]
if config.has_provider(key):
provider = config.get_provider(key)
logger.debug('{0} is supported'.format(key))
if provider['retriever']['type'] == 'external':
results.extend( load_with_external_provider(provider, list(dbxrefs)))
......
......@@ -9,3 +9,12 @@ class TestConfig(unittest.TestCase):
def test_index_providers(self):
    """Every prefix of a provider must point to that provider entry."""
    provider = {'name': 'test', 'prefixes': ['a', 'b']}
    data = [provider]
    expected = {'a': provider, 'b': provider}
    self.assertEqual(config.index_providers(data), expected)
def test_normalize_index(self):
    """Keys must come back lowercased while the values stay untouched."""
    mixed_case = {'A': 'some value', 'b': 'some other value'}
    expected = {'a': 'some value', 'b': 'some other value'}
    self.assertEqual(config.normalize_index(mixed_case), expected)
def test_has_provider(self):
    """A provider must be found no matter how the queried name is cased."""
    index = config.normalize_index({'A': 'some value', 'b': 'some other value'})
    for name in ('B', 'a'):
        self.assertTrue(config._has_provider(index, name))
import unittest
from dbxref import retriever, resolver
class TestDbxrefResolve(unittest.TestCase):
    """Checks that retrieval works for differently cased database prefixes."""

    def test_different_case_database_prefix(self):
        """The same Pfam entry is requested with three casings of the prefix;
        every retrieved document must carry a 'description'."""
        entries = resolver.convert_to_dbxrefs(['PFAM:PF00002', 'Pfam:PF00002', 'pfam:PF00002'])
        # Materialize the result so the emptiness check below cannot consume
        # a lazy iterable before the loop runs.
        documents = list(retriever.retrieve(entries))
        # Without this guard the loop would pass vacuously when nothing was
        # retrieved, which is exactly what a prefix mismatch would cause.
        self.assertTrue(documents, 'retriever returned no documents')
        for d in documents:
            with self.subTest(d=d):
                # assertIn reports the offending document on failure.
                self.assertIn('description', d)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment