Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SOaAS
dbxref
Commits
8eda5c8f
Commit
8eda5c8f
authored
Jan 23, 2018
by
Lukas Jelonek
Browse files
Fix bug:
#12
case sensitivity causes problems with retrievers
parent
e55f7b34
Changes
6
Hide whitespace changes
Inline
Side-by-side
dbxref/config.py
View file @
8eda5c8f
...
...
@@ -3,11 +3,14 @@ def get_providers_path():
return
pkg_resources
.
resource_filename
(
'dbxref'
,
'providers.yaml'
)
def
load_providers
():
return
_load_providers
(
get_providers_path
())
def
_load_providers
(
path
):
import
yaml
data
=
[]
with
open
(
get_providers_
path
()
)
as
data_file
:
with
open
(
path
)
as
data_file
:
data
=
yaml
.
load
(
data_file
)
return
index_providers
(
data
)
return
normalize_index
(
index_providers
(
data
)
)
def
index_providers
(
providers
):
index
=
{}
...
...
@@ -15,3 +18,16 @@ def index_providers(providers):
for
db
in
p
[
'prefixes'
]:
index
[
db
]
=
p
return
index
def
normalize_index
(
index
):
'create a new index with lowercase keys'
return
{
k
.
lower
():
v
for
(
k
,
v
)
in
index
.
items
()}
def
has_provider
(
provider
):
return
_has_provider
(
load_providers
(),
provider
)
def
_has_provider
(
providers
,
provider
):
return
provider
.
lower
()
in
providers
def
get_provider
(
provider
):
return
load_providers
()[
provider
.
lower
()]
dbxref/providers.yaml
View file @
8eda5c8f
-
name
:
Enzyme
prefixes
:
[
"
EC"
,
"
ec"
]
prefixes
:
[
"
EC"
]
resources
:
html
:
[
"
https://enzyme.expasy.org/EC/%i"
]
text
:
[
"
https://enzyme.expasy.org/EC/%i.txt"
]
...
...
@@ -8,14 +8,14 @@
type
:
'
internal'
location
:
'
dbxref.retrieve.enzyme'
-
name
:
Gene Identifier
prefixes
:
[
"
GI"
,
"
gi"
]
prefixes
:
[
"
GI"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/protein/GI:%i"
]
xml
:
[
"
http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
-
name
:
Uniprot
prefixes
:
[
"
UniProtKB/TrEMBL"
,
"
UniProtKB/Swiss-Prot"
,
"
uniprotkb/trembl"
,
"
uniprotkb/swiss-prot"
]
prefixes
:
[
"
UniProtKB/TrEMBL"
,
"
UniProtKB/Swiss-Prot"
]
resources
:
html
:
[
"
http://www.uniprot.org/uniprot/%i"
]
xml
:
[
"
http://www.uniprot.org/uniprot/%i.xml"
]
...
...
@@ -24,7 +24,7 @@
type
:
'
internal'
location
:
'
dbxref.retrieve.uniprot'
-
name
:
Taxonomy
prefixes
:
[
"
Taxon"
,
"
taxon"
,
"
taxid"
]
prefixes
:
[
"
Taxon"
,
"
taxid"
]
resources
:
html
:
[
"
http://www.uniprot.org/taxonomy/%i"
]
json
:
[
"
https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/%i"
]
...
...
@@ -35,7 +35,7 @@
type
:
'
internal'
location
:
'
dbxref.retrieve.taxonomy'
-
name
:
SequenceOntology
prefixes
:
[
"
SO"
,
"
so"
]
prefixes
:
[
"
SO"
]
resources
:
html
:
[
"
http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
]
obo
:
[
"
http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"
]
...
...
@@ -46,19 +46,19 @@
type
:
'
internal'
location
:
'
dbxref.retrieve.sequence_ontology'
-
name
:
RFAM
prefixes
:
[
"
RFAM"
,
"
rfam"
]
prefixes
:
[
"
RFAM"
]
resources
:
html
:
[
"
http://rfam.xfam.org/family/%i"
]
xml
:
[
"
http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
]
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
-
name
:
Pubmed
prefixes
:
[
"
pubmed"
,
"
Pubmed"
]
prefixes
:
[
"
Pubmed"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
]
check_existence
:
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
-
name
:
Protein Families
prefixes
:
[
"
PFAM"
,
"
Pfam"
,
"
pfam"
]
prefixes
:
[
"
PFAM"
]
resources
:
html
:
[
"
http://pfam.xfam.org/family/%i"
]
xml
:
[
"
http://pfam.xfam.org/family/%i?output=xml"
]
...
...
@@ -68,26 +68,26 @@
type
:
'
internal'
location
:
'
dbxref.retrieve.pfam'
-
name
:
PDB
prefixes
:
[
"
PDB"
,
"
pdb"
]
prefixes
:
[
"
PDB"
]
resources
:
html
:
[
"
http://www.rcsb.org/pdb/explore/explore.do?structureId=%i"
]
xml
:
[
"
http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
]
check_existence
:
"
http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
-
name
:
InterPro
prefixes
:
[
"
InterPro"
,
"
interpro"
]
prefixes
:
[
"
InterPro"
]
resources
:
html
:
[
"
http://www.ebi.ac.uk/interpro/entry/%i"
]
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
-
name
:
GeneID
prefixes
:
[
"
GeneID"
,
"
geneid"
]
prefixes
:
[
"
GeneID"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/gene/%i"
]
xml
:
[
"
http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
-
name
:
Gene Ontology
prefixes
:
[
"
GO"
,
"
go"
]
prefixes
:
[
"
GO"
]
resources
:
html
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"
]
xml
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"
]
...
...
dbxref/resolver.py
View file @
8eda5c8f
...
...
@@ -4,8 +4,7 @@ from cachecontrol.caches.file_cache import FileCache
import
logging
logger
=
logging
.
getLogger
(
__name__
)
from
dbxref.config
import
load_providers
providers
=
load_providers
()
from
dbxref
import
config
cache
=
FileCache
(
".web_cache"
,
forever
=
True
)
sess
=
CacheControl
(
requests
.
Session
(),
cache
=
cache
)
...
...
@@ -24,8 +23,8 @@ def resolve(dbxrefs, check_existence=True):
status
=
STATUS_NOT_CHECKED
if
check_existence
:
status
=
check_dbxref_exists
(
dbxref
)
if
dbxref
[
'db'
]
in
providers
:
provider
=
provider
s
[
dbxref
[
'db'
]
]
if
config
.
has_provider
(
dbxref
[
'db'
])
:
provider
=
config
.
get_
provider
(
dbxref
[
'db'
]
)
locations
=
{}
for
_type
in
provider
[
'resources'
]:
urls
=
[]
...
...
@@ -42,8 +41,8 @@ def convert_to_dbxrefs(strings):
return
list
(
map
(
convert_string_to_dbxref
,
strings
))
def
check_dbxref_exists
(
dbxref
):
if
dbxref
[
'db'
]
in
providers
:
provider
=
provider
s
[
dbxref
[
'db'
]
]
if
config
.
has_provider
(
dbxref
[
'db'
])
:
provider
=
config
.
get_
provider
(
dbxref
[
'db'
]
)
urls
=
[]
exists
=
STATUS_NOT_CHECKED
if
'check_existence'
in
provider
:
...
...
dbxref/retriever.py
View file @
8eda5c8f
import
logging
logger
=
logging
.
getLogger
(
__name__
)
from
dbxref
.config
import
load_providers
from
dbxref
import
config
from
itertools
import
groupby
import
json
providers
=
load_providers
()
def
retrieve
(
dbxrefs
):
sorted
(
dbxrefs
,
key
=
lambda
x
:
x
[
'db'
]
)
sorted
(
dbxrefs
,
key
=
lambda
x
:
x
[
'db'
]
.
lower
())
# normalize db to lowercase to allow differently cased notations
results
=
[]
for
key
,
dbxrefs
in
groupby
(
dbxrefs
,
lambda
x
:
x
[
'db'
]):
if
key
.
lower
()
in
providers
and
'retriever'
in
provider
s
[
key
.
lower
()]
:
provider
=
provider
s
[
key
.
lower
()]
if
config
.
has_
provider
(
key
)
:
provider
=
config
.
get_
provider
(
key
)
logger
.
debug
(
'{0} is supported'
.
format
(
key
))
if
provider
[
'retriever'
][
'type'
]
==
'external'
:
results
.
extend
(
load_with_external_provider
(
provider
,
list
(
dbxrefs
)))
...
...
tests/test_config.py
View file @
8eda5c8f
...
...
@@ -9,3 +9,12 @@ class TestConfig(unittest.TestCase):
def
test_index_providers
(
self
):
data
=
[{
'name'
:
'test'
,
'prefixes'
:[
'a'
,
'b'
]}]
self
.
assertEqual
(
config
.
index_providers
(
data
),
{
'a'
:
data
[
0
],
'b'
:
data
[
0
]})
def
test_normalize_index
(
self
):
index
=
{
'A'
:
'some value'
,
'b'
:
'some other value'
}
self
.
assertEqual
(
config
.
normalize_index
(
index
),
{
'a'
:
'some value'
,
'b'
:
'some other value'
})
def
test_has_provider
(
self
):
index
=
config
.
normalize_index
({
'A'
:
'some value'
,
'b'
:
'some other value'
})
self
.
assertTrue
(
config
.
_has_provider
(
index
,
'B'
))
self
.
assertTrue
(
config
.
_has_provider
(
index
,
'a'
))
tests/test_retriever.py
0 → 100644
View file @
8eda5c8f
import
unittest
from
dbxref
import
retriever
,
resolver
class
TestDbxrefResolve
(
unittest
.
TestCase
):
def
test_different_case_database_prefix
(
self
):
entries
=
resolver
.
convert_to_dbxrefs
([
'PFAM:PF00002'
,
'Pfam:PF00002'
,
'pfam:PF00002'
])
documents
=
retriever
.
retrieve
(
entries
)
for
d
in
documents
:
with
self
.
subTest
(
d
=
d
):
self
.
assertTrue
(
'description'
in
d
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment