Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SOaAS
dbxref
Commits
b112d79b
Commit
b112d79b
authored
Dec 04, 2017
by
lmueller
Browse files
modefied retriever scripts so invalid dbxrefs do not cause a crash
parent
f4475346
Changes
5
Hide whitespace changes
Inline
Side-by-side
dbxref/resolver.py
View file @
b112d79b
...
...
@@ -23,7 +23,7 @@ def resolve(strings, check_existence=True):
for
s
in
strings
:
status
=
STATUS_NOT_CHECKED
if
check_existence
:
status
=
check_dbxref_exists
(
s
)
status
=
check_dbxref_exists
(
s
)
dbxref
=
convert_string_to_dbxref
(
s
)
if
dbxref
[
'db'
]
in
providers
:
provider
=
providers
[
dbxref
[
'db'
]]
...
...
@@ -51,7 +51,7 @@ def check_dbxref_exists(string):
return
exists
else
:
return
STATUS_CHECK_NOT_SUPPORTED
return
STATUS_UNSUPPORTED_DB
return
STATUS_UNSUPPORTED_DB
def
compile_url
(
template
,
dbxref
):
return
template
.
replace
(
'%i'
,
dbxref
[
'id'
]).
replace
(
'%d'
,
dbxref
[
'db'
])
...
...
providers.yaml
View file @
b112d79b
-
name
:
Enzyme
prefixes
:
[
"
EC"
]
prefixes
:
[
"
EC"
,
"
ec"
]
resources
:
html
:
[
"
https://enzyme.expasy.org/EC/%i"
]
text
:
[
"
https://enzyme.expasy.org/EC/%i.txt"
]
...
...
@@ -8,14 +8,14 @@
type
:
'
external'
location
:
'
scripts/retrieve_enzyme.py'
-
name
:
Gene Identifier
prefixes
:
[
GI
]
prefixes
:
[
"
GI
"
,
"
gi"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/protein/GI:%i"
]
xml
:
[
"
http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
-
name
:
Uniprot
prefixes
:
[
"
UniProtKB/TrEMBL"
,
"
UniProtKB/Swiss-Prot"
]
prefixes
:
[
"
UniProtKB/TrEMBL"
,
"
UniProtKB/Swiss-Prot"
,
"
uniprotkb/trembl"
,
"
uniprotkb/swiss-prot"
]
resources
:
html
:
[
"
http://www.uniprot.org/uniprot/%i"
]
xml
:
[
"
http://www.uniprot.org/uniprot/%i.xml"
]
...
...
@@ -30,7 +30,7 @@
xml
:
[
"
http://www.uniprot.org/taxonomy/%i.rdf"
]
check_existence
:
"
http://www.uniprot.org/taxonomy/%i"
-
name
:
SequenceOntology
prefixes
:
[
"
SO"
]
prefixes
:
[
"
SO"
,
"
so"
]
resources
:
html
:
[
"
http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
]
obo
:
[
"
http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"
]
...
...
@@ -41,7 +41,7 @@
type
:
'
external'
location
:
'
scripts/retrieve_sequence_ontology.py'
-
name
:
RFAM
prefixes
:
[
"
RFAM"
]
prefixes
:
[
"
RFAM"
,
"
rfam"
]
resources
:
html
:
[
"
http://rfam.xfam.org/family/%i"
]
xml
:
[
"
http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
]
...
...
@@ -53,7 +53,7 @@
html
:
[
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
]
check_existence
:
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
-
name
:
Protein Families
prefixes
:
[
"
PFAM"
]
prefixes
:
[
"
PFAM"
,
"
pfam"
]
resources
:
html
:
[
"
http://pfam.xfam.org/family/%i"
]
xml
:
[
"
http://pfam.xfam.org/family/%i?output=xml"
]
...
...
@@ -63,26 +63,26 @@
type
:
'
external'
location
:
'
scripts/retrieve_pfam.py'
-
name
:
PDB
prefixes
:
[
"
PDB"
]
prefixes
:
[
"
PDB"
,
"
pdb"
]
resources
:
html
:
[
"
http://www.rcsb.org/pdb/explore/explore.do?structureId=%i"
]
xml
:
[
"
http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
]
check_existence
:
"
http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
-
name
:
InterPro
prefixes
:
[
"
InterPro"
]
prefixes
:
[
"
InterPro"
,
"
interpro"
]
resources
:
html
:
[
"
http://www.ebi.ac.uk/interpro/entry/%i"
]
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
-
name
:
GeneID
prefixes
:
[
"
GeneID"
]
prefixes
:
[
"
GeneID"
,
"
geneid"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/gene/%i"
]
xml
:
[
"
http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
-
name
:
Gene Ontology
prefixes
:
[
"
GO"
]
prefixes
:
[
"
GO"
,
"
go"
]
resources
:
html
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"
]
xml
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"
]
...
...
scripts/retrieve_gene_ontology.py
View file @
b112d79b
...
...
@@ -25,13 +25,16 @@ def main():
logger
.
debug
(
'Content: %s'
,
r
.
text
)
d
=
json
.
loads
(
r
.
text
)
output
=
{
'id'
:
entry
[
'dbxref'
]}
if
args
.
basic
:
output
.
update
(
read_basic
(
d
))
if
args
.
relations
:
output
.
update
(
read_relations
(
d
))
if
not
args
.
basic
and
not
args
.
relations
:
output
.
update
(
read_basic
(
d
))
output
.
update
(
read_relations
(
d
))
if
not
'messages'
in
d
:
if
args
.
basic
:
output
.
update
(
read_basic
(
d
))
if
args
.
relations
:
output
.
update
(
read_relations
(
d
))
if
not
args
.
basic
and
not
args
.
relations
:
output
.
update
(
read_basic
(
d
))
output
.
update
(
read_relations
(
d
))
else
:
output
[
'messages'
]
=
d
[
'messages'
]
documents
.
append
(
output
)
print
(
json
.
dumps
(
documents
))
...
...
scripts/retrieve_sequence_ontology.py
View file @
b112d79b
...
...
@@ -38,15 +38,16 @@ def main():
elements
=
[]
else
:
elements
.
append
(
line
.
strip
())
d
=
resolve_elements
(
elements
)
output
=
{
'id'
:
entry
[
'dbxref'
]}
if
'id'
in
d
and
d
[
'id'
]
==
entry
[
'dbxref'
]
and
args
.
basic
:
output
.
update
(
format_output
(
d
))
if
args
.
relations
:
output
[
'relations'
]
=
resolve_relations
(
entry
)
if
'id'
in
d
and
d
[
'id'
]
==
entry
[
'dbxref'
]
and
not
args
.
basic
and
not
args
.
relations
:
output
.
update
(
format_output
(
d
))
output
[
'relations'
]
=
resolve_relations
(
entry
)
if
not
'<title>500 Internal Server Error</title>'
in
elements
:
d
=
resolve_elements
(
elements
)
if
'id'
in
d
and
d
[
'id'
]
==
entry
[
'dbxref'
]
and
args
.
basic
:
output
.
update
(
format_output
(
d
))
if
args
.
relations
:
output
[
'relations'
]
=
resolve_relations
(
entry
)
if
'id'
in
d
and
d
[
'id'
]
==
entry
[
'dbxref'
]
and
not
args
.
basic
and
not
args
.
relations
:
output
.
update
(
format_output
(
d
))
output
[
'relations'
]
=
resolve_relations
(
entry
)
documents
.
append
(
output
)
print
(
json
.
dumps
(
documents
))
...
...
scripts/retrieve_uniprot.py
View file @
b112d79b
...
...
@@ -37,21 +37,24 @@ def main():
logger
.
debug
(
'URL: %s'
,
xml_url
)
r
=
requests
.
get
(
xml_url
)
logger
.
debug
(
'Content: %s'
,
r
.
text
)
root
=
ET
.
fromstring
(
r
.
text
)
output
=
{
'id'
:
entry
[
'dbxref'
]}
for
child
in
root
.
findall
(
'uniprot:entry'
,
ns
):
if
args
.
basic
:
output
.
update
(
read_basic
(
child
))
if
args
.
sequence
:
output
.
update
(
read_sequence
(
child
))
if
args
.
organism
:
output
.
update
(
read_taxonomy
(
child
))
if
args
.
annotation
:
output
.
update
(
read_annotation
(
child
))
if
args
.
features
:
output
[
'features'
]
=
read_features
(
child
)
try
:
root
=
ET
.
fromstring
(
r
.
text
)
for
child
in
root
.
findall
(
'uniprot:entry'
,
ns
):
if
args
.
basic
:
output
.
update
(
read_basic
(
child
))
if
args
.
sequence
:
output
.
update
(
read_sequence
(
child
))
if
args
.
organism
:
output
.
update
(
read_taxonomy
(
child
))
if
args
.
annotation
:
output
.
update
(
read_annotation
(
child
))
if
args
.
features
:
output
[
'features'
]
=
read_features
(
child
)
except
:
pass
documents
.
append
(
output
)
print
(
json
.
dumps
(
documents
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment