Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SOaAS
dbxref
Commits
941bb534
Commit
941bb534
authored
Jan 23, 2018
by
Lukas Jelonek
Browse files
Merge branch 'develop' into 'master'
Develop See merge request SOaAS/dbxref!2
parents
4da4423e
3d7ce949
Changes
33
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
941bb534
*.pyc
docs/build/
build/
dist/
*.egg-info/
*.egg
*.py[cod]
__pycache__/
*.so
*~
docs/_build/
CHANGELOG
View file @
941bb534
[0.1]
Implement basic structure
Integrate several databases
LICENSE
View file @
941bb534
The MIT License (MIT)
Copyright (c) 2017
SOaAS
Copyright (c) 2017
Lukas Jelonek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...
...
README.md
View file @
941bb534
# DB XREF resolver and retriever tool
A tool that resolves db_xrefs into URLs and that retrieves the data as json documents.
A tool that resolves database cross references (dbxrefs). It can return a list of
locations where the cross reference points to in different formats, like HTML,
XML, flat file or json. It can also retrieve the data for some of the supported
databases and convert it into json.
# Getting started (Setup)
The intended audience for this tool is bioinformaticians who need to collect
data for dbxrefs and postprocess it. By returning everything in json format the
need for normalization and special parsing of the data is reduced.
# Getting started for development (Setup)
Prerequisites:
...
...
@@ -11,10 +18,31 @@ Prerequisites:
Supported bioinformatic databases:
*
None yet
*
Ontologies
*
Gene Ontology
Checkout the repository:
~~~~
git clone git@git.computational.bio.uni-giessen.de:SOaAS/dbxref.git
~~~~
Setup a virtualenv for development and install it in editable mode:
~~~~
# install in development environment
virtualenv --python=python3 venv; source venv/bin/activate;
pip install -e .
# run tests
python3 setup.py test
# compile documentation
python3 setup.py build_sphinx
~~~~
Use the application:
~~~~
dbxref resolve GO:0097281
~~~~
bin/dbxref
deleted
100755 → 0
View file @
4da4423e
#!/usr/bin/env python3
"""Launcher for dbxref when run directly from a source checkout."""
import os
import sys

# Make the package importable from the repository root (the parent of bin/).
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from dbxref.main import main

main()
dbxref/config.py
View file @
941bb534
def get_install_location():
    """Finds the location directory of the tool"""
    import os
    here = os.path.realpath(__file__)
    # Tool root is the parent of the directory containing this module.
    return os.path.dirname(os.path.dirname(here))
def get_providers_path():
    """Locate the providers.yaml file bundled with the dbxref package."""
    import pkg_resources
    return pkg_resources.resource_filename('dbxref', 'providers.yaml')
def load_providers():
    """Read the packaged providers.yaml and return the prefix index."""
    path = get_providers_path()
    return _load_providers(path)
def _load_providers(path):
    """Load provider definitions from the YAML file at *path*.

    Returns an index that maps lowercased database prefixes to their
    provider entries (built via index_providers and normalize_index).
    """
    import yaml
    data = []
    with open(path) as data_file:
        # NOTE(review): yaml.load without an explicit Loader is deprecated in
        # modern PyYAML and unsafe on untrusted input; the file here ships
        # with the package, but safe_load would be preferable — confirm.
        data = yaml.load(data_file)
    return normalize_index(index_providers(data))
def
index_providers
(
providers
):
index
=
{}
...
...
@@ -19,3 +18,16 @@ def index_providers(providers):
for
db
in
p
[
'prefixes'
]:
index
[
db
]
=
p
return
index
def normalize_index(index):
    """Create a new index whose keys are all lowercased copies of the input keys."""
    lowered = {}
    for key, value in index.items():
        lowered[key.lower()] = value
    return lowered
def has_provider(provider):
    """Check whether the given database prefix is configured (case-insensitive)."""
    return _has_provider(load_providers(), provider)
def
_has_provider
(
providers
,
provider
):
return
provider
.
lower
()
in
providers
def get_provider(provider):
    """Look up the provider entry for a database prefix (case-insensitive)."""
    providers = load_providers()
    return providers[provider.lower()]
dbxref/main.py
View file @
941bb534
...
...
@@ -2,6 +2,8 @@
import
argparse
import
os
import
logging
from
dbxref
import
resolver
import
json
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
'Lookup locations of database cross references and retrieve them as json'
)
...
...
@@ -38,13 +40,11 @@ def info(args, config):
print
(
'info'
)
def resolve(args, config):
    """CLI handler: resolve the dbxref strings to location maps and print JSON.

    The scraped span contained both the pre- and post-merge bodies; this is
    the coherent post-merge version (strings are converted to dbxref maps
    before resolution).
    """
    print(json.dumps(resolver.resolve(resolver.convert_to_dbxrefs(args.dbxrefs),
                                      check_existence=args.no_check)))
def retrieve(args, config):
    """CLI handler: retrieve the documents behind the dbxrefs and print JSON.

    The scraped span contained both the pre- and post-merge bodies; this is
    the coherent post-merge version (result printed as JSON, strings
    converted to dbxref maps first).
    """
    from dbxref import retriever
    print(json.dumps(retriever.retrieve(resolver.convert_to_dbxrefs(args.dbxrefs))))
if
__name__
==
"__main__"
:
main
()
providers.yaml
→
dbxref/
providers.yaml
View file @
941bb534
-
name
:
Enzyme
prefixes
:
[
EC
,
Enzyme
]
prefixes
:
[
"
EC
"
]
resources
:
html
:
[
"
http://enzyme.expasy.org/EC/%i"
]
check_existence
:
"
http://enzyme.expasy.org/EC/%i.txt"
html
:
[
"
https://enzyme.expasy.org/EC/%i"
]
text
:
[
"
https://enzyme.expasy.org/EC/%i.txt"
]
check_existence
:
"
https://enzyme.expasy.org/EC/%i.txt"
retriever
:
type
:
'
internal'
location
:
'
dbxref.retrieve.enzyme'
-
name
:
Gene Identifier
prefixes
:
[
GI
]
prefixes
:
[
"
GI
"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/protein/GI:%i"
]
xml
:
[
"
http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
]
...
...
@@ -17,20 +21,30 @@
xml
:
[
"
http://www.uniprot.org/uniprot/%i.xml"
]
check_existence
:
"
http://www.uniprot.org/uniprot/%i.xml"
retriever
:
type
:
'
ex
ternal'
location
:
'
scripts/
retrieve
_
uniprot
.py
'
type
:
'
in
ternal'
location
:
'
dbxref.
retrieve
.
uniprot'
-
name
:
Taxonomy
prefixes
:
[
"
Taxon"
,
"
taxon"
,
"
taxid"
]
prefixes
:
[
"
Taxon"
,
"
taxid"
]
resources
:
html
:
[
"
http://www.uniprot.org/taxonomy/%i"
]
json
:
[
"
https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/%i"
]
xml
:
[
"
http://www.uniprot.org/taxonomy/%i.rdf"
]
xml_ncbi
:
[
"
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=%i"
]
check_existence
:
"
http://www.uniprot.org/taxonomy/%i"
retriever
:
type
:
'
internal'
location
:
'
dbxref.retrieve.taxonomy'
-
name
:
SequenceOntology
prefixes
:
[
"
SO"
]
resources
:
html
:
[
"
http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
]
obo
:
[
"
http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"
]
tsv
:
[
"
http://www.sequenceontology.org/browser/current_svn/export/term_only/csv_text/SO:%i"
]
# does not work
# check_existence: "http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
retriever
:
type
:
'
internal'
location
:
'
dbxref.retrieve.sequence_ontology'
-
name
:
RFAM
prefixes
:
[
"
RFAM"
]
resources
:
...
...
@@ -39,7 +53,7 @@
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
-
name
:
Pubmed
prefixes
:
[
"
pubmed"
,
"
Pubmed"
]
prefixes
:
[
"
Pubmed"
]
resources
:
html
:
[
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
]
check_existence
:
"
http://www.ncbi.nlm.nih.gov/pubmed/%i"
...
...
@@ -50,6 +64,9 @@
xml
:
[
"
http://pfam.xfam.org/family/%i?output=xml"
]
# does not work
# check_existence: "http://pfam.xfam.org/family/%i?content-type=text%2Fxml"
retriever
:
type
:
'
internal'
location
:
'
dbxref.retrieve.pfam'
-
name
:
PDB
prefixes
:
[
"
PDB"
]
resources
:
...
...
@@ -74,7 +91,11 @@
resources
:
html
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"
]
xml
:
[
"
http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"
]
json
:
[
"
https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:%i/complete"
]
check_existence
:
"
http://purl.obolibrary.org/obo/GO_%i"
retriever
:
type
:
'
internal'
location
:
'
dbxref.retrieve.gene_ontology'
-
name
:
HTTP
prefixes
:
[
"
http"
,
"
https"
]
resources
:
...
...
dbxref/resolver.py
View file @
941bb534
...
...
@@ -4,8 +4,7 @@ from cachecontrol.caches.file_cache import FileCache
import
logging
logger
=
logging
.
getLogger
(
__name__
)
from
dbxref.config
import
load_providers
providers
=
load_providers
()
from
dbxref
import
config
cache
=
FileCache
(
".web_cache"
,
forever
=
True
)
sess
=
CacheControl
(
requests
.
Session
(),
cache
=
cache
)
...
...
@@ -18,15 +17,14 @@ STATUS_CHECK_NOT_SUPPORTED='check of status not supported'
STATUS_CHECK_TIMEOUT
=
'status check timed out'
STATUS_UNSUPPORTED_DB
=
'database unsupported'
def
resolve
(
string
s
,
check_existence
=
True
):
def
resolve
(
dbxref
s
,
check_existence
=
True
):
results
=
[]
for
s
in
string
s
:
for
dbxref
in
dbxref
s
:
status
=
STATUS_NOT_CHECKED
if
check_existence
:
status
=
check_dbxref_exists
(
s
)
dbxref
=
convert_string_to_dbxref
(
s
)
if
dbxref
[
'db'
]
in
providers
:
provider
=
providers
[
dbxref
[
'db'
]]
status
=
check_dbxref_exists
(
dbxref
)
if
config
.
has_provider
(
dbxref
[
'db'
]):
provider
=
config
.
get_provider
(
dbxref
[
'db'
])
locations
=
{}
for
_type
in
provider
[
'resources'
]:
urls
=
[]
...
...
@@ -38,10 +36,13 @@ def resolve(strings, check_existence=True):
results
.
append
({
'dbxref'
:
dbxref
[
'db'
]
+
':'
+
dbxref
[
'id'
],
'status'
:
STATUS_UNSUPPORTED_DB
})
return
results
def
check_dbxref_exists
(
string
):
dbxref
=
convert_string_to_dbxref
(
string
)
if
dbxref
[
'db'
]
in
providers
:
provider
=
providers
[
dbxref
[
'db'
]]
def convert_to_dbxrefs(strings):
    """Convert a list of dbxref strings into maps with 'db' and 'id' attributes."""
    return [convert_string_to_dbxref(s) for s in strings]
def
check_dbxref_exists
(
dbxref
):
if
config
.
has_provider
(
dbxref
[
'db'
]):
provider
=
config
.
get_provider
(
dbxref
[
'db'
])
urls
=
[]
exists
=
STATUS_NOT_CHECKED
if
'check_existence'
in
provider
:
...
...
@@ -51,7 +52,7 @@ def check_dbxref_exists(string):
return
exists
else
:
return
STATUS_CHECK_NOT_SUPPORTED
return
STATUS_UNSUPPORTED_DB
return
STATUS_UNSUPPORTED_DB
def compile_url(template, dbxref):
    """Fill a URL template: '%i' becomes the dbxref id, '%d' the database prefix."""
    url = template.replace('%i', dbxref['id'])
    return url.replace('%d', dbxref['db'])
...
...
config.yaml
→
dbxref/retrieve/__init__.py
View file @
941bb534
File moved
dbxref/retrieve/enzyme.py
0 → 100755
View file @
941bb534
#!/usr/bin/env python3
import
dbxref.resolver
import
requests
import
logging
import
json
import
argparse
import
re
import
lxml.html
as
HTML
logger
=
logging
.
getLogger
(
__name__
)
def main():
    """CLI entry point: parse options, fetch enzyme entries, print them as JSON."""
    parser = argparse.ArgumentParser(description='Retrieve enzyme text documents for dbxrefs and convert them into json')
    parser.add_argument('--basic', '-b', action='store_true', help='Include id, definition, name and synonyms')
    parser.add_argument('--references', '-r', action='store_true', help='Include id, uniprot dbxrefs')
    parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # Enable all options by default if they are not set
    if not args.basic and not args.references:
        args.basic = True
        args.references = True
    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
    documents = retrieve(dbxrefs, basic=args.basic, references=args.references)
    print(json.dumps(documents))
def retrieve(dbxrefs, basic=True, references=True):
    """Retrieve the data for the dbxrefs and return a list of documents.

    Each dbxref is resolved to its plain-text URL and fetched; a successful
    fetch is parsed as an ENZYME flat file, while an HTML response is
    treated as an error page and reported via a 'message' field.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        txt_url = entry['locations']['text'][0]
        logger.debug('URL: %s', txt_url)
        r = requests.get(txt_url)
        logger.debug('Content: %s', r.text)
        try:
            # We expect a plain text document; if the response parses as an
            # HTML document, something went wrong and we assume it is an
            # error page.
            ls = r.text.replace('\n', ' ')
            html = HTML.document_fromstring(ls).head.text_content()
            # When everything is fine the line above raises (no HTML head),
            # so reaching this point means we received an error page.
            output = {'dbxref': entry['dbxref']}
            output['message'] = html
            if output['message'] == ' 500 Internal Server Error ':
                output['message'] = '500 Internal Server Error; probably invalid ID'
            documents.append(output)
        except Exception:
            # FIX: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit. The exception path is the
            # success path here: the body was not HTML, so parse it.
            retrieved_entry = parse_flat_file(r.text)
            retrieved_entry['dbxref'] = entry['dbxref']
            documents.append(retrieved_entry)
    return documents
def parse_flat_file(text):
    """Parse an ENZYME flat-file entry into a dict.

    Recognised line codes: DE (name), AN (alternative names), CA (catalyzed
    reactions), CF (cofactors), CC (comments), PR (PROSITE references) and
    DR (UniProtKB/Swiss-Prot cross references).
    """
    # NOTE(review): field delimiter reconstructed as three spaces per the
    # ENZYME flat-file format ("DE   name"); the scrape collapsed runs of
    # whitespace — confirm against the upstream source.
    # FIX: regex is compiled once (raw string, hoisted out of the loop)
    # instead of re-compiled for every CA line.
    ca_numbered = re.compile(r'^\(\d+\) ')
    lines = text.split('\n')
    comment = ""
    reaction = ""
    output = {}
    refs = []
    for line in lines:
        line_elements = line.strip().split('   ')
        if line_elements[0] == 'DE':
            output['name'] = line_elements[1]
        if line_elements[0] == 'AN':
            if 'alternative_names' in output:
                output['alternative_names'].append(line_elements[1])
            else:
                output['alternative_names'] = [line_elements[1]]
        if line_elements[0] == 'CA':
            if re.match(ca_numbered, line_elements[1]):
                # Numbered reaction "(1) ..." starts a new reaction; flush
                # the one collected so far.
                if len(reaction) == 0:
                    reaction += line_elements[1][line_elements[1].find(' ') + 1:]
                else:
                    if 'reaction_catalyzed' in output:
                        output['reaction_catalyzed'].append(reaction)
                    else:
                        output['reaction_catalyzed'] = [reaction]
                    reaction = line_elements[1][line_elements[1].find(' ') + 1:]
            else:
                # Continuation line: join onto the current reaction text.
                if len(reaction) == 0:
                    reaction = line_elements[1]
                else:
                    reaction = reaction + " " + line_elements[1]
        if line_elements[0] == 'CF':
            if 'cofactors' in output:
                output['cofactors'].append(line_elements[1])
            else:
                output['cofactors'] = [line_elements[1]]
        if line_elements[0] == 'CC':
            if "-!-" in line_elements[1]:
                # "-!-" marks the start of a new comment; flush the old one.
                if len(comment) == 0:
                    comment += line_elements[1][4:]
                else:
                    if 'comments' in output:
                        output['comments'].append(comment)
                    else:
                        output['comments'] = [comment]
                    comment = line_elements[1][4:]
            else:
                comment += line_elements[2]
        if line_elements[0] == 'PR':
            link = line_elements[1].replace(';', '').split()
            if 'prosite' in output:
                output['prosite'].append(link[1])
            else:
                output['prosite'] = [link[1]]
        if line_elements[0] == 'DR':
            for i in range(1, len(line_elements)):
                for e in line_elements[i].split('; '):
                    if len(e) > 1:
                        l = e.split(', ')
                        l[1] = l[1].replace(' ', '')
                        l[1] = l[1].replace(';', '')
                        refs.append('UniProtKB/Swiss-Prot:' + l[0])
    output['dbxrefs'] = refs
    # Flush any reaction/comment still being accumulated at end of input.
    if len(reaction) > 0:
        if 'reaction_catalyzed' in output:
            output['reaction_catalyzed'].append(reaction)
        else:
            output['reaction_catalyzed'] = [reaction]
    if len(comment) > 0:
        if 'comments' in output:
            output['comments'].append(comment)
        else:
            output['comments'] = [comment]
    return output
def read_basic(d):
    """Extract message, name, synonyms and a definition summary from a
    parsed enzyme document.

    The definition is the single collected field (reaction_catalyzed,
    cofactors or comments) when only one is present, otherwise a dict of
    all of them. Note: pops 'alternative_names' from *d*.
    """
    out = {}
    definition = {}
    if 'message' in d:
        out['message'] = d['message']
    if 'name' in d:
        out['name'] = d['name']
    if 'alternative_names' in d:
        out['synonyms'] = d.pop('alternative_names')
    if 'reaction_catalyzed' in d:
        definition['reaction_catalyzed'] = d['reaction_catalyzed']
    if 'cofactors' in d:
        definition['cofactors'] = d['cofactors']
    if 'comments' in d:
        definition['comments'] = d['comments']
    if len(definition) == 1:
        # BUG FIX: was `definition[0]`, which raises KeyError — `definition`
        # is a dict keyed by field names, not a list. Take the lone value.
        out['definition'] = next(iter(definition.values()))
    elif len(definition) > 1:
        out['definition'] = definition
    return (out)
def format_output(d, basic, references):
    """Assemble the requested output sections for one parsed document."""
    result = {'id': d['dbxref']}
    neither = not basic and not references
    if basic or neither:
        result.update(read_basic(d))
    if references:
        result['dbxrefs'] = d['dbxrefs']
    elif neither and 'dbxrefs' in d:
        # With no flags at all, include the references only when present.
        result['dbxrefs'] = d['dbxrefs']
    return result
if
__name__
==
'__main__'
:
main
()
dbxref/retrieve/gene_ontology.py
0 → 100755
View file @
941bb534
#!/usr/bin/env python3
import
dbxref.resolver
import
requests
import
logging
import
json
import
argparse
logger
=
logging
.
getLogger
(
__name__
)
def main():
    """CLI entry point: parse options, fetch GO term data, print it as JSON."""
    parser = argparse.ArgumentParser(description='Retrieve gene ontology documents for dbxrefs and convert them into json')
    parser.add_argument('--basic', '-b', action='store_true', help='Include id, definition, name and synonyms')
    parser.add_argument('--relations', '-r', action='store_true', help='Include id, parents and children')
    parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # Default when no flags are given: basic information only.
    if not args.basic and not args.relations:
        args.basic = True
        args.relations = False
    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
    documents = retrieve(dbxrefs, basic=args.basic, relations=args.relations)
    print(json.dumps(documents))
def retrieve(dbxrefs, basic=True, relations=False):
    """Fetch the QuickGO JSON document for every dbxref and summarise it."""
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        json_url = entry['locations']['json'][0]
        logger.debug('URL: %s', json_url)
        r = requests.get(json_url)
        logger.debug('Content: %s', r.text)
        d = json.loads(r.text)
        output = {'id': entry['dbxref']}
        if 'messages' in d:
            # Service-level errors come back as a list of messages.
            output['message'] = '; '.join(d['messages'])
        else:
            if basic:
                output.update(read_basic(d))
            if relations:
                output.update(read_relations(d))
        documents.append(output)
    return documents
def read_basic(d):
    """Pull definition, name and synonyms out of a QuickGO term document."""
    term = d['results'][0]
    out = {'definition': term['definition']['text'], 'synonyms': []}
    out['name'] = term['name']
    if 'synonyms' in term:
        out['synonyms'] = term['synonyms']
    return out
def read_relations(d):
    """Collect child and parent relations from a QuickGO term document."""
    term = d['results'][0]
    relations = {'children': [], 'parents': []}
    if 'children' in term:
        relations['children'] = term['children']
        for child in relations['children']:
            # QuickGO names the field 'relation'; we expose it as 'type'.
            child['type'] = child.pop('relation')
    if 'history' in term:
        relations['parents'] = parse_history(term['history'])
    return {'relations': relations}
def parse_history(h):
    """Replay a term's RELATION history events to derive its current parents."""
    current = []
    for event in reversed(h):
        if event['category'] != "RELATION":
            continue
        action = event['action']
        if action == "Updated" or action == "Added":
            current.append(event)
        elif action == "Deleted":
            # Remove the most recently added event with matching text.
            for i in reversed(range(len(current))):
                if current[i]['text'] == event['text']:
                    del current[i]
                    break
    return [parse_text(event['text']) for event in current]
def parse_text(t):
    """Split a history text such as 'is a GO:0001234' into relation type and id."""
    out = {}
    relation_words = []
    for word in t.split(' '):
        if 'GO:' in word:
            out['id'] = word
            break
        relation_words.append(word)
    out['type'] = "_".join(relation_words)
    return out
if
__name__
==
'__main__'
:
main
()
scripts
/retrieve
_
pfam.py
→
dbxref
/retrieve
/
pfam.py
View file @
941bb534
#!/usr/bin/env python3
import
env
import
dbxref.config
import
dbxref.resolver
import
requests
import
xml.etree.ElementTree
as
ET
...
...
@@ -12,7 +10,7 @@ import argparse
logger
=
logging
.
getLogger
(
__name__
)
#logger.setLevel(logging.DEBUG)
ns
=
{
'pfam'
:
'http://pfam.xfam.org/'
}
ns
=
{
'pfam'
:
'http
s
://pfam.xfam.org/'
}
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
'Retrieve pfam xml documents for dbxrefs and convert them into json'
)
...
...
@@ -20,37 +18,46 @@ def main():
parser
.
add_argument
(
'--annotation'
,
'-a'
,
action
=
'store_true'
,
help
=
'Include annotation'
)
parser
.
add_argument
(
'dbxrefs'
,
nargs
=
argparse
.
REMAINDER
)
args
=
parser
.
parse_args
()
if
not
(
args
.
basic
or
args
.
annotation
):
args
.
basic
=
True
args
.
annotation
=
True
resolved
=
dbxref
.
resolver
.
resolve
(
args
.
dbxrefs
,
check_existence
=
False
)
dbxrefs
=
dbxref
.
resolver
.
convert_to_dbxrefs
(
args
.
dbxrefs
)
documents
=
retrieve
(
dbxrefs
,
basic
=
args
.
basic
,
annotation
=
args
.
annotation
)
print
(
json
.
dumps
(
documents
))
def
retrieve
(
dbxrefs
,
basic
=
True
,
annotation
=
True
):
resolved
=
dbxref
.
resolver
.
resolve
(
dbxrefs
,
check_existence
=
False
)
documents
=
[]
for
entry
in
resolved
:
if
'xml'
in
entry
[
'locations'
]:
xml_url
=
entry
[
'locations'
][
'xml'
][
0
]
logger
.
debug
(
'URL: %s'
,
xml_url
)
r
=
requests
.
get
(
xml_url
)
logger
.
debug
(
'Content: %s'
,
r
.
text
)
root
=
ET
.
fromstring
(
r
.
text
)
output
=
{
'd
bxref
'
:
entry
[
'dbxref'
]}
output
=
{
'
i
d'
:
entry
[
'dbxref'
]}
for
child
in
root
.
findall
(
'pfam:entry'
,
ns
):
if
args
.
basic
:
output
.
update
(
read_basic
(
child
))
if
args
.
annotation
:
output
.
update
(
read_annotation
(
child
))
tree
=
str
(
ET
.
tostring
(
root
))
if
'<error>'
in
tree
:
output
[
'message'
]
=
tree
[
tree
.
find
(
'<error>'
)
+
7
:
tree
.
rfind
(
'</error>'
)]
else
:
for
child
in
root
.
findall
(
'pfam:entry'
,
ns
):
if
basic
:
output
.
update
(
read_basic
(
child
))
if
annotation
: