Commit 941bb534 authored by Lukas Jelonek

Merge branch 'develop' into 'master'

Develop

See merge request SOaAS/dbxref!2
parents 4da4423e 3d7ce949
*.pyc
docs/build/
build/
dist/
*.egg-info/
*.egg
*.py[cod]
__pycache__/
*.so
*~
docs/_build/
[0.1]
- Implement basic structure
- Integrate several databases
The MIT License (MIT)
Copyright (c) 2017 SOaAS
Copyright (c) 2017 Lukas Jelonek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
# DB XREF resolver and retriever tool
A tool that resolves db_xrefs into URLs and that retrieves the data as json documents.
A tool that resolves database cross references (dbxrefs). It can return a list of
locations that a cross reference points to, in several formats such as HTML,
XML, flat file, or JSON. For some of the supported databases it can also
retrieve the data and convert it into JSON.
# Getting started (Setup)
The intended audience for this tool is bioinformaticians who need to collect
data for dbxrefs and postprocess it. Because everything is returned in JSON,
the need for normalization and special parsing of the data is reduced.
# Getting started for development (Setup)
Prerequisites:
......@@ -11,10 +18,31 @@ Prerequisites:
Supported bioinformatic databases:
* None yet
* Ontologies
  * Gene Ontology
Check out the repository:
~~~~
git clone git@git.computational.bio.uni-giessen.de:SOaAS/dbxref.git
~~~~
Set up a virtualenv for development and install the package in editable mode:
~~~~
# install in development environment
virtualenv --python=python3 venv; source venv/bin/activate;
pip install -e .
# run tests
python3 setup.py test
# compile documentation
python3 setup.py build_sphinx
~~~~
Use the application:
~~~~
dbxref resolve GO:0097281
~~~~
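The result is a JSON list with one entry per dbxref, giving the resolved
locations per format and a status. A sketch of the shape (the status string
and the exact URL set are illustrative, based on the GO provider
configuration, not verbatim output):
~~~~
[
  {
    "dbxref": "GO:0097281",
    "status": "not checked",
    "locations": {
      "html": ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0097281"],
      "json": ["https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:0097281/complete"]
    }
  }
]
~~~~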
#!/usr/bin/env python3
import os
import sys
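# make the dbxref package importable when the script is run from a source checkout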
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from dbxref.main import main
main()
def get_install_location():
"""Finds the location directory of the tool"""
import os
script_path = os.path.realpath(__file__)
script_dir = os.path.dirname(script_path)
install_dir = os.path.dirname(script_dir)
return install_dir
def get_providers_path():
import pkg_resources
return pkg_resources.resource_filename('dbxref', 'providers.yaml')
def load_providers():
return _load_providers(get_providers_path())
def _load_providers(path):
import yaml
data = []
with open(get_install_location() + '/providers.yaml') as data_file:
with open(path) as data_file:
data = yaml.load(data_file)
return index_providers(data)
return normalize_index(index_providers(data))
def index_providers(providers):
index = {}
......@@ -19,3 +18,16 @@ def index_providers(providers):
for db in p['prefixes']:
index[db] = p
return index
def normalize_index(index):
'create a new index with lowercase keys'
return {k.lower():v for (k,v) in index.items()}
def has_provider(provider):
return _has_provider(load_providers(), provider)
def _has_provider(providers, provider):
return provider.lower() in providers
def get_provider(provider):
return load_providers()[provider.lower()]
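# Minimal usage sketch (hypothetical caller code); prefixes match
# case-insensitively because the index keys are lowercased:
#
#   from dbxref import config
#   if config.has_provider('go'):
#       resources = config.get_provider('GO')['resources']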
......@@ -2,6 +2,8 @@
import argparse
import os
import logging
from dbxref import resolver
import json
def main():
parser = argparse.ArgumentParser(description='Lookup locations of database cross references and retrieve them as json')
......@@ -38,13 +40,11 @@ def info(args, config):
print ('info')
def resolve(args, config):
from dbxref import resolver
import json
print(json.dumps(resolver.resolve(args.dbxrefs, check_existence=args.no_check)))
print(json.dumps(resolver.resolve(resolver.convert_to_dbxrefs(args.dbxrefs), check_existence=args.no_check)))
def retrieve(args, config):
from dbxref import retriever
retriever.retrieve(args.dbxrefs)
print(json.dumps(retriever.retrieve(resolver.convert_to_dbxrefs(args.dbxrefs))))
if __name__ == "__main__":
main()
- name: Enzyme
prefixes: [EC, Enzyme]
prefixes: ["EC"]
resources:
html: ["http://enzyme.expasy.org/EC/%i"]
check_existence: "http://enzyme.expasy.org/EC/%i.txt"
html: ["https://enzyme.expasy.org/EC/%i"]
text: ["https://enzyme.expasy.org/EC/%i.txt"]
check_existence: "https://enzyme.expasy.org/EC/%i.txt"
retriever:
type: 'internal'
location: 'dbxref.retrieve.enzyme'
- name: Gene Identifier
prefixes: [GI]
prefixes: ["GI"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/protein/GI:%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
......@@ -17,20 +21,30 @@
xml: [ "http://www.uniprot.org/uniprot/%i.xml"]
check_existence: "http://www.uniprot.org/uniprot/%i.xml"
retriever:
type: 'external'
location: 'scripts/retrieve_uniprot.py'
type: 'internal'
location: 'dbxref.retrieve.uniprot'
- name: Taxonomy
prefixes: ["Taxon", "taxon", "taxid"]
prefixes: ["Taxon", "taxid"]
resources:
html: ["http://www.uniprot.org/taxonomy/%i"]
json: ["https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/%i"]
xml: ["http://www.uniprot.org/taxonomy/%i.rdf"]
xml_ncbi: ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=%i"]
check_existence: "http://www.uniprot.org/taxonomy/%i"
retriever:
type: 'internal'
location: 'dbxref.retrieve.taxonomy'
- name: SequenceOntology
prefixes: ["SO"]
resources:
html: ["http://www.sequenceontology.org/browser/current_svn/term/SO:%i"]
obo: ["http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"]
tsv: ["http://www.sequenceontology.org/browser/current_svn/export/term_only/csv_text/SO:%i"]
# does not work
# check_existence: "http://www.sequenceontology.org/browser/current_svn/term/SO:%i"
retriever:
type: 'internal'
location: 'dbxref.retrieve.sequence_ontology'
- name: RFAM
prefixes: ["RFAM"]
resources:
......@@ -39,7 +53,7 @@
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
- name: Pubmed
prefixes: ["pubmed", "Pubmed"]
prefixes: ["Pubmed"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/pubmed/%i"]
check_existence: "http://www.ncbi.nlm.nih.gov/pubmed/%i"
......@@ -50,6 +64,9 @@
xml: ["http://pfam.xfam.org/family/%i?output=xml"]
# does not work
# check_existence: "http://pfam.xfam.org/family/%i?content-type=text%2Fxml"
retriever:
type: 'internal'
location: 'dbxref.retrieve.pfam'
- name: PDB
prefixes: ["PDB"]
resources:
......@@ -74,7 +91,11 @@
resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
json: ["https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:%i/complete"]
check_existence: "http://purl.obolibrary.org/obo/GO_%i"
retriever:
type: 'internal'
location: 'dbxref.retrieve.gene_ontology'
- name: HTTP
prefixes: ["http", "https"]
resources:
......
......@@ -4,8 +4,7 @@ from cachecontrol.caches.file_cache import FileCache
import logging
logger = logging.getLogger(__name__)
from dbxref.config import load_providers
providers = load_providers()
from dbxref import config
cache = FileCache(".web_cache", forever=True)
sess = CacheControl(requests.Session(), cache=cache)
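# HTTP responses are cached on disk in '.web_cache' (forever=True keeps
# entries until the file is deleted), so repeated resolves of the same
# dbxref do not re-query the remote services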
......@@ -18,15 +17,14 @@ STATUS_CHECK_NOT_SUPPORTED='check of status not supported'
STATUS_CHECK_TIMEOUT='status check timed out'
STATUS_UNSUPPORTED_DB='database unsupported'
def resolve(strings, check_existence=True):
def resolve(dbxrefs, check_existence=True):
results = []
for s in strings:
for dbxref in dbxrefs:
status = STATUS_NOT_CHECKED
if check_existence:
status = check_dbxref_exists(s)
dbxref = convert_string_to_dbxref(s)
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
status = check_dbxref_exists(dbxref)
if config.has_provider(dbxref['db']):
provider = config.get_provider(dbxref['db'])
locations = {}
for _type in provider['resources']:
urls = []
......@@ -38,10 +36,13 @@ def resolve(strings, check_existence=True):
results.append({'dbxref': dbxref['db'] + ':' + dbxref['id'], 'status': STATUS_UNSUPPORTED_DB})
return results
def check_dbxref_exists(string):
dbxref = convert_string_to_dbxref(string)
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
def convert_to_dbxrefs(strings):
'''convert a list of strings to dbxref maps with db and id attribute'''
return list(map(convert_string_to_dbxref, strings))
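# e.g. convert_to_dbxrefs(['GO:0097281']) -> [{'db': 'GO', 'id': '0097281'}]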
def check_dbxref_exists(dbxref):
if config.has_provider(dbxref['db']):
provider = config.get_provider(dbxref['db'])
urls = []
exists = STATUS_NOT_CHECKED
if 'check_existence' in provider:
......@@ -51,7 +52,7 @@ def check_dbxref_exists(string):
return exists
else:
return STATUS_CHECK_NOT_SUPPORTED
return STATUS_UNSUPPORTED_DB
return STATUS_UNSUPPORTED_DB
def compile_url(template, dbxref):
return template.replace('%i', dbxref['id']).replace('%d', dbxref['db'])
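# e.g. compile_url('https://enzyme.expasy.org/EC/%i', {'db': 'EC', 'id': '1.1.1.1'})
# -> 'https://enzyme.expasy.org/EC/1.1.1.1'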
......
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
import re
import lxml.html as HTML
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(description='Retrieve enzyme text documents for dbxrefs and convert them into json')
parser.add_argument('--basic', '-b', action='store_true', help='Include id, definition, name and synonyms')
parser.add_argument('--references', '-r', action='store_true', help='Include id, uniprot dbxrefs')
parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
args = parser.parse_args()
# Enable all options by default if they are not set
if not args.basic and not args.references:
args.basic = True
args.references = True
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
documents = retrieve(dbxrefs, basic=args.basic, references=args.references)
print(json.dumps(documents))
def retrieve(dbxrefs, basic=True, references=True):
"""Retrieve the data for the dbxrefs and return a list"""
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
txt_url = entry['locations']['text'][0]
logger.debug('URL: %s', txt_url)
r = requests.get(txt_url)
logger.debug('Content: %s', r.text)
try:
# We expect a plain text document. Check whether the server
# returned an HTML document instead; if it did, something went
# wrong and we assume it is an error page.
ls = r.text.replace('\n', ' ')
html = HTML.document_fromstring(ls).head.text_content()
# When everything is fine, document_fromstring raises an
# exception on the plain text and we land in the except branch.
output = {'dbxref': entry['dbxref']}
output['message'] = html
if output['message'] == ' 500 Internal Server Error ':
output['message'] = '500 Internal Server Error; probably invalid ID'
documents.append(output)
except:
retrieved_entry = parse_flat_file(r.text)
retrieved_entry['dbxref'] = entry['dbxref']
documents.append(retrieved_entry)
return documents
def parse_flat_file(text):
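# The ENZYME flat file uses two-letter line codes, parsed below:
#   DE  accepted name         AN  alternative name(s)
#   CA  catalyzed reaction    CF  cofactors
#   CC  comments              PR  PROSITE cross-references
#   DR  UniProtKB/Swiss-Prot cross-references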
lines = text.split('\n')
comment = ""
reaction = ""
output = {}
refs = []
for line in lines:
line_elements = line.strip().split(' ')
if line_elements[0] == 'DE':
output['name'] = line_elements[1]
if line_elements[0] == 'AN':
if 'alternative_names' in output:
output['alternative_names'].append(line_elements[1])
else:
output['alternative_names'] = [line_elements[1]]
if line_elements[0] == 'CA':
if re.match(r'^\(\d+\) ', line_elements[1]):
if len(reaction) == 0:
reaction += line_elements[1][line_elements[1].find(' ')+1:]
else:
if 'reaction_catalyzed' in output:
output['reaction_catalyzed'].append(reaction)
else:
output['reaction_catalyzed'] = [reaction]
reaction = line_elements[1][line_elements[1].find(' ')+1:]
else:
if len(reaction) == 0:
reaction = line_elements[1]
else:
reaction = reaction + " " + line_elements[1]
if line_elements[0] == 'CF':
if 'cofactors' in output:
output['cofactors'].append(line_elements[1])
else:
output['cofactors'] = [line_elements[1]]
if line_elements[0] == 'CC':
if "-!-" in line_elements[1]:
if len(comment) == 0:
comment += line_elements[1][4:]
else:
if 'comments' in output:
output['comments'].append(comment)
else:
output['comments'] = [comment]
comment = line_elements[1][4:]
else:
comment += line_elements[2]
if line_elements[0] == 'PR':
link = line_elements[1].replace(';', '').split()
if 'prosite' in output:
output['prosite'].append(link[1])
else:
output['prosite'] = [link[1]]
if line_elements[0] == 'DR':
for i in range(1, len(line_elements)):
for e in line_elements[i].split('; '):
if len(e) > 1:
l = e.split(', ')
l[1] = l[1].replace(' ', '')
l[1] = l[1].replace(';', '')
refs.append('UniProtKB/Swiss-Prot:' + l[0])
output['dbxrefs'] = refs
if len(reaction) > 0:
if 'reaction_catalyzed' in output:
output['reaction_catalyzed'].append(reaction)
else:
output['reaction_catalyzed'] = [reaction]
if len(comment) > 0:
if 'comments' in output:
output['comments'].append(comment)
else:
output['comments'] = [comment]
return output
def read_basic(d):
out = {}
definition = {}
if 'message' in d:
out['message'] = d['message']
if 'name' in d:
out['name'] = d['name']
if 'alternative_names' in d:
out['synonyms'] = d.pop('alternative_names')
if 'reaction_catalyzed' in d:
definition['reaction_catalyzed'] = d['reaction_catalyzed']
if 'cofactors' in d:
definition['cofactors'] = d['cofactors']
if 'comments' in d:
definition['comments'] = d['comments']
if len(definition) == 1:
# unwrap the single value (definition is a dict, so it has no index 0)
out['definition'] = next(iter(definition.values()))
elif len(definition) > 1:
out['definition'] = definition
return (out)
def format_output(d, basic, references):
out = {'id': d['dbxref']}
if basic:
out.update(read_basic(d))
if references:
out['dbxrefs'] = d['dbxrefs']
if not basic and not references:
out.update(read_basic(d))
if 'dbxrefs' in d:
out['dbxrefs'] = d['dbxrefs']
return (out)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(description='Retrieve gene ontology documents for dbxrefs and convert them into json')
parser.add_argument('--basic', '-b', action='store_true', help='Include id, definition, name and synonyms')
parser.add_argument('--relations', '-r', action='store_true', help='Include id, parents and children')
parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
args = parser.parse_args()
if not args.basic and not args.relations:
args.basic = True
args.relations = False
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
documents = retrieve(dbxrefs, basic=args.basic, relations=args.relations)
print(json.dumps(documents))
def retrieve(dbxrefs, basic=True, relations=False):
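"""Retrieve the QuickGO JSON documents for the dbxrefs and return them as a list"""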
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
json_url = entry['locations']['json'][0]
logger.debug('URL: %s', json_url)
r = requests.get(json_url)
logger.debug('Content: %s', r.text)
d = json.loads(r.text)
output = {'id': entry['dbxref']}
if 'messages' in d:
output['message'] = '; '.join(d['messages'])
else:
if basic:
output.update(read_basic(d))
if relations:
output.update(read_relations(d))
documents.append(output)
return documents
def read_basic(d):
out = {'definition': d['results'][0]['definition']['text'], 'synonyms': []}
out['name'] = d['results'][0]['name']
if 'synonyms' in d['results'][0]:
out['synonyms'] = d['results'][0]['synonyms']
return (out)
def read_relations(d):
out = {'relations': {'children': [], 'parents': []}}
if 'children' in d['results'][0]:
out['relations']['children'] = d['results'][0]['children']
for child in out['relations']['children']:
child['type'] = child.pop('relation')
if 'history' in d['results'][0]:
out['relations']['parents'] = parse_history(d['results'][0]['history'])
return (out)
def parse_history(h):
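# Rebuild the current parent relations from the term's edit history:
# 'Added'/'Updated' RELATION entries are collected, and a later 'Deleted'
# entry removes the matching one, so only relations still in effect remain.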
out = []
for history in reversed(h):
if history['category'] == "RELATION":
if history['action'] == "Updated" or history['action'] == "Added":
out.append(history)
if history['action'] == "Deleted":
for i in reversed(range(len(out))):
if out[i]['text'] == history['text']:
del out[i]
break
for i in range(len(out)):
out[i] = parse_text(out[i]['text'])
return (out)
def parse_text(t):
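# e.g. 'is a GO:0016020' -> {'type': 'is_a', 'id': 'GO:0016020'};
# the words before the GO id are joined with underscores to form the type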
words = t.split(' ')
type = ''
out = {}
for word in words:
if 'GO:' in word:
out['id'] = word
break
else:
if type == '':
type = word
else:
type += "_" + word
out['type'] = type
return (out)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
import env
import dbxref.config
import dbxref.resolver
import requests
import xml.etree.ElementTree as ET
......@@ -12,7 +10,7 @@ import argparse
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
ns = {'pfam': 'http://pfam.xfam.org/'}
ns = {'pfam': 'https://pfam.xfam.org/'}
def main():
parser = argparse.ArgumentParser(description='Retrieve pfam xml documents for dbxrefs and convert them into json')
......@@ -20,37 +18,46 @@ def main():
parser.add_argument('--annotation', '-a', action='store_true', help='Include annotation')
parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
args = parser.parse_args()
if not (args.basic or args.annotation):
args.basic = True
args.annotation = True
resolved = dbxref.resolver.resolve(args.dbxrefs, check_existence=False)
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
documents = retrieve(dbxrefs, basic=args.basic, annotation=args.annotation)
print(json.dumps(documents))
def retrieve(dbxrefs, basic=True, annotation=True):
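"""Retrieve the Pfam XML documents for the dbxrefs and return them as a list"""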
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
if 'xml' in entry['locations']:
xml_url = entry['locations']['xml'][0]
logger.debug('URL: %s', xml_url)
r = requests.get(xml_url)
logger.debug('Content: %s', r.text)
root = ET.fromstring(r.text)
output = {'dbxref': entry['dbxref']}
output = {'id': entry['dbxref']}
for child in root.findall('pfam:entry', ns):
if basic:
output.update(read_basic(child))
if annotation:
output.update(read_annotation(child))
tree = ET.tostring(root, encoding='unicode')
if '<error>' in tree:
output['message'] = tree[tree.find('<error>')+7:tree.rfind('</error>')]