Commit 26edf18a authored by Lukas Jelonek

Merge branch 'develop'

parents 011d0be3 7b72d8c6
# This file is a template, and might need editing before it works on your project.
# Official language image. Look for the different tagged releases at:
# https://hub.docker.com/r/library/python/tags/
image: python:3-alpine

# Change pip's cache directory to be inside the project directory since we can
# only cache local items.
variables:
  PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"

# Pip's cache doesn't store the python packages
# https://pip.pypa.io/en/stable/reference/pip_install/#caching
#
# If you want to also cache the installed packages, you have to install
# them in a virtualenv and cache it as well.
cache:
  paths:
    - .cache/pip
    - venv/

before_script:
  - python -V  # Print out python version for debugging
  - pip install virtualenv
  - virtualenv venv
  - source venv/bin/activate
  - pip install -r requirements.txt
  - apk add git

stages:
  - test
  - bundle
  - deploy

test:
  stage: test
  script:
    - python setup.py test

run:
  stage: bundle
  script:
    - python setup.py bdist_wheel
    # an alternative approach is to install and run:
    - pip install dist/*
    # run the command here
  artifacts:
    paths:
      - dist/*.whl

pages:
  stage: deploy
  script:
    - pip install sphinx sphinx-rtd-theme
    - python setup.py build_sphinx
    - mv build/sphinx/html/ public
  artifacts:
    paths:
      - public
  only:
    - master
@@ -26,6 +26,7 @@ def main():
     retrieve_parser = subparsers.add_parser('retrieve')
     retrieve_parser.set_defaults(func=retrieve)
     retrieve_parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
+    retrieve_parser.add_argument('--ignore_cache', '-C', action='store_true', default=False, help="Ignore entries from cache. Fetched entries are still stored in cache.")
     retrieve_parser.add_argument('--verbose', '-v', action='store_true', default=False, help="Show debug output")
     args = parser.parse_args()
@@ -53,7 +54,14 @@ def resolve(args, config):
 def retrieve(args, config):
     from dbxref import retriever
-    print(json.dumps(retriever.retrieve(resolver.convert_to_dbxrefs(args.dbxrefs))))
+    print(
+        json.dumps(
+            retriever.retrieve(
+                resolver.convert_to_dbxrefs(args.dbxrefs),
+                ignore_cache=args.ignore_cache
+            )
+        )
+    )

 if __name__ == "__main__":
     main()
@@ -7,6 +7,18 @@
   retriever:
     type: 'internal'
     location: 'dbxref.retrieve.enzyme'
+- name: HAMAP
+  prefixes: ["HAMAP"]
+  resources:
+    html: ["https://hamap.expasy.org/signature/%i"]
+    text: ["https://hamap.expasy.org/signature/%i.txt"]
+  check_existence: "https://hamap.expasy.org/signature/%i.txt"
+- name: OMA - Orthologous MAtrix
+  prefixes: ["OMA"]
+  resources:
+    html: ["https://omabrowser.org/oma/omagroup/%i/"]
+    json: ["https://omabrowser.org/api/group/%i/"]
+  check_existence: "https://omabrowser.org/api/group/%i/"
 - name: Gene Identifier
   prefixes: ["GI"]
   resources:
@@ -14,6 +26,10 @@
     xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
     # does not work
     # check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
+- name: Conserved domain database
+  prefixes: ["CDD"]
+  resources:
+    html: ["https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=%i"]
 - name: Uniprot
   prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
   resources:
@@ -89,15 +105,29 @@
 - name: Gene Ontology
   prefixes: ["GO"]
   resources:
-    html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
-    xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
+    html: ["https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
+    xml: ["https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
     json: ["https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:%i/complete"]
   check_existence: "http://purl.obolibrary.org/obo/GO_%i"
   retriever:
     type: 'internal'
     location: 'dbxref.retrieve.gene_ontology'
+- name: KEGG
+  prefixes: ["KO", "KEGG"]
+  resources:
+    html: ["https://www.genome.jp/dbget-bin/www_bget?%i"]
+    text: ["http://rest.kegg.jp/get/%i"]
+  check_existence: "http://rest.kegg.jp/get/%i"
+  # not implemented yet
+  # retriever:
+  #   type: 'internal'
+  #   location: 'dbxref.retrieve.kegg'
 - name: HTTP
   prefixes: ["http", "https"]
   resources:
     html: ["%d:%i"]
   check_existence: "%d:%i"
+- name: EggNOG
+  prefixes: ['eggnog']
+  resources:
+    html: ["http://eggnogdb.embl.de/#/app/results?target_nogs=%i"]
@@ -59,9 +59,9 @@ def compile_url(template, dbxref):
 def check_url_exists(url):
     try:
-        r = sess.head(url, allow_redirects=True, timeout=1)
+        r = sess.head(url, allow_redirects=True, timeout=5)
         r.close()
-        if r.status_code <= 400:
+        if r.status_code < 400:
             return STATUS_EXISTS
         else:
             logger.debug('The server responded with status code: %s', r.status_code)
@@ -35,16 +35,23 @@ def retrieve(dbxrefs, basic=True, references=True):
         txt_url = entry['locations']['text'][0]
         logger.debug('URL: %s', txt_url)
         r = requests.get(txt_url)
-        retrieved_entry = {}
-        if r.status_code < 400:
-            retrieved_entry = parse_flat_file(r.text)
-        elif r.status_code == 404:
-            retrieved_entry = {'status' : '404 Not found'}
-        else:
-            retrieved_entry = {'status' : r.status_code}
-        retrieved_entry['dbxref'] = entry['dbxref']
-        documents.append(retrieved_entry)
+        logger.debug('Content: %s', r.text)
+        try:
+            # We expect a plain text document. Check whether the returned
+            # document is an HTML document; if it is, something went wrong
+            # and we assume it is an error page.
+            ls = r.text.replace('\n', ' ')
+            html = HTML.document_fromstring(ls).head.text_content()
+            # When everything is fine, the line above raises an exception,
+            # so reaching this point means we received an error page.
+            output = {'id': entry['dbxref']}
+            output['status'] = 'not found'
+            documents.append(output)
+        except:
+            retrieved_entry = parse_flat_file(r.text)
+            retrieved_entry['id'] = entry['dbxref']
+            documents.append(retrieved_entry)
     return documents
@@ -34,21 +34,24 @@ def retrieve(dbxrefs, basic=True, relations=False):
         if 'messages' in d:
             output['message'] = '; '.join(d['messages'])
         else:
-            if basic:
-                output.update(read_basic(d))
-            if relations:
-                output.update(read_relations(d))
+            if len(d['results']) > 0:
+                if basic:
+                    output.update(read_basic(d))
+                if relations:
+                    output.update(read_relations(d))
+            else:
+                output['message'] = "no results found, probably invalid ID"
         documents.append(output)
     return documents

 def read_basic(d):
     out = {'definition': d['results'][0]['definition']['text'], 'synonyms': []}
     out['name'] = d['results'][0]['name']
     if 'aspect' in d['results'][0]:
         out['aspect'] = d['results'][0]['aspect']
     if 'synonyms' in d['results'][0]:
         out['synonyms'] = d['results'][0]['synonyms']
     return (out)

 def read_relations(d):
     out = {'relations': {'children': [], 'parents': []}}
@@ -2,6 +2,7 @@
 import dbxref.resolver
 import requests
 import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import ParseError
 import logging
 import json
 import argparse
@@ -35,11 +36,12 @@ def retrieve(dbxrefs, basic=True, annotation=True):
         logger.debug('URL: %s', xml_url)
         r = requests.get(xml_url)
         logger.debug('Content: %s', r.text)
-        root = ET.fromstring(r.text)
         output = {'id': entry['dbxref']}
         try:
+            root = ET.fromstring(r.text)
             tree = str(ET.tostring(root))
             if '<error>' in tree:
                 output['message'] = tree[tree.find('<error>')+7:tree.rfind('</error>')]
@@ -52,7 +54,7 @@ def retrieve(dbxrefs, basic=True, annotation=True):
         except (KeyError, AttributeError) as e:
             logger.warn('Error in retrieving %s', str(entry))
             raise
-        except RuntimeError as e:
+        except (ParseError, RuntimeError) as e:
             output['message'] = 'an error occurred'
             try:
                 html = HTML.document_fromstring(r.text.replace('\n', ' '))
@@ -180,6 +180,15 @@ def read_features(entry):
                 feature['end'] = end.attrib['position']
             else:
                 feature['end'] = end.attrib['status']
+            # positions become ints; an 'unknown' status becomes None
+            if feature['begin'] == 'unknown':
+                feature['begin'] = None
+            else:
+                feature['begin'] = int(feature['begin'])
+            if feature['end'] == 'unknown':
+                feature['end'] = None
+            else:
+                feature['end'] = int(feature['end'])
             features.append (feature)
     return features
 import logging
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
 from dbxref import config
 from itertools import groupby
+from diskcache import Cache
+from appdirs import user_cache_dir
 import json

-def retrieve(dbxrefs):
-    sorted(dbxrefs, key=lambda x: x['db'].lower()) # normalize db to lowercase to allow differently cased notations
-    results = []
-    for key, dbxrefs in groupby(dbxrefs, lambda x: x['db']):
-        if config.has_provider(key):
-            provider = config.get_provider(key)
-            logger.debug('{0} is supported'.format(key))
-            if 'retriever' in provider:
-                if provider['retriever']['type'] == 'external':
-                    results.extend( load_with_external_provider(provider, list(dbxrefs)))
-                elif provider['retriever']['type'] == 'internal':
-                    results.extend(load_with_internal_provider(provider, list(dbxrefs)))
-                else:
-                    raise Exception('Unknown retriever type', provider['retriever']['type'])
-            else:
-                logger.debug('Retrieval of {0} is not supported'.format(key))
-                results.extend( map(lambda x: {'id': toString(x), 'status': 'retrieval not supported'}, dbxrefs))
-        else:
-            logger.debug('Retrieval of {0} is not supported'.format(key))
-            results.extend( map(lambda x: {'id': toString(x), 'status': 'retrieval not supported'}, dbxrefs))
-    return (results)
+def retrieve(dbxrefs, ignore_cache=False):
+    cache = init_cache()
+    # normalize db notation
+    normalize_db_notation(dbxrefs)
+    dbxrefs = sorted(dbxrefs, key=lambda x: x['db'])
+    # lookup from cache
+    uncached = []
+    cached = []
+    if ignore_cache:
+        uncached = dbxrefs
+    else:
+        (cached, uncached) = find_cached_entries(cache, dbxrefs)
+    # load uncached
+    loaded_uncached = load_uncached_entries(uncached)
+    cache_entries(cache, loaded_uncached)
+    # compile results
+    results = []
+    results.extend(cached)
+    results.extend(loaded_uncached)
+    return results
+
+def normalize_db_notation(dbxrefs):
+    # take first prefix that matches the db
+    for dbxref in dbxrefs:
+        key = dbxref['db']
+        if config.has_provider(key):
+            provider = config.get_provider(key)
+            for prefix in provider['prefixes']:
+                if key.lower() == prefix.lower():
+                    dbxref['db'] = prefix
+            logger.debug("'{}' -> '{}'".format(key, dbxref['db']))

 def load_with_external_provider(provider, dbxrefs):
     logger.debug('Loading {0} via external provider'.format(dbxrefs))
@@ -44,3 +62,47 @@ def load_with_internal_provider(provider, dbxrefs):
 def toString(dbxref):
     return '{}:{}'.format(dbxref['db'], dbxref['id'])
+
+def init_cache():
+    cachedir = user_cache_dir('dbxref')
+    cache = Cache(cachedir)
+    return cache
+
+def cache_entries(cache, entries):
+    expiration_time = 86400  # one day
+    for e in entries:
+        logger.debug('Caching {}'.format(e['id']))
+        cache.set(e['id'], e, expire=expiration_time)
+
+def find_cached_entries(cache, dbxrefs):
+    cached = []
+    uncached = []
+    for d in dbxrefs:
+        key = toString(d)
+        if key in cache:
+            logger.debug("Found {} in cache".format(key))
+            cached.append(cache[key])
+        else:
+            uncached.append(d)
+    return (cached, uncached)
+
+def load_uncached_entries(dbxrefs):
+    results = []
+    for key, dbxrefs in groupby(dbxrefs, lambda x: x['db']):
+        if config.has_provider(key):
+            provider = config.get_provider(key)
+            logger.debug('{0} is supported'.format(key))
+            if 'retriever' in provider:
+                if provider['retriever']['type'] == 'external':
+                    results.extend(load_with_external_provider(provider, list(dbxrefs)))
+                elif provider['retriever']['type'] == 'internal':
+                    results.extend(load_with_internal_provider(provider, list(dbxrefs)))
+                else:
+                    raise Exception('Unknown retriever type', provider['retriever']['type'])
+            else:
+                logger.debug('{0} is not supported'.format(key))
+                results.extend(map(lambda x: {'id': toString(x), 'status': 'not supported'}, dbxrefs))
+        else:
+            logger.debug('{0} is not supported'.format(key))
+            results.extend(map(lambda x: {'id': toString(x), 'status': 'not supported'}, dbxrefs))
+    return (results)
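
For orientation, here is a minimal sketch (not part of the commit; the example
IDs are made up) of how the reworked cache-aware retriever can be driven
directly from Python. It assumes only what the diffs above show: that
resolver.convert_to_dbxrefs accepts 'DB:ID' strings as the CLI does, and that
retrieve now takes an ignore_cache keyword.

    # Sketch only: exercise the cache-aware retriever with example IDs.
    from dbxref import resolver, retriever

    dbxrefs = resolver.convert_to_dbxrefs(['GO:0005524', 'PFAM:PF00083'])

    # The first call downloads the entries and stores them in the disk cache.
    documents = retriever.retrieve(dbxrefs)

    # ignore_cache=True skips the cache lookup but still stores fetched
    # entries, mirroring the --ignore_cache/-C command line flag.
    documents = retriever.retrieve(dbxrefs, ignore_cache=True)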
Developer Guide
===============
Document structures
-------------------
Return JSON of dbxref commands
##############################
dbxref returns JSON lists. Each entry in the list is a JSON object with at
least an 'id' property. Additional properties should be named consistently
across similar databases. For example, most databases contain a free-text
description of the entry, which should be available via the 'description'
property. Each database may also have its own unique fields; these can be
named as required.
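
As a purely illustrative sketch (the ID and description are invented), such a
list could look like::

    [{"id": "GO:0005524", "description": "a free text description of the entry"}]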
Resolve command
###############
Returns a list of JSON objects. Each object contains the id and locations.
Locations is an object that provides the URLs to the different file formats
available for the entry. Valid formats are 'html', 'xml', 'json' and 'text'.
If you need additional formats, feel free to add them. Each format is a
property on the object and contains a list of all available URLs. If a format
is not available for an entry, it should not be present in the locations
object.
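
For illustration, a resolve result for a single entry might look as follows
(the entry is invented; the URLs follow the provider templates above)::

    [{"id": "GO:0005524",
      "locations": {
        "html": ["https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0005524"],
        "json": ["https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:0005524/complete"]
      }}]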
Handling of non-existent dbxrefs
################################

The proposed format for entries that cannot be found in the databases is::

    [{"id": "<db>:<id>", "message": "no results found; probably invalid ID"}]
Handling of webservice errors
#############################

Try to be as precise as possible when an error occurs. The default (imprecise)
response should be::

    [{"id": "<db>:<id>", "message": "Could not retrieve the requested entry due to problems."}]

You can include the original response::

    [{"id": "<db>:<id>", "message": "Could not retrieve the requested entry due to problems.", "response": "<original response>"}]
@@ -4,3 +4,5 @@ pyyaml
 lockfile
 lxml
 pbr
+diskcache
+appdirs
@@ -8,3 +8,11 @@ class TestPfam(unittest.TestCase):
         documents = pfam.retrieve([{'db': 'PFAM', 'id': 'PF00083.23'}])
         # this test failed with an error due to missing None handling,
         # so no assertions here. Once fixed, this should suffice.
+
+    def test_renamed_family(self):
+        '''regression test for missing comment in pfam entry'''
+        documents = pfam.retrieve([{'db': 'PFAM', 'id': 'Tiny_TM_bacill'}])
+        # this test failed due to a redirect when a family was renamed;
+        # unfortunately the redirect was not encoded in http headers, but in
+        # html markup (<meta http-equiv="Refresh" content="5; URL=/family/PF09680" />)
+        # so no assertions here. Once fixed, this should suffice.