Commit 26edf18a authored by Lukas Jelonek

Merge branch 'develop'

parents 011d0be3 7b72d8c6
# This file is a template, and might need editing before it works on your project.
# Official language image. Look for the different tagged releases at:
# https://hub.docker.com/r/library/python/tags/
image: python:3-alpine

# Change pip's cache directory to be inside the project directory since we can
# only cache local items.
variables:
  PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"

# Pip's cache doesn't store the python packages
# https://pip.pypa.io/en/stable/reference/pip_install/#caching
#
# If you want to also cache the installed packages, you have to install
# them in a virtualenv and cache it as well.
cache:
  paths:
    - .cache/pip
    - venv/

before_script:
  - python -V  # Print out python version for debugging
  - pip install virtualenv
  - virtualenv venv
  - source venv/bin/activate
  - pip install -r requirements.txt
  - apk add git

stages:
  - test
  - bundle
  - deploy

test:
  stage: test
  script:
    - python setup.py test

run:
  stage: bundle
  script:
    - python setup.py bdist_wheel
    # an alternative approach is to install and run:
    - pip install dist/*
    # run the command here
  artifacts:
    paths:
      - dist/*.whl

pages:
  stage: deploy
  script:
    - pip install sphinx sphinx-rtd-theme
    - python setup.py build_sphinx
    - mv build/sphinx/html/ public
  artifacts:
    paths:
      - public
  only:
    - master
@@ -26,6 +26,7 @@ def main():
     retrieve_parser = subparsers.add_parser('retrieve')
     retrieve_parser.set_defaults(func=retrieve)
     retrieve_parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
+    retrieve_parser.add_argument('--ignore_cache', '-C', action='store_true', default=False, help="Ignore entries from the cache. Fetched entries are still stored in the cache.")
     retrieve_parser.add_argument('--verbose', '-v', action='store_true', default=False, help="Show debug output")
     args = parser.parse_args()
@@ -53,7 +54,14 @@ def resolve(args, config):

 def retrieve(args, config):
     from dbxref import retriever
-    print(json.dumps(retriever.retrieve(resolver.convert_to_dbxrefs(args.dbxrefs))))
+    print(
+        json.dumps(
+            retriever.retrieve(
+                resolver.convert_to_dbxrefs(args.dbxrefs),
+                ignore_cache=args.ignore_cache
+            )
+        )
+    )

 if __name__ == "__main__":
     main()
@@ -7,6 +7,18 @@
   retriever:
     type: 'internal'
     location: 'dbxref.retrieve.enzyme'
+- name: HAMAP
+  prefixes: ["HAMAP"]
+  resources:
+    html: ["https://hamap.expasy.org/signature/%i"]
+    text: ["https://hamap.expasy.org/signature/%i.txt"]
+  check_existence: "https://hamap.expasy.org/signature/%i.txt"
+- name: OMA - Orthologous MAtrix
+  prefixes: ["OMA"]
+  resources:
+    html: ["https://omabrowser.org/oma/omagroup/%i/"]
+    json: ["https://omabrowser.org/api/group/%i/"]
+  check_existence: "https://omabrowser.org/api/group/%i/"
 - name: Gene Identifier
   prefixes: ["GI"]
   resources:
@@ -14,6 +26,10 @@
     xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
   # does not work
   # check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
+- name: Conserved domain database
+  prefixes: ["CDD"]
+  resources:
+    html: ["https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=%i"]
 - name: Uniprot
   prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
   resources:
@@ -89,15 +105,29 @@
 - name: Gene Ontology
   prefixes: ["GO"]
   resources:
-    html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
-    xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
+    html: ["https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
+    xml: ["https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
+    json: ["https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO:%i/complete"]
   check_existence: "http://purl.obolibrary.org/obo/GO_%i"
   retriever:
     type: 'internal'
     location: 'dbxref.retrieve.gene_ontology'
+- name: KEGG
+  prefixes: ["KO", "KEGG"]
+  resources:
+    html: ["https://www.genome.jp/dbget-bin/www_bget?%i"]
+    text: ["http://rest.kegg.jp/get/%i"]
+  check_existence: "http://rest.kegg.jp/get/%i"
+  # not implemented yet
+  # retriever:
+  #   type: 'internal'
+  #   location: 'dbxref.retrieve.kegg'
 - name: HTTP
   prefixes: ["http", "https"]
   resources:
     html: ["%d:%i"]
   check_existence: "%d:%i"
 - name: EggNOG
   prefixes: ['eggnog']
   resources:
     html: ["http://eggnogdb.embl.de/#/app/results?target_nogs=%i"]
@@ -59,9 +59,9 @@ def compile_url(template, dbxref):

 def check_url_exists(url):
     try:
-        r = sess.head(url, allow_redirects=True, timeout=1)
+        r = sess.head(url, allow_redirects=True, timeout=5)
         r.close()
-        if r.status_code <= 400:
+        if r.status_code < 400:
             return STATUS_EXISTS
         else:
             logger.debug('The server responded with status code: %s', r.status_code)
...
@@ -35,16 +35,23 @@ def retrieve(dbxrefs, basic=True, references=True):
         txt_url = entry['locations']['text'][0]
         logger.debug('URL: %s', txt_url)
         r = requests.get(txt_url)
-        logger.debug('Content: %s', r.text)
-        try:
-            # We expect a plain text document. Check whether the returned
-            # document is an HTML document; if it is, something went wrong
-            # and we assume that it is an error page.
-            ls = r.text.replace('\n', ' ')
-            html = HTML.document_fromstring(ls).head.text_content()
-            # when everything is fine, an exception is thrown by the last
-            # line
-            output = {'id': entry['dbxref']}
-            output['status'] = 'not found'
-            documents.append(output)
-        except:
-            retrieved_entry = parse_flat_file(r.text)
-            retrieved_entry['id'] = entry['dbxref']
-            documents.append(retrieved_entry)
+        retrieved_entry = {}
+        if r.status_code < 400:
+            logger.debug('Content: %s', r.text)
+            retrieved_entry = parse_flat_file(r.text)
+        elif r.status_code == 404:
+            retrieved_entry = {'status': '404 Not found'}
+        else:
+            retrieved_entry = {'status': r.status_code}
+        retrieved_entry['dbxref'] = entry['dbxref']
+        documents.append(retrieved_entry)
     return documents
...
@@ -34,21 +34,24 @@ def retrieve(dbxrefs, basic=True, relations=False):
         if 'messages' in d:
             output['message'] = '; '.join(d['messages'])
         else:
-            if basic:
-                output.update(read_basic(d))
-            if relations:
-                output.update(read_relations(d))
+            if len(d['results']) > 0:
+                if basic:
+                    output.update(read_basic(d))
+                if relations:
+                    output.update(read_relations(d))
+            else:
+                output['message'] = "no results found, probably invalid ID"
         documents.append(output)
     return documents

 def read_basic(d):
     out = {'definition': d['results'][0]['definition']['text'], 'synonyms': []}
     out['name'] = d['results'][0]['name']
     if 'aspect' in d['results'][0]:
         out['aspect'] = d['results'][0]['aspect']
     if 'synonyms' in d['results'][0]:
         out['synonyms'] = d['results'][0]['synonyms']
     return (out)

 def read_relations(d):
     out = {'relations': {'children': [], 'parents': []}}
...
@@ -2,6 +2,7 @@
 import dbxref.resolver
 import requests
 import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import ParseError
 import logging
 import json
 import argparse
@@ -35,11 +36,12 @@ def retrieve(dbxrefs, basic=True, annotation=True):
         logger.debug('URL: %s', xml_url)
         r = requests.get(xml_url)
         logger.debug('Content: %s', r.text)
-        root = ET.fromstring(r.text)
         output = {'id': entry['dbxref']}
         try:
+            root = ET.fromstring(r.text)
             tree = str(ET.tostring(root))
             if '<error>' in tree:
                 output['message'] = tree[tree.find('<error>')+7:tree.rfind('</error>')]
@@ -52,7 +54,7 @@ def retrieve(dbxrefs, basic=True, annotation=True):
         except (KeyError, AttributeError) as e:
             logger.warn('Error in retrieving %s', str(entry))
             raise
-        except RuntimeError as e:
+        except (ParseError, RuntimeError) as e:
             output['message'] = 'an error occurred'
             try:
                 html = HTML.document_fromstring(r.text.replace('\n', ' '))
...
@@ -180,6 +180,15 @@ def read_features(entry):
                 feature['end'] = end.attrib['position']
             else:
                 feature['end'] = end.attrib['status']
+            if feature['begin'] == 'unknown':
+                feature['begin'] = None
+            else:
+                feature['begin'] = int(feature['begin'])
+            if feature['end'] == 'unknown':
+                feature['end'] = None
+            else:
+                feature['end'] = int(feature['end'])
             features.append(feature)
     return features
...
 import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 from dbxref import config
 from itertools import groupby
+from diskcache import Cache
+from appdirs import user_cache_dir
+import json

-def retrieve(dbxrefs):
-    sorted(dbxrefs, key=lambda x: x['db'].lower()) # normalize db to lowercase to allow differently cased notations
+def retrieve(dbxrefs, ignore_cache=False):
+    cache = init_cache()
+    # normalize db notation
+    normalize_db_notation(dbxrefs)
+    dbxrefs = sorted(dbxrefs, key=lambda x: x['db'])
+    # lookup entries from the cache
+    uncached = []
+    cached = []
+    if ignore_cache:
+        uncached = dbxrefs
+    else:
+        (cached, uncached) = find_cached_entries(cache, dbxrefs)
+    # load the uncached entries and store them in the cache
+    loaded_uncached = load_uncached_entries(uncached)
+    cache_entries(cache, loaded_uncached)
+    # compile results
     results = []
-    for key, dbxrefs in groupby(dbxrefs, lambda x: x['db']):
-        if config.has_provider(key):
-            provider = config.get_provider(key)
-            logger.debug('{0} is supported'.format(key))
-            if 'retriever' in provider:
-                if provider['retriever']['type'] == 'external':
-                    results.extend(load_with_external_provider(provider, list(dbxrefs)))
-                elif provider['retriever']['type'] == 'internal':
-                    results.extend(load_with_internal_provider(provider, list(dbxrefs)))
-                else:
-                    raise Exception('Unknown retriever type', provider['retriever']['type'])
-            else:
-                logger.debug('Retrieval of {0} is not supported'.format(key))
-                results.extend(map(lambda x: {'id': toString(x), 'status': 'retrieval not supported'}, dbxrefs))
-        else:
-            logger.debug('Retrieval of {0} is not supported'.format(key))
-            results.extend(map(lambda x: {'id': toString(x), 'status': 'retrieval not supported'}, dbxrefs))
-    return (results)
+    results.extend(cached)
+    results.extend(loaded_uncached)
+    return results
+
+def normalize_db_notation(dbxrefs):
+    # take the first configured prefix that matches the db
+    for dbxref in dbxrefs:
+        key = dbxref['db']
+        if config.has_provider(key):
+            provider = config.get_provider(key)
+            for prefix in provider['prefixes']:
+                if key.lower() == prefix.lower():
+                    dbxref['db'] = prefix
+            logger.debug("'{}' -> '{}'".format(key, dbxref['db']))

 def load_with_external_provider(provider, dbxrefs):
     logger.debug('Loading {0} via external provider'.format(dbxrefs))

@@ -44,3 +62,47 @@ def load_with_internal_provider(provider, dbxrefs):

 def toString(dbxref):
     return '{}:{}'.format(dbxref['db'], dbxref['id'])
+
+def init_cache():
+    cachedir = user_cache_dir('dbxref')
+    cache = Cache(cachedir)
+    return cache
+
+def cache_entries(cache, entries):
+    expiration_time = 86400  # one day
+    for e in entries:
+        logger.debug('Caching {}'.format(e['id']))
+        cache.set(e['id'], e, expire=expiration_time)
+
+def find_cached_entries(cache, dbxrefs):
+    cached = []
+    uncached = []
+    for d in dbxrefs:
+        key = toString(d)
+        if key in cache:
+            logger.debug("Found {} in cache".format(key))
+            cached.append(cache[key])
+        else:
+            uncached.append(d)
+    return (cached, uncached)
+
+def load_uncached_entries(dbxrefs):
+    results = []
+    for key, dbxrefs in groupby(dbxrefs, lambda x: x['db']):
+        if config.has_provider(key):
+            provider = config.get_provider(key)
+            logger.debug('{0} is supported'.format(key))
+            if 'retriever' in provider:
+                if provider['retriever']['type'] == 'external':
+                    results.extend(load_with_external_provider(provider, list(dbxrefs)))
+                elif provider['retriever']['type'] == 'internal':
+                    results.extend(load_with_internal_provider(provider, list(dbxrefs)))
+                else:
+                    raise Exception('Unknown retriever type', provider['retriever']['type'])
+            else:
+                logger.debug('{0} is not supported'.format(key))
+                results.extend(map(lambda x: {'id': toString(x), 'status': 'not supported'}, dbxrefs))
+        else:
+            logger.debug('{0} is not supported'.format(key))
+            results.extend(map(lambda x: {'id': toString(x), 'status': 'not supported'}, dbxrefs))
+    return results
Developer Guide
===============

Document structures
-------------------

Return JSON of dbxref commands
##############################

dbxref returns JSON lists. Each entry in the list is a JSON object with at
least the 'id' property. Additional properties should be the same for similar
databases. For example, most databases contain a free-text description of the
entry, which should be available via the 'description' property. Each database
may also have its own unique fields; these can be named as required.
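
For illustration, a minimal retrieve document could look like this (all values
are placeholders, not real database content)::

    [
      {
        "id": "<db>:<id>",
        "description": "<free text description of the entry>"
      }
    ]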

Resolve command
###############

Returns a list of JSON objects. Each object contains the id and the locations.
Locations is an object that provides the URLs to the different file formats
available for the entry. Valid formats are: 'html', 'xml', 'json' and 'text'.
If you need additional formats, feel free to add them. Each format is a
property on the locations object and contains a list of all available URLs. If
a format is not available for an entry, it should not be present in the
locations object.
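
A sketch of a resolve document, with placeholder URLs::

    [
      {
        "id": "<db>:<id>",
        "locations": {
          "html": ["<url of the html representation>"],
          "text": ["<url of the text representation>"]
        }
      }
    ]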

Handling of non-existent dbxrefs
################################

The proposed format for entries that cannot be found in the databases is as
follows::

    [{"id": "<db>:<id>", "message": "no results found; probably invalid ID"}]

Handling of webservice errors
#############################

Try to be as precise as possible when an error occurs. The default (imprecise)
response should be::

    [{"id": "<db>:<id>", "message": "Could not retrieve the requested entry due to problems."}]

You can include the original response::

    [{"id": "<db>:<id>", "message": "Could not retrieve the requested entry due to problems.", "response": "<original response>"}]
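
As a sketch only (not the project's actual retriever code), a retriever could
build these error documents as follows; the helper names build_error_document
and retrieve_one are hypothetical::

    import requests

    def build_error_document(dbxref_id, response_text=None):
        # Hypothetical helper: wraps a failed lookup in the error format above.
        doc = {'id': dbxref_id,
               'message': 'Could not retrieve the requested entry due to problems.'}
        if response_text is not None:
            # keep the original response to ease debugging
            doc['response'] = response_text
        return doc

    def retrieve_one(url, dbxref_id):
        # Precise message for a 404, generic error document otherwise.
        r = requests.get(url)
        if r.status_code < 400:
            return {'id': dbxref_id, 'description': r.text}
        if r.status_code == 404:
            return {'id': dbxref_id, 'message': 'no results found; probably invalid ID'}
        return build_error_document(dbxref_id, r.text)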
@@ -8,3 +8,11 @@ class TestPfam(unittest.TestCase):
         documents = pfam.retrieve([{'db': 'PFAM', 'id': 'PF00083.23'}])
         # this test failed due to missing None handling,
         # so no assertions here. Once fixed, this should suffice.
+
+    def test_renamed_family(self):
+        '''regression test for missing comment in pfam entry'''
+        documents = pfam.retrieve([{'db': 'PFAM', 'id': 'Tiny_TM_bacill'}])
+        # this test failed due to a redirect when a family was renamed;
+        # unfortunately the redirect was not encoded in http headers, but in
+        # html markup (<meta http-equiv="Refresh" content="5; URL=/family/PF09680" />)
+        # so no assertions here. Once fixed, this should suffice.