Commit 87c46362 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Introduce internal providers that run in the python process, instead of in a...

Introduce internal providers that run in the python process, instead of in a forked process. Converted the taxonomy provider as a proof of principle.
parent 9b702154
......@@ -32,8 +32,8 @@
xml_ncbi: ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=%i"]
check_existence: "http://www.uniprot.org/taxonomy/%i"
retriever:
type: 'external'
location: 'scripts/retrieve_taxonomy.py'
type: 'internal'
location: 'dbxref.retrieve.taxonomy'
- name: SequenceOntology
prefixes: ["SO", "so"]
resources:
......
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def main():
    """Command-line entry point for this retriever.

    Parses the ``--basic`` / ``--geneticcodes`` flags and the trailing
    dbxrefs, calls ``retrieve`` and prints the resulting documents to
    stdout as a JSON string.
    """
    cli = argparse.ArgumentParser(description='Retrieve taxonomy xml documents for dbxrefs and convert them into json')
    cli.add_argument(
        '--basic', '-b',
        action='store_true',
        help='Include dbxref, scientificName, commonName, lineage and rank',
    )
    cli.add_argument(
        '--geneticcodes', '-g',
        action='store_true',
        help='Include geneticCode and mitochondrialGeneticCode',
    )
    cli.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    options = cli.parse_args()

    # With no section flag given, fall back to emitting every section.
    if not (options.basic or options.geneticcodes):
        options.basic = options.geneticcodes = True

    results = retrieve(
        options.dbxrefs,
        basic=options.basic,
        geneticcodes=options.geneticcodes,
    )
    print(json.dumps(results))
def retrieve(dbxrefs, basic=True, geneticcodes=True):
    """Retrieve the data for the dbxrefs and return a list.

    Each dbxref is resolved (without an existence check), the first JSON
    location of the resolved entry is downloaded and the selected sections
    are copied into one output document per entry.

    Args:
        dbxrefs: the dbxrefs handed to ``dbxref.resolver.resolve``.
        basic: when true, include the fields produced by ``read_basic``.
        geneticcodes: when true, include the fields produced by
            ``read_geneticCodes``.

    Returns:
        A list with one dict per resolved entry. Every dict carries an
        ``'id'`` key; entries whose response is not a non-empty JSON object
        carry a ``'message'`` key instead of data fields.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        json_url = entry['locations']['json'][0]
        logger.debug('URL: %s', json_url)
        r = requests.get(json_url)
        logger.debug('Content: %s', r.text)
        output = {'id': entry['dbxref']}
        try:
            d = json.loads(r.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; an unparsable
            # body is reported through the message below instead of
            # silently swallowing every exception type.
            logger.debug('Response from %s is not valid JSON', json_url)
            d = {}
        # Only a non-empty JSON object carries taxonomy fields; any other
        # payload (empty, list, number, ...) is treated as an error.
        if isinstance(d, dict) and d:
            if basic:
                output.update(read_basic(d))
            if geneticcodes:
                output.update(read_geneticCodes(d))
        else:
            output['message'] = "An error occurred! probably invalid ID"
        documents.append(output)
    return documents
def read_basic(d):
    """Return the basic taxonomy fields that are present in ``d``.

    Copies ``scientificName``, ``commonName``, ``lineage`` and ``rank``
    (in that order) into a new dict, skipping any key missing from ``d``.
    """
    wanted = ('scientificName', 'commonName', 'lineage', 'rank')
    return {key: d[key] for key in wanted if key in d}
def read_geneticCodes(d):
    """Return the genetic-code fields of ``d`` nested under ``'geneticCodes'``.

    The inner dict holds ``geneticCode`` and ``mitochondrialGeneticCode``
    when they are present in ``d``; it is empty when neither is.
    """
    code_keys = ('geneticCode', 'mitochondrialGeneticCode')
    codes = {key: d[key] for key in code_keys if key in d}
    return {'geneticCodes': codes}
# Run the command-line interface only when this file is executed as a
# script, so importing the module does not trigger argument parsing.
if __name__ == '__main__':
    main()
......@@ -17,6 +17,11 @@ def retrieve(dbxrefs, location=''):
if provider['retriever']['type'] == 'external':
retrieved = load_with_external_provider(provider, list(dbxrefs), location)
results.extend(retrieved)
elif provider['retriever']['type'] == 'internal':
import importlib
retrieve_method = getattr(importlib.import_module(provider['retriever']['location']), 'retrieve')
retrieved = retrieve_method(dbxrefs)
results.extend(retrieved)
else:
raise Exception('Unknown retriever type', provider['retriever']['type'])
else:
......@@ -37,6 +42,5 @@ def load_with_external_provider(provider, dbxrefs, location):
result = subprocess.check_output(call, shell=True)
return json.loads(result.decode('utf-8'))
def toString(dbxref):
    """Format a dbxref mapping as ``'<db>:<id>'`` using its 'db' and 'id' entries."""
    database = dbxref['db']
    identifier = dbxref['id']
    return '{}:{}'.format(database, identifier)
#!/usr/bin/env python3
import env
import dbxref.config
import dbxref.resolver
import requests
import logging
import json
import argparse
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def main():
    """Resolve the dbxrefs given on the command line and print their data.

    Parses the ``--basic`` / ``--geneticcodes`` flags and the trailing
    dbxrefs, resolves them (without an existence check), downloads the
    first JSON location of every resolved entry and prints the collected
    documents to stdout as a JSON list. Entries whose response is not a
    non-empty JSON object get a ``'message'`` key instead of data fields.
    """
    parser = argparse.ArgumentParser(description='Retrieve taxonomy xml documents for dbxrefs and convert them into json')
    parser.add_argument('--basic', '-b', action='store_true', help='Include dbxref, scientificName, commonName, lineage and rank')
    parser.add_argument('--geneticcodes', '-g', action='store_true', help='Include geneticCode and mitochondrialGeneticCode')
    parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # With no section flag given, fall back to emitting every section.
    if not args.basic and not args.geneticcodes:
        args.basic = True
        args.geneticcodes = True
    resolved = dbxref.resolver.resolve(args.dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        json_url = entry['locations']['json'][0]
        logger.debug('URL: %s', json_url)
        r = requests.get(json_url)
        logger.debug('Content: %s', r.text)
        output = {'id': entry['dbxref']}
        try:
            d = json.loads(r.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; an unparsable
            # body is reported through the message below instead of
            # silently swallowing every exception type.
            logger.debug('Response from %s is not valid JSON', json_url)
            d = {}
        # Only a non-empty JSON object carries taxonomy fields; any other
        # payload (empty, list, number, ...) is treated as an error.
        if isinstance(d, dict) and d:
            if args.basic:
                output.update(read_basic(d))
            if args.geneticcodes:
                output.update(read_geneticCodes(d))
        else:
            output['message'] = "An error occurred! probably invalid ID"
        documents.append(output)
    print(json.dumps(documents))
def read_basic(d):
    """Collect the basic taxonomy fields found in ``d`` into a new dict.

    The fields considered are ``scientificName``, ``commonName``,
    ``lineage`` and ``rank``; keys absent from ``d`` are left out.
    """
    selected = {}
    for field in ('scientificName', 'commonName', 'lineage', 'rank'):
        if field in d:
            selected[field] = d[field]
    return selected
def read_geneticCodes(d):
    """Wrap the genetic-code fields found in ``d`` under ``'geneticCodes'``.

    ``geneticCode`` and ``mitochondrialGeneticCode`` are copied when
    present; the nested dict stays empty when ``d`` has neither.
    """
    nested = {}
    for field in ('geneticCode', 'mitochondrialGeneticCode'):
        if field in d:
            nested[field] = d[field]
    return {'geneticCodes': nested}
# Script entry: runs unconditionally, so importing this file also executes main().
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment