Commit 018cfd05 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Merge remote-tracking branch 'origin/develop' into features/gi

parents 1582b355 683bf12b
# This file is a template, and might need editing before it works on your project.
# Official language image. Look for the different tagged releases at:
# https://hub.docker.com/r/library/python/tags/
image: python:3-alpine
image: python:3-stretch
# Change pip's cache directory to be inside the project directory since we can
# only cache local items.
......@@ -23,8 +23,8 @@ before_script:
- pip install virtualenv
- virtualenv venv
- source venv/bin/activate
- apt-get install git
- pip install -r requirements.txt
- apk add git
stages:
- test
......
......@@ -65,8 +65,11 @@
resources:
html: ["http://rfam.xfam.org/family/%i"]
xml: ["http://rfam.xfam.org/family/%i?content-type=text%2Fxml"]
# does not work
# check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
json: ["https://rfam.org/family/%i?content-type=application/json"]
check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
retriever:
type: 'internal'
location: 'dbxref.retrieve.rfam'
- name: Pubmed
prefixes: ["Pubmed"]
resources:
......@@ -91,7 +94,11 @@
- name: InterPro
prefixes: ["InterPro"]
resources:
html: ["http://www.ebi.ac.uk/interpro/entry/%i"]
html: ["https://www.ebi.ac.uk/interpro/entry/%i"]
json: ["https://www.ebi.ac.uk/interpro/api/entry/InterPro/%i"]
retriever:
type: 'internal'
location: 'dbxref.retrieve.interpro'
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
- name: GeneID
......@@ -117,10 +124,9 @@
html: ["https://www.genome.jp/dbget-bin/www_bget?%i"]
text: ["http://rest.kegg.jp/get/%i"]
check_existence: "http://rest.kegg.jsdjkaap/get/%i"
# not implemented yet
# retriever:
# type: 'internal'
# location: 'dbxref.retrieve.kegg'
retriever:
type: 'internal'
location: 'dbxref.retrieve.kegg'
- name: HTTP
prefixes: ["http", "https"]
resources:
......
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
logger = logging.getLogger(__name__)
def main():
    """Command-line entry point: retrieve InterPro entries and print them as JSON.

    Every positional argument is treated as a dbxref. Each option switches on
    one section of the output; when no option is given, all sections are
    included.
    """
    # InterPro metadata fields that are AVAILABLE but not exposed yet:
    #   'go_terms', 'member_databases', 'integrated', 'entry_annotations'
    #
    # Fields already USED by the options below:
    #   basics: 'accession', 'type', 'description', 'counters', 'entry_id', 'source_database', 'name'
    #   hierarchy
    #   wikipedia
    #   literature
    #   cross_references
    #   overlaps_with
    parser = argparse.ArgumentParser(description="Retrieve InterPro documents and convert them into json")
    parser.add_argument("--basics", "-b", action="store_true", help="Include basic information such as accession, "
                                                                    "type, name, description, counters, entry_id and "
                                                                    "source_database")
    parser.add_argument("--hierarchy", "-hi", action="store_true", help="")
    parser.add_argument("--wikipedia", "-w", action="store_true", help="")
    parser.add_argument("--literature", "-l", action="store_true", help="")
    parser.add_argument("--cross_references", "-cr", action="store_true", help="")
    parser.add_argument("--overlaps", "-o", action="store_true", help="")
    parser.add_argument("dbxrefs", nargs=argparse.REMAINDER)
    args = parser.parse_args()

    sections = ("basics", "hierarchy", "wikipedia", "literature", "cross_references", "overlaps")
    # A store_true flag is False (never None) when it is absent, so "nothing
    # specified" has to be detected as "no flag is set". Only in that case
    # output all available information for the entry.
    if not any(getattr(args, section) for section in sections):
        for section in sections:
            setattr(args, section, True)

    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
    documents = retrieve(dbxrefs, basics=args.basics, hierarchy=args.hierarchy, wikipedia=args.wikipedia,
                         literature=args.literature, cross_references=args.cross_references, overlaps=args.overlaps)
    print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, hierarchy=True, wikipedia=True, literature=True, cross_references=True, overlaps=True):
    """Retrieve InterPro entries from the REST api and reduce them to the selected sections.

    Args:
        dbxrefs: dbxrefs to look up; they are passed to ``dbxref.resolver.resolve``.
        basics: include accession, entry_type, description, counters, entry_id,
            name and source_database.
        hierarchy: include the ``hierarchy`` metadata.
        wikipedia: include the ``wikipedia`` metadata.
        literature: include the ``literature`` metadata.
        cross_references: include the ``cross_references`` metadata.
        overlaps: include the ``overlaps_with`` metadata under the key ``overlaps``.

    Returns:
        A list with one dict per resolved entry. Each dict always holds ``id``
        plus whatever requested information the entry provides; missing
        information is reported with a warning and left out.
    """
    # (output key, key below "metadata") for everything covered by `basics`.
    basic_fields = (
        ("accession", "accession"),
        ("entry_type", "type"),
        ("description", "description"),
        ("counters", "counters"),
        ("entry_id", "entry_id"),
        ("name", "name"),
        ("source_database", "source_database"),
    )
    # (requested?, output key, key below "metadata", warning if the key is missing)
    optional_sections = (
        (hierarchy, "hierarchy", "hierarchy",
         "Hierarchy information was not available for the given entry."),
        (wikipedia, "wikipedia", "wikipedia",
         "Wikipedia articel were not available for the given entry."),
        (literature, "literature", "literature",
         "Literature was not available for the given entry."),
        (cross_references, "cross_references", "cross_references",
         "Cross_references were not available for the given entry."),
        (overlaps, "overlaps", "overlaps_with",
         "Overlap information was not available for the given entry."),
    )

    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        # Construct URL for retrieve
        json_url = entry['locations']['json'][0]
        logger.debug('URL: %s', json_url)
        r = requests.get(json_url)
        logger.debug('Content: %s', r.text)
        ipro = json.loads(r.text)

        # A document without "metadata" is handled like one where every
        # requested field is missing: warn per section, keep going.
        try:
            metadata = ipro["metadata"]
        except KeyError:
            metadata = {}

        # Parse retrieved json file by selected Options
        output = {"id": entry["dbxref"]}
        if basics:
            # Copy the fields one by one so that a single missing field does
            # not discard the basic information that is available.
            incomplete = False
            for out_key, meta_key in basic_fields:
                try:
                    output[out_key] = metadata[meta_key]
                except KeyError:
                    incomplete = True
            if incomplete:
                logger.warning("One or more basic information were not available for the given entry. Please check your output.")
        for requested, out_key, meta_key, warning in optional_sections:
            if not requested:
                continue
            try:
                output[out_key] = metadata[meta_key]
            except KeyError:
                logger.warning(warning)
        documents.append(output)
    return documents
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
This diff is collapsed.
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
import xml.etree.ElementTree as ET
logger = logging.getLogger(__name__)
def main():
    """Command-line entry point: retrieve Rfam entries and print them as JSON.

    Every positional argument is treated as a dbxref. ``--basics`` and
    ``--references`` each switch on one section of the output; when neither is
    given, both sections are included.
    """
    parser = argparse.ArgumentParser(description="Retrieve Rfam json documents and parse them into dbxref json format")
    parser.add_argument("--basics", "-b", action="store_true", help="Include basic informations such as dbxref_id, "
                                                                    "name, description and comment.")
    parser.add_argument("--references", "-r", action="store_true", help="Include reference information.")
    parser.add_argument("dbxref", nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # A store_true flag is False (never None) when it is absent, so "nothing
    # specified" has to be detected as "no flag is set". Only in that case
    # fall back to the complete output.
    if not (args.basics or args.references):
        args.basics = True
        args.references = True

    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxref)
    documents = retrieve(dbxrefs, basics=args.basics, references=args.references)
    print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, references=True):
    """Retrieve Rfam json documents and parse them into dbxref json format.

    Args:
        dbxrefs: dbxrefs to look up; they are passed to ``dbxref.resolver.resolve``.
        basics: include ``dbxref``, ``name``, ``description`` and ``comment``.
        references: include a ``references`` dict with ``author``, ``DOI`` and
            ``type`` taken from the entry's ``curation`` record.

    Returns:
        A list with one dict per resolved entry; each dict always holds ``id``.

    Raises:
        KeyError: if a requested section is missing from the retrieved
            document. The problem is logged before the error is re-raised.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        # Construct URL for retrieval
        json_url = entry["locations"]["json"][0]
        logger.debug("URL: %s", json_url)
        r = requests.get(json_url)
        logger.debug("Content: %s", r.text)
        rfam = json.loads(r.text)
        output = {"id": entry["dbxref"]}
        # Parse basic information
        if basics:
            try:
                family = rfam["rfam"]
                output.update({"dbxref": "RFAM:" + family["acc"],
                               "name": family["id"],
                               "description": family["description"],
                               "comment": family["comment"]
                               })
            except KeyError:
                # Report through the module logger: stdout is where the
                # command-line interface writes the JSON result.
                logger.error("Basic information weren't fully or only partly available. "
                             "Please check the dbxref and the Rfam-site.")
                raise
        # Parse reference information
        if references:
            try:
                curation = rfam["rfam"]["curation"]
                # The "DOI" key is filled from the curation's structure_source field.
                output.update({"references": {"author": curation["author"],
                                              "DOI": curation["structure_source"],
                                              "type": curation["type"]
                                              }
                               })
            except KeyError:
                logger.error("References weren't fully or only partly available. "
                             "Please check the dbxref and the Rfam-site")
                raise
        documents.append(output)
    return documents
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
......@@ -16,6 +16,9 @@ Welcome to DBXRef resolve and retrieval tool's documentation!
sequence_ontology
taxonomy
uniprot
kegg
interpro
rfam
......
This diff is collapsed.
This diff is collapsed.
.. highlight:: yaml
Rfam
====
Retrieve Rfam JSON documents for dbxrefs and convert them into JSON in the dbxref format.
Options
-------
* ``--basics`` - Include basic information such as dbxref_id, name, description and comment.
* ``--references`` - Include reference information.
Input
-----
example: ``RFAM:RF03094``
Output
------
output scheme::
[
{
"comment": "Halobacteria (Archaea). Nearby to self-splicing intron genes",
"dbxref": "RFAM:RF03094",
"description": "LAGLIDADG-2 RNA",
"name": "LAGLIDADG-2",
"references": {
"DOI": "Published; PMID:28977401;",
"author": "Weinberg Z",
"type": "Gene; sRNA;"
}
}
]
import unittest
from dbxref.retrieve import interpro
class TestIPro(unittest.TestCase):
    # Smoke test: the InterPro retriever returns something for a known entry
    # when every output section is requested.
    def test_output(self):
        query = [{'db': 'InterPro', 'id': 'IPR000003'}]
        all_sections = dict(basics=True, hierarchy=True, wikipedia=True,
                            literature=True, cross_references=True, overlaps=True)
        result = interpro.retrieve(query, **all_sections)
        self.assertTrue(result)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
import unittest
from dbxref.retrieve import kegg
class TestKegg(unittest.TestCase):
    # Tests for the KEGG retriever: one smoke test of kegg.retrieve and one
    # test of kegg.read_brite, which is expected to turn BRITE lines into a
    # dict with a "vertices" list and an "edges" adjacency list keyed by the
    # vertex index (as a string).
    #
    # NOTE(review): examples 1, 2 and 4 below show identical input lines but
    # expect different graphs. Presumably the amount of leading whitespace in
    # each BRITE line encodes the tree depth and was collapsed to a single
    # space in this view - verify the literals against the repository file.

    def test_output(self):
        """Test if kegg.py gives any output"""
        documents = kegg.retrieve([{"db": "KEGG", "id": "K00121"}], basics=True, brite=True, pathway=True,
                                  dbxrefs_links=True, formula=True, reaction=True, genes=True, motif=True,
                                  orthology=True, reference=True)
        self.assertTrue(documents)

    def test_brite_output_1(self):
        # Test parsing and saving of a graph(v,e) in an adjacency list. Tree with one root and one continuous branch
        brite_example_1 = [["BRITE Root1"],
                           [" branch1"],
                           [" branch2"],
                           [" Branch3"],
                           [" BRANCH4"],
                           [" branch5"]
                           ]
        brite_example_output_1 = {"vertices": ["Root1", "branch1", "branch2", "Branch3", "BRANCH4", "branch5"],
                                  "edges": {"0": ["1"],
                                            "1": ["2"],
                                            "2": ["3"],
                                            "3": ["4"],
                                            "4": ["5"],
                                            "5": []
                                            }
                                  }
        self.assertEqual(kegg.read_brite(brite_example_1), brite_example_output_1)
        # Test parsing and saving of a graph(v,e) in an adjacency list. Tree with one root but two branches.
        brite_example_2 = [["BRITE Root1"],
                           [" branch1"],
                           [" branch2"],
                           [" Branch3"],
                           [" BRANCH4"],
                           [" branch5"]
                           ]
        brite_example_output_2 = {"vertices": ["Root1", "branch1", "branch2", "Branch3", "BRANCH4", "branch5"],
                                  "edges": {"0": ["1", "3"],
                                            "1": ["2"],
                                            "2": [],
                                            "3": ["4"],
                                            "4": ["5"],
                                            "5": []
                                            }
                                  }
        self.assertEqual(kegg.read_brite(brite_example_2), brite_example_output_2)
        # Test parsing and saving of a graph(v,e) in an adjacency list. Tree with a second root and separate branches
        brite_example_3 = [["BRITE Root1"],
                           [" branch1"],
                           [" branch2"],
                           [" Root2"],
                           [" BRANCH4"],
                           [" branch5"]
                           ]
        brite_example_output_3 = {"vertices": ["Root1", "branch1", "branch2", "Root2", "BRANCH4", "branch5"],
                                  "edges": {"0": ["1"],
                                            "1": ["2"],
                                            "2": [],
                                            "3": ["4"],
                                            "4": ["5"],
                                            "5": []
                                            }
                                  }
        self.assertEqual(kegg.read_brite(brite_example_3), brite_example_output_3)
        # Test parsing and saving of a graph(v,e) in an adjacency list. Tree with one root and branch, but multiple leaves
        brite_example_4 = [["BRITE Root1"],
                           [" branch1"],
                           [" branch2"],
                           [" Branch3"],
                           [" BRANCH4"],
                           [" branch5"]
                           ]
        brite_example_output_4 = {"vertices": ["Root1", "branch1", "branch2", "Branch3", "BRANCH4", "branch5"],
                                  "edges": {"0": ["1"],
                                            "1": ["2", "3", "4", "5"],
                                            "2": [],
                                            "3": [],
                                            "4": [],
                                            "5": []
                                            }
                                  }
        self.assertEqual(kegg.read_brite(brite_example_4), brite_example_output_4)
        # Test parsing and saving of a graph(v,e) in an adjacency list. Tree with a mix of the cases tested above
        brite_example_5 = [["BRITE Root1"],
                           [" branch1"],
                           [" branch2"],
                           [" Branch3"],
                           [" BRANCH4"],
                           [" branch5"],
                           [" Branch6"],
                           [" Branch7"],
                           [" Branch8"],
                           [" Branch9"]
                           ]
        brite_example_output_5 = {"vertices": ["Root1", "branch1", "branch2", "Branch3", "BRANCH4", "branch5",
                                               "Branch6", "Branch7", "Branch8", "Branch9"],
                                  "edges": {"0": ["1"],
                                            "1": ["2", "8"],
                                            "2": ["3"],
                                            "3": ["4", "6", "7"],
                                            "4": ["5"],
                                            "5": [],
                                            "6": [],
                                            "7": [],
                                            "8": ["9"],
                                            "9": []
                                            }
                                  }
        self.assertEqual(kegg.read_brite(brite_example_5), brite_example_output_5)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
import unittest
from dbxref.retrieve import rfam
class TestRfam(unittest.TestCase):
    # Smoke test: the Rfam retriever returns something for a known family
    # when both output sections are requested.
    def test_output(self):
        query = [{"db": "Rfam", "id": "RF03094"}]
        result = rfam.retrieve(query, basics=True, references=True)
        self.assertTrue(result)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment