Commit dbf2bc5d authored by Lukas Jelonek

Merge remote-tracking branch 'origin/develop' into features/hamap

parents 0547ab60 1d471f2a
# This file is a template, and might need editing before it works on your project.
# Official language image. Look for the different tagged releases at:
# https://hub.docker.com/r/library/python/tags/
-image: python:3-alpine
+image: python:3-stretch
# Change pip's cache directory to be inside the project directory since we can
# only cache local items.
@@ -23,8 +23,8 @@ before_script:
   - pip install virtualenv
   - virtualenv venv
   - source venv/bin/activate
+  - apt-get install git
   - pip install -r requirements.txt
-  - apk add git
 stages:
   - test
@@ -27,12 +27,11 @@
   resources:
     html: ["https://www.ncbi.nlm.nih.gov/protein/GI:%i"]
     xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
-    # does not work
-    # check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
-- name: Conserved domain database
-  prefixes: ["CDD"]
-  resources:
-    html: ["https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=%i"]
+    text: ["https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?noredirect=1&db=protein&val=%i"]
+    check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
+  retriever:
+    type: 'internal'
+    location: 'dbxref.retrieve.gi'
 - name: Uniprot
   prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
   resources:
@@ -69,8 +68,11 @@
   resources:
     html: ["http://rfam.xfam.org/family/%i"]
     xml: ["http://rfam.xfam.org/family/%i?content-type=text%2Fxml"]
-    # does not work
-    # check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
+    json: ["https://rfam.org/family/%i?content-type=application/json"]
+    check_existence: "http://rfam.xfam.org/family/%i?content-type=text%2Fxml"
+  retriever:
+    type: 'internal'
+    location: 'dbxref.retrieve.rfam'
 - name: Pubmed
   prefixes: ["Pubmed"]
   resources:
@@ -95,7 +97,11 @@
 - name: InterPro
   prefixes: ["InterPro"]
   resources:
-    html: ["http://www.ebi.ac.uk/interpro/entry/%i"]
+    html: ["https://www.ebi.ac.uk/interpro/entry/%i"]
+    json: ["https://www.ebi.ac.uk/interpro/api/entry/InterPro/%i"]
+  retriever:
+    type: 'internal'
+    location: 'dbxref.retrieve.interpro'
   # does not work
   # check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
 - name: GeneID
@@ -121,10 +127,9 @@
     html: ["https://www.genome.jp/dbget-bin/www_bget?%i"]
     text: ["http://rest.kegg.jp/get/%i"]
     check_existence: "http://rest.kegg.jp/get/%i"
-    # not implemented yet
-    # retriever:
-    #   type: 'internal'
-    #   location: 'dbxref.retrieve.kegg'
+  retriever:
+    type: 'internal'
+    location: 'dbxref.retrieve.kegg'
 - name: HTTP
   prefixes: ["http", "https"]
   resources:
@@ -134,3 +139,7 @@
   prefixes: ['eggnog']
   resources:
     html: ["http://eggnogdb.embl.de/#/app/results?target_nogs=%i"]
+- name: Conserved domain database
+  prefixes: ["CDD"]
+  resources:
+    html: ["https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=%i"]
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
import xml.etree.ElementTree as ET
logger = logging.getLogger(__name__)
def main():
"""main()method for script usage"""
parser = argparse.ArgumentParser(description="Retrieves Protein Information from NCBIs Gene Identifier. "
"Database: Protein")
parser.add_argument("--basics", "-b", action="store_true", help="Include basic informations such as "
"dbxref/accession-nr., locus, source organism and "
"definition.")
parser.add_argument("--dbsource", "-db", action="store_true", help="Include source database information.")
parser.add_argument("--references", "-r", action="store_true", help="Include reference information.")
parser.add_argument("dbxref", nargs=argparse.REMAINDER)
args = parser.parse_args()
    # When no option is specified, include all available information
    # (store_true flags default to False, never None, so test truthiness)
    if not (args.basics or args.dbsource or args.references):
args.basics = True
args.dbsource = True
args.references = True
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxref)
documents = retrieve(dbxrefs, basics=args.basics, dbsource=args.dbsource, references=args.references)
print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, dbsource=True, references=True):
"""Retrieve Protein data as xml and parse into json format"""
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
xml_url = entry["locations"]["xml"][0]
logger.debug("URL: %s", xml_url)
gi = requests.get(xml_url)
logger.debug("Content: %s", gi.text)
output = {"id": entry["dbxref"]}
try:
root = ET.fromstring(gi.text)
if basics:
try:
output.update(read_basics(root))
except KeyError:
print("One ore more of the basic information were not available for given dbxref. "
"Please check the source data.")
raise
if dbsource:
try:
output.update(read_dbsource(root))
except KeyError:
print("Source database information wasn't or wasn't fully available. Please check the source data")
raise
if references:
try:
output.update(read_references(root))
except KeyError:
print("reference information wasn't or wasn't fully available. Please check the source data")
raise
except (RuntimeError, ET.ParseError):
print("An error occurred")
raise
documents.append(output)
return documents
def read_basics(root):
"""Finds basic information such as locus, dbxref, definition, organism, molecular information and representational
structure, if available, and puts out a dictionary containing the information"""
locus = root.find("Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_swissprot/Textseq-id/Textseq-id_name").text
dbxref_id = "GI:" + root.find("Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_swissprot/Textseq-id/"
"Textseq-id_accession").text
definition = root.find("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_title").text
organism = {"name": root.find("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/"
"BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/"
"OrgName_name_binomial/BinomialOrgName/BinomialOrgName_genus").text + " " +
root.find("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/"
"BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/OrgName_name_binomial/"
"BinomialOrgName/BinomialOrgName_species").text,
"taxonomy": root.find("OrgName_lineage")}
mol_info = root.find("MolInfo_biomol")
structure = root.find("Seqdesc_comment")
return {"locus": locus, "dbxref": dbxref_id, "definition": definition, "organism": organism,
"molecular_info": mol_info, "structure": structure}
def read_dbsource(root):
"""Finds databank sources in the xmland puts out a list with all dbxrefs found."""
dbxref_list = []
for dbtag in root.findall("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_sp/SP-block/SP-block_dbref/"
"Dbtag"):
dbxref_list.append(dbtag.find("Dbtag_db").text + ":" + dbtag.find("Dbtag_tag/Object-id/Object-id_str").text)
return {"source databases": dbxref_list}
def read_references(root):
"""Finds reference information in the xml and puts out a list containing information for authors, title, journal
and pubmed DOI"""
references = []
for cit_art in root.findall("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_pub/Pubdesc/Pubdesc_pub/"
"Pub-equiv/Pub/Pub_article/Cit-art"):
author_list = []
journal = {}
title = ""
doi = ""
# Find Authors
for author in cit_art.findall("Cit-art_authors/Auth-list/Auth-list_names/Auth-list_names_std/Author"):
author_list.append(author.find("Author_name/Person-id/Person-id_name/Name-std/Name-std_last").text + ", " +
author.find("Author_name/Person-id/Person-id_name/Name-std/Name-std_initials").text)
# Find Title
title = cit_art.find("Cit-art_title/Title/Title_E/Title_E_name").text
# Find Journal
journal = {"name": cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_title/Title/Title_E/"
"Title_E_iso-jta").text,
"date": cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_imp/Imprint/Imprint_date/"
"Date/Date_std/Date-std/Date-std_day").text + "." +
cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_imp/Imprint/Imprint_date/"
"Date/Date_std/Date-std/Date-std_month").text + "." +
cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_imp/Imprint/Imprint_date/"
"Date/Date_std/Date-std/Date-std_year").text
}
# Find Pubmed DOI
doi = cit_art.find("Cit-art_ids/ArticleIdSet/ArticleId/ArticleId_doi/DOI").text
# Put into dictionary
references.append({"authors": author_list,
"title": title,
"journal": journal,
"doi": doi
})
return {"references": references}
if __name__ == "__main__":
main()
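For orientation, a usage sketch of the GI retriever above; it mirrors the unit test at the end of this commit and assumes the dbxref package is importable and the NCBI service is reachable.

# Usage sketch: resolve and retrieve one GI entry (network access required).
from dbxref.retrieve import gi

documents = gi.retrieve([{"db": "GI", "id": "P0ABT0"}], basics=True, dbsource=True, references=True)
print(documents[0]["locus"])  # e.g. "HOLE_ECO57", per the documented output below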
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
logger = logging.getLogger(__name__)
def main():
"""main()method for script usage"""
# AVAILABLE for implementation:
# 'go_terms', 'member_databases', 'integrated', 'entry_annotations', ''
#
# USED:
# basics: 'accession', 'type', 'description', 'counters', 'entry_id', 'source_database', 'name'
# hierarchy
# wikipedia
# literature
# cross_references
# overlaps_with
parser = argparse.ArgumentParser(description="Retrieve InterPro documents and convert them into json")
parser.add_argument("--basics", "-b", action="store_true", help="Include basic information such as accession, "
"type, name, description, counters, entry_id and "
"source_database")
parser.add_argument("--hierarchy", "-hi", action="store_true", help="")
parser.add_argument("--wikipedia", "-w", action="store_true", help="")
parser.add_argument("--literature", "-l", action="store_true", help="")
parser.add_argument("--cross_references", "-cr", action="store_true", help="")
parser.add_argument("--overlaps", "-o", action="store_true", help="")
parser.add_argument("dbxrefs", nargs=argparse.REMAINDER)
args = parser.parse_args()
    # If no option is specified, output all available information for the entry
    # (store_true flags default to False, never None, so test truthiness)
    if not (args.basics or args.hierarchy or args.wikipedia or args.literature
            or args.cross_references or args.overlaps):
args.basics = True
args.hierarchy = True
args.wikipedia = True
args.literature = True
args.cross_references = True
args.overlaps = True
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
documents = retrieve(dbxrefs, basics=args.basics, hierarchy=args.hierarchy, wikipedia=args.wikipedia,
literature=args.literature, cross_references=args.cross_references, overlaps=args.overlaps)
print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, hierarchy=True, wikipedia=True, literature=True, cross_references=True, overlaps=True):
"""Retrieve json document from InterPro REST api, filter information by selected Options and parse into new json"""
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
# Construct URL for retrieve
json_url = entry['locations']['json'][0]
logger.debug('URL: %s', json_url)
r = requests.get(json_url)
logger.debug('Content: %s', r.text)
ipro = json.loads(r.text)
# Parse retrieved json file by selected Options
output = {"id": entry["dbxref"]}
if basics:
try:
output.update(accession=ipro["metadata"]["accession"], entry_type=ipro["metadata"]["type"],
description=ipro["metadata"]["description"], counters=ipro["metadata"]["counters"],
entry_id=ipro["metadata"]["entry_id"], name=ipro["metadata"]["name"],
source_database=ipro["metadata"]["source_database"])
except KeyError:
logger.warning("One or more basic information were not available for the given entry. Please check your output.")
if hierarchy:
try:
output.update(hierarchy=ipro["metadata"]["hierarchy"])
except KeyError:
logger.warning("Hierarchy information was not available for the given entry.")
if wikipedia:
try:
output.update(wikipedia=ipro["metadata"]["wikipedia"])
except KeyError:
logger.warning("Wikipedia articel were not available for the given entry.")
if literature:
try:
output.update(literature=ipro["metadata"]["literature"])
except KeyError:
logger.warning("Literature was not available for the given entry.")
if cross_references:
try:
output.update(cross_references=ipro["metadata"]["cross_references"])
except KeyError:
logger.warning("Cross_references were not available for the given entry.")
if overlaps:
try:
output.update(overlaps=ipro["metadata"]["overlaps_with"])
except KeyError:
logger.warning("Overlap information was not available for the given entry.")
documents.append(output)
return documents
if __name__ == "__main__":
main()
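A usage sketch for the InterPro retriever above, mirroring the unit test at the end of this commit; it assumes the dbxref package is importable and the InterPro REST API is reachable.

# Usage sketch: fetch one InterPro entry with all options enabled.
from dbxref.retrieve import interpro

documents = interpro.retrieve([{"db": "InterPro", "id": "IPR000003"}], basics=True, hierarchy=True,
                              wikipedia=True, literature=True, cross_references=True, overlaps=True)
print(documents[0]["accession"])  # expected to echo "IPR000003"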
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
logger = logging.getLogger(__name__)
def main():
"""main()method for script usage"""
parser = argparse.ArgumentParser(description="Retrieve Rfam json documents and parse them into dbxref json format")
parser.add_argument("--basics", "-b", action="store_true", help="Include basic informations such as dbxref_id, "
"name, description and comment.")
parser.add_argument("--references", "-r", action="store_true", help="Include reference information.")
parser.add_argument("dbxref", nargs=argparse.REMAINDER)
args = parser.parse_args()
    # When no option is specified, include all available information
    if not (args.basics or args.references):
args.basics = True
args.references = True
dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxref)
documents = retrieve(dbxrefs, basics=args.basics, references=args.references)
print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, references=True):
"""Retrieve rfam json documents and parse into dbxref json format"""
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
        # Construct URL for retrieval
json_url = entry["locations"]["json"][0]
logger.debug("URL: %s", json_url)
r = requests.get(json_url)
logger.debug("Content: %s", r.text)
rfam = json.loads(r.text)
output = {"id": entry["dbxref"]}
# Parse basic information
if basics:
try:
output.update({"dbxref": "RFAM:" + rfam["rfam"]["acc"],
"name": rfam["rfam"]["id"],
"description": rfam["rfam"]["description"],
"comment": rfam["rfam"]["comment"]
})
except KeyError:
print("Basic information weren't fully or only partly available. "
"Please check the dbxref and the Rfam-site.")
raise
# Parse reference information
if references:
try:
output.update({"references": {"author": rfam["rfam"]["curation"]["author"],
"DOI": rfam["rfam"]["curation"]["structure_source"],
"type": rfam["rfam"]["curation"]["type"]
}
})
except KeyError:
print("References weren't fully or only partly available. "
"Please check the dbxref and the Rfam-site")
raise
documents.append(output)
return documents
if __name__ == "__main__":
main()
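A usage sketch for the Rfam retriever above, using the example dbxref from the documentation below; the "RFAM" db label follows the config prefixes, and network access is assumed.

# Usage sketch: fetch one Rfam family and print its name.
from dbxref.retrieve import rfam

documents = rfam.retrieve([{"db": "RFAM", "id": "RF03094"}], basics=True, references=True)
print(documents[0]["name"])  # e.g. "LAGLIDADG-2"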
.. highlight:: yaml
Gene Identifier
===============
Retrieve GI xml documents for dbxrefs and convert them into json.
Options
-------
* ``--basics`` - Include basic information such as dbxref/accession-nr., locus, source organism and definition.
* ``--dbsource`` - Include source database information.
* ``--references`` - Include reference information.
Input
-----
example: ``GI:P0ABT0``
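
A possible invocation, assuming the package is installed and the script is run as a module::

   python3 -m dbxref.retrieve.gi --basics --dbsource --references GI:P0ABT0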
Output
------
output scheme::
[
{
"dbxref": "GI:P0ABT0",
"definition": "RecName: Full=DNA polymerase III subunit theta",
"locus": "HOLE_ECO57",
"molecular_info": null,
"organism": {
"name": "Escherichia coli",
"taxonomy": null
},
"references": [
{
"authors": [
"Perna, N.T.",
"Plunkett, G.",
"Burland, V.",
"Mau, B.",
"Glasner, J.D.",
"Rose, D.J.",
"Mayhew, G.F.",
"Evans, P.S.",
"Gregor, J.",
"Kirkpatrick, H.A.",
"Posfai, G.",
"Hackett, J.",
"Klink, S.",
"Boutin, A.",
"Shao, Y.",
"Miller, L.",
"Grotbeck, E.J.",
"Davis, N.W.",
"Lim, A.",
"Dimalanta, E.T.",
"Potamousis, K.D.",
"Apodaca, J.",
"Anantharaman, T.S.",
"Lin, J.",
"Yen, G.",
"Schwartz, D.C.",
"Welch, R.A.",
"Blattner, F.R."
],
"doi": "10.1038/35054089",
"journal": {
"date": "25.1.2001",
"name": "Nature"
},
"title": "Genome sequence of enterohaemorrhagic Escherichia coli O157:H7."
},
{
"authors": [
"Hayashi, T.",
"Makino, K.",
"Ohnishi, M.",
"Kurokawa, K.",
"Ishii, K.",
"Yokoyama, K.",
"Han, C.G.",
"Ohtsubo, E.",
"Nakayama, K.",
"Murata, T.",
"Tanaka, M.",
"Tobe, T.",
"Iida, T.",
"Takami, H.",
"Honda, T.",
"Sasakawa, C.",
"Ogasawara, N.",
"Yasunaga, T.",
"Kuhara, S.",
"Shiba, T.",
"Hattori, M.",
"Shinagawa, H."
],
"doi": "10.1093/dnares/8.1.11",
"journal": {
"date": "28.2.2001",
"name": "DNA Res."
},
"title": "Complete genome sequence of enterohemorrhagic Escherichia coli O157:H7 and genomic comparison with a laboratory strain K-12."
}
],
"source databases": [
"SMR:P0ABT0",
"STRING:155864.EDL933_2815",
"EnsemblBacteria:AAG56832",
"EnsemblBacteria:AAG56832",
"EnsemblBacteria:Z2891",
"EnsemblBacteria:BAB35975",
"EnsemblBacteria:BAB35975",
"EnsemblBacteria:BAB35975",
"GeneID:913059",
"KEGG:ece:Z2891",
"KEGG:ecs:ECs2552",
"PATRIC:fig|386585.9.peg.2675",
"eggNOG:ENOG4105MPK",
"eggNOG:ENOG4111UZC",
"HOGENOM:HOG000219272",
"KO:K02345",
"BioCyc:ECOO157:HOLE-MONOMER",
"Proteomes:UP000000558",
"Proteomes:UP000002519",
"GO:GO:0003677",
"GO:GO:0003887",
"GO:GO:0006260",
"Gene3D:1.20.58.250",
"InterPro:IPR009052",
"InterPro:IPR036745",
"Pfam:PF06440"
],
"structure": null
}
]
@@ -16,6 +16,9 @@ Welcome to DBXRef resolve and retrieval tool's documentation!
    sequence_ontology
    taxonomy
    uniprot
+   kegg
+   interpro
+   rfam
.. highlight:: yaml
Rfam
====
Retrieve Rfam json documents for dbxrefs and convert them into dbxref json format.
Options
-------
* ``--basics`` - Include basic information such as dbxref_id, name, description and comment.
* ``--references`` - Include reference information.
Input
-----
example: ``RFAM:RF03094``
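
A possible invocation, assuming the package is installed and the script is run as a module::

   python3 -m dbxref.retrieve.rfam --basics --references RFAM:RF03094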
Output
------
output scheme::
[
{
"comment": "Halobacteria (Archaea). Nearby to self-splicing intron genes",
"dbxref": "RF03094",
"description": "LAGLIDADG-2 RNA",
"name": "LAGLIDADG-2",
"references": {
"DOI": "Published; PMID:28977401;",
"author": "Weinberg Z",
"type": "Gene; sRNA;"
}
}
]
import unittest
from dbxref.retrieve import gi
class TestGI(unittest.TestCase):
def test_output(self):
documents = gi.retrieve([{"db": "GI", "id": "P0ABT0"}], basics=True, dbsource=True, references=True)
self.assertTrue(documents)
if __name__ == "__main__":
unittest.main()
import unittest
from dbxref.retrieve import interpro
class TestIPro(unittest.TestCase):
# Test if ipro retriever gives any output
def test_output(self):
documents = interpro.retrieve([{'db': 'InterPro', 'id': 'IPR000003'}], basics=True, hierarchy=True, wikipedia=True,
literature=True, cross_references=True, overlaps=True)
self.assertTrue(documents)