Commit c6381462 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Merge branch 'features/gi' into 'develop'


See merge request SOaAS/dbxref!9
parents 683bf12b 97794e74
......@@ -24,12 +24,11 @@
html: [""]
xml: [""]
# does not work
# check_existence: ""
- name: Conserved domain database
prefixes: ["CDD"]
html: [""]
text: [""]
check_existence: ""
type: 'internal'
location: ''
- name: Uniprot
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
import xml.etree.ElementTree as ET
logger = logging.getLogger(__name__)
def main():
    """Command-line entry point for retrieving protein information by GI dbxref.

    Parses the command line, converts the positional arguments into dbxrefs,
    retrieves the corresponding documents and prints them to stdout as
    pretty-printed JSON with sorted keys.

    The flags ``--basics``, ``--dbsource`` and ``--references`` select which
    sections are included. When none of them is given, all sections are
    included.
    """
    parser = argparse.ArgumentParser(description="Retrieves Protein Information from NCBIs Gene Identifier. "
                                                 "Database: Protein")
    parser.add_argument("--basics", "-b", action="store_true", help="Include basic informations such as "
                                                                    "dbxref/accession-nr., locus, source organism and "
                                                                    "definition.")
    parser.add_argument("--dbsource", "-db", action="store_true", help="Include source database information.")
    parser.add_argument("--references", "-r", action="store_true", help="Include reference information.")
    parser.add_argument("dbxref", nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # When not specified, include all information available.
    # store_true flags are always True/False (never None), so "nothing was
    # selected" has to be detected by truthiness; a `None not in (...)` test
    # is always true and would silently override the user's selection.
    if not (args.basics or args.dbsource or args.references):
        args.basics = True
        args.dbsource = True
        args.references = True

    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxref)
    documents = retrieve(dbxrefs, basics=args.basics, dbsource=args.dbsource, references=args.references)
    print(json.dumps(documents, sort_keys=True, indent=4))
def retrieve(dbxrefs, basics=True, dbsource=True, references=True):
"""Retrieve Protein data as xml and parse into json format.

Resolves the given dbxrefs (without an existence check), downloads the
document at the first "xml" location of every resolved entry and parses it
with ElementTree. The ``basics``, ``dbsource`` and ``references`` flags gate
the three optional sections. Returns the ``documents`` list.
"""
resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
documents = []
for entry in resolved:
# Only the first xml location of the entry is used.
xml_url = entry["locations"]["xml"][0]
logger.debug("URL: %s", xml_url)
gi = requests.get(xml_url)
logger.debug("Content: %s", gi.text)
# Every output document starts out with the dbxref it was retrieved for.
output = {"id": entry["dbxref"]}
root = ET.fromstring(gi.text)
# NOTE(review): in this extract the `try:` headers and the statements inside
# the three sections below are not visible (each `if` is followed directly
# by `except`), and no `documents.append(...)` is visible either.
# Presumably each section merges the matching read_* result into `output`
# -- confirm against the full file.
if basics:
except KeyError:
print("One ore more of the basic information were not available for given dbxref. "
"Please check the source data.")
if dbsource:
except KeyError:
print("Source database information wasn't or wasn't fully available. Please check the source data")
if references:
except KeyError:
print("reference information wasn't or wasn't fully available. Please check the source data")
# Parse and runtime errors are reported on stdout instead of being raised.
except (RuntimeError, ET.ParseError):
print("An error occurred")
return documents
def read_basics(root):
"""Find basic information in the parsed xml and return it as a dictionary.

The returned dictionary has the keys "locus", "dbxref", "definition",
"organism", "molecular_info" and "structure". ``root`` is the ElementTree
root element of the retrieved document.
"""
# NOTE(review): several continuation lines of this function are missing from
# this extract (the end of the dbxref_id path and parts of the organism
# dictionary), so the statements below are incomplete as shown.
locus = root.find("Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_swissprot/Textseq-id/Textseq-id_name").text
dbxref_id = "GI:" + root.find("Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_swissprot/Textseq-id/"
definition = root.find("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_title").text
organism = {"name": root.find("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/"
"OrgName_name_binomial/BinomialOrgName/BinomialOrgName_genus").text + " " +
"taxonomy": root.find("OrgName_lineage")}
# NOTE(review): find() with a bare tag name only matches direct children of
# `root`, and the result is stored as returned by find() (no `.text`).
# Confirm whether a descendant search (".//...") and `.text` were intended
# for the three bare-tag lookups (OrgName_lineage, MolInfo_biomol,
# Seqdesc_comment).
mol_info = root.find("MolInfo_biomol")
structure = root.find("Seqdesc_comment")
return {"locus": locus, "dbxref": dbxref_id, "definition": definition, "organism": organism,
"molecular_info": mol_info, "structure": structure}
def read_dbsource(root):
"""Find the database sources in the xml and return all dbxrefs found.

Returns a dictionary with the single key "source databases" whose value is
a list of "<db>:<id>" strings.
"""
dbxref_list = []
# NOTE(review): the end of the findall() path below is missing from this
# extract; the loop variable is expected to have "Dbtag_db" and
# "Dbtag_tag/Object-id/Object-id_str" children.
for dbtag in root.findall("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_sp/SP-block/SP-block_dbref/"
dbxref_list.append(dbtag.find("Dbtag_db").text + ":" + dbtag.find("Dbtag_tag/Object-id/Object-id_str").text)
return {"source databases": dbxref_list}
def read_references(root):
"""Find the reference information in the xml and return it as a list.

Returns a dictionary with the single key "references". Each list item is a
dictionary with the keys "authors", "title", "journal" and "doi".
"""
references = []
# NOTE(review): several continuation lines of this function are missing from
# this extract (the end of the findall() path, the second half of the author
# name, parts of the journal dictionary and the closing brackets of the
# append call), so the statements below are incomplete as shown.
for cit_art in root.findall("Seq-entry_seq/Bioseq/Bioseq_descr/Seq-descr/Seqdesc/Seqdesc_pub/Pubdesc/Pubdesc_pub/"
author_list = []
journal = {}
title = ""
doi = ""
# Find Authors
for author in cit_art.findall("Cit-art_authors/Auth-list/Auth-list_names/Auth-list_names_std/Author"):
author_list.append(author.find("Author_name/Person-id/Person-id_name/Name-std/Name-std_last").text + ", " +
# Find Title
title = cit_art.find("Cit-art_title/Title/Title_E/Title_E_name").text
# Find Journal; the date parts are joined with "." starting with the day
journal = {"name": cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_title/Title/Title_E/"
"date": cit_art.find("Cit-art_from/Cit-art_from_journal/Cit-jour/Cit-jour_imp/Imprint/Imprint_date/"
"Date/Date_std/Date-std/Date-std_day").text + "." +
"Date/Date_std/Date-std/Date-std_month").text + "." +
# Find Pubmed DOI
doi = cit_art.find("Cit-art_ids/ArticleIdSet/ArticleId/ArticleId_doi/DOI").text
# Put into dictionary
references.append({"authors": author_list,
"title": title,
"journal": journal,
"doi": doi
return {"references": references}
if __name__ == "__main__":
.. highlight:: yaml
Gene Identifier
Retrieve GI xml documents for dbxrefs and convert them into json.
* ``--basics`` - Include basic information such as dbxref/accession-nr., locus, source organism and definition.
* ``--dbsource`` - Include source database information.
* ``--references`` - Include reference information.
example: ``GI:P0ABT0``
output scheme::
"dbxref": "GI:P0ABT0",
"definition": "RecName: Full=DNA polymerase III subunit theta",
"locus": "HOLE_ECO57",
"molecular_info": null,
"organism": {
"name": "Escherichia coli",
"taxonomy": null
"references": [
"authors": [
"Perna, N.T.",
"Plunkett, G.",
"Burland, V.",
"Mau, B.",
"Glasner, J.D.",
"Rose, D.J.",
"Mayhew, G.F.",
"Evans, P.S.",
"Gregor, J.",
"Kirkpatrick, H.A.",
"Posfai, G.",
"Hackett, J.",
"Klink, S.",
"Boutin, A.",
"Shao, Y.",
"Miller, L.",
"Grotbeck, E.J.",
"Davis, N.W.",
"Lim, A.",
"Dimalanta, E.T.",
"Potamousis, K.D.",
"Apodaca, J.",
"Anantharaman, T.S.",
"Lin, J.",
"Yen, G.",
"Schwartz, D.C.",
"Welch, R.A.",
"Blattner, F.R."
"doi": "10.1038/35054089",
"journal": {
"date": "25.1.2001",
"name": "Nature"
"title": "Genome sequence of enterohaemorrhagic Escherichia coli O157:H7."
"authors": [
"Hayashi, T.",
"Makino, K.",
"Ohnishi, M.",
"Kurokawa, K.",
"Ishii, K.",
"Yokoyama, K.",
"Han, C.G.",
"Ohtsubo, E.",
"Nakayama, K.",
"Murata, T.",
"Tanaka, M.",
"Tobe, T.",
"Iida, T.",
"Takami, H.",
"Honda, T.",
"Sasakawa, C.",
"Ogasawara, N.",
"Yasunaga, T.",
"Kuhara, S.",
"Shiba, T.",
"Hattori, M.",
"Shinagawa, H."
"doi": "10.1093/dnares/8.1.11",
"journal": {
"date": "28.2.2001",
"name": "DNA Res."
"title": "Complete genome sequence of enterohemorrhagic Escherichia coli O157:H7 and genomic comparison with a laboratory strain K-12."
"source databases": [
"structure": null
import unittest
from dbxref.retrieve import gi
# Test case for the GI retriever (dbxref.retrieve.gi).
class TestGI(unittest.TestCase):
# Retrieves the document for the dbxref GI:P0ABT0 with all three sections
# (basics, dbsource, references) enabled.
# NOTE(review): no assertion on `documents` is visible in this extract --
# confirm against the full file that the result is actually checked.
def test_output(self):
documents = gi.retrieve([{"db": "GI", "id": "P0ABT0"}], basics=True, dbsource=True, references=True)
if __name__ == "__main__":
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment