Commit 504f2774 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Merge branch 'features/pubmed' into 'develop'


See merge request SOaAS/dbxref!16
parents 76cf8a03 41ec240d
......@@ -74,10 +74,16 @@
type: 'internal'
location: 'dbxref.retrieve.rfam'
- name: Pubmed
prefixes: ["Pubmed"]
prefixes: ["Pubmed", "PM"]
html: [""]
html: ['']
text: ['']
xml: ['']
json: ['']
check_existence: ""
type: 'internal'
location: 'dbxref.retrieve.pubmed'
- name: Protein Families
prefixes: ["PFAM"]
#!/usr/bin/env python3
import dbxref.resolver
import requests
import logging
import json
import argparse
logger = logging.getLogger(__name__)
def main():
    """Command-line entry point: retrieve Pubmed documents and print them as JSON.

    Parses the ``--basics`` flag and the trailing dbxref strings, converts the
    strings with ``dbxref.resolver.convert_to_dbxrefs``, hands them to
    ``retrieve`` and prints the returned documents to stdout as key-sorted,
    4-space-indented JSON.
    """
    parser = argparse.ArgumentParser(description="Retrieve Pubmed json documents and parse into dbxref json format")
    parser.add_argument("--basics", "-b", action="store_true", help="Include basic information such as title, language,"
                                                                      " dbxref-id, and day of publishment on pubmed.")
    parser.add_argument("dbxrefs", nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # ``store_true`` always produces a bool, so the previous membership test
    # ``None not in (args.basics)`` raised TypeError on every invocation.
    # ``--basics`` is the only section flag defined here, so it is used as the
    # default when no section was requested explicitly.
    if not args.basics:
        args.basics = True

    dbxrefs = dbxref.resolver.convert_to_dbxrefs(args.dbxrefs)
    documents = retrieve(dbxrefs, basics=args.basics)
    print(json.dumps(documents, sort_keys=True, indent=4))
def _get(result, field, mandatory=False, default="", transform=lambda x: x):
    """Return ``transform(result[field])`` when ``field`` is present in ``result``.

    Args:
        result: mapping to read from.
        field: key to look up.
        mandatory: when true, a missing key raises ``KeyError`` instead of
            returning ``default``.
        default: value returned for a missing, non-mandatory key.
        transform: callable applied to the stored value before it is returned;
            it is not applied to ``default``.

    Raises:
        KeyError: if ``field`` is missing and ``mandatory`` is true. Exceptions
            raised by ``transform`` itself propagate unchanged.
    """
    # A membership test (rather than try/except KeyError) keeps a KeyError
    # raised inside ``transform`` from being mistaken for a missing field.
    if field in result:
        return transform(result[field])
    if mandatory:
        # str.format accepts any key type; "+" concatenation raised TypeError
        # for non-string keys and masked the intended KeyError.
        raise KeyError("Field '{}' not found in dictionary".format(field))
    return default
def find_id(list, type):
    """Return the ``"value"`` of the first entry in an id list with the given ``"idtype"``.

    Args:
        list: iterable of dicts, each expected to carry ``"idtype"`` and
            ``"value"`` keys. (The parameter names shadow builtins; they are
            kept so existing keyword callers continue to work.)
        type: the id type to look for, e.g. ``"doi"``.

    Raises:
        KeyError: if no entry has the requested id type.
    """
    # Stop at the first match instead of collecting every match, and skip
    # entries without an "idtype" key rather than failing on them.
    for candidate in list:
        if "idtype" in candidate and candidate["idtype"] == type:
            return candidate["value"]
    raise KeyError("Id of type '" + type + "' not found in idlist.")
def join_authors(list):
    """Build one comma-separated string from the ``"name"`` of every author entry."""
    names = []
    for author in list:
        names.append(author["name"])
    return ", ".join(names)
def retrieve(dbxrefs, basics=True):
    """Retrieve Pubmed json documents and parse them into dbxref json format.

    Args:
        dbxrefs: sequence of dbxref dicts; the ``"id"`` of each one is used to
            select its record inside the ``"result"`` object of the downloaded
            JSON.
        basics: when true, add publication date, dbxref, title, language,
            authors, source, volume, issue and DOI to each document.

    Returns:
        A list with one dict per processed entry. Every dict has an ``"id"``
        key; the remaining keys are only present when ``basics`` is true.
        Fields absent from the Pubmed record are returned as empty strings.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    # NOTE(review): pairing assumes resolve() returns one entry per dbxref in
    # input order - confirm against dbxref.resolver. Previously the id of the
    # first dbxref was used for every entry, which broke multi-id requests.
    for ref, entry in zip(dbxrefs, resolved):
        # Construct URL for retrieval
        json_url = entry["locations"]["json"][0]
        logger.debug("URL: %s", json_url)
        r = requests.get(json_url)
        logger.debug("Content: %s", r.text)
        pm = json.loads(r.text)
        output = {"id": entry["dbxref"]}
        entry_id = ref["id"]
        # Parse basic information
        result = pm["result"][entry_id]
        if basics:
            output["publication-date"] = _get(result, "epubdate")
            output["dbxref"] = "Pubmed:" + _get(result, "uid")
            output["title"] = _get(result, "title")
            output["language"] = _get(result, "lang", transform=lambda x: ", ".join(x))
            output["authors"] = _get(result, "authors", transform=lambda x: join_authors(x))
            output["source"] = _get(result, "source")
            output["volume"] = _get(result, "volume")
            output["issue"] = _get(result, "issue")
            try:
                output["doi"] = _get(result, "articleids", transform=lambda x: find_id(x, "doi"))
            except KeyError:
                # find_id raises when the record lists no DOI; treat it like
                # any other missing field instead of aborting the retrieval.
                output["doi"] = ""
        # The document was previously built but never stored, so the function
        # always returned an empty list.
        documents.append(output)
    return documents
if __name__ == '__main__':
.. highlight:: yaml
Retrieve Pubmed json documents for dbxrefs and convert them into dbxref json format.
* ``--basics`` - Include basic information such as title, language, dbxref-id, and date of publication on Pubmed.
* ``--references`` - Include reference information such as journal name, DOI, authors and date of publication.
* ``--article_ids`` - Include article-IDs.
example: ``PM:19393038``
output scheme::
"article_IDs": [
"pubmed: 19393038",
"pii: gb-2009-10-4-r42",
"doi: 10.1186/gb-2009-10-4-r42",
"pmc: PMC2688933",
"rid: 19393038",
"eid: 19393038",
"pmcid: pmc-id: PMC2688933;"
"dbxref_id": "PM:19393038",
"epublic-date": "24.Apr.2009",
"language": [
"references": {
"DOI": "10.1186/gb-2009-10-4-r42",
"authors": [
"Zimin, AV",
"Delcher, AL",
"Florea, L",
"Kelley, DR",
"Schatz, MC",
"Puiu, D",
"Hanrahan, F",
"Pertea, G",
"Van, Tassell CP",
"Sonstegard, TS",
"Mar\u00e7ais, G",
"Roberts, M",
"Subramanian, P",
"Yorke, JA",
"Salzberg, SL"
"journal": "Genome biology",
"pubdate": "2009/01/01 00:00"
"title": "A whole-genome assembly of the domestic cow, Bos taurus."
import unittest
from dbxref.retrieve import pubmed
class TestPubmed(unittest.TestCase):
    """Smoke test for ``pubmed.retrieve``."""

    def test_output(self):
        """retrieve() returns a list whose documents each carry an ``id`` key."""
        # NOTE(review): presumably needs network access to Pubmed - confirm.
        documents = pubmed.retrieve([{"db": "Pubmed", "id": "19393038"}], basics=True)
        # Previously the result was never inspected, so the test could not fail
        # on malformed output.
        self.assertIsInstance(documents, list)
        for document in documents:
            self.assertIn("id", document)
if __name__ == "__main__":
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment