Commit 2d42a183 authored by Lukas Jelonek
Add retrieval script for pfam

parent 487cf9b0
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
prefixes: ["PFAM"] prefixes: ["PFAM"]
resources: resources:
html: [""] html: [""]
xml: [""] xml: [""]
# does not work # does not work
# check_existence: "" # check_existence: ""
- name: PDB - name: PDB
#!/usr/bin/env python3
import env
import dbxref.config
import dbxref.resolver
import requests
import xml.etree.ElementTree as ET
import logging
import json
import argparse
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Prefix-to-URI map passed to the ElementTree find()/findall() calls in this
# file so that 'pfam:<tag>' paths can be resolved.
# NOTE(review): the URI for 'pfam' is an empty string here; it presumably
# should be the namespace URI declared by the Pfam XML documents -- confirm.
ns = {'pfam': ''}
def main():
    """Resolve the dbxrefs given on the command line and print them as JSON.

    For every resolved dbxref the first ``xml`` location is downloaded and
    parsed. Each ``pfam:entry`` element found in the document contributes
    the fields selected with ``--basic`` and/or ``--annotation``; when
    neither flag is given, both are enabled. The collected documents are
    written to stdout as a single JSON array.
    """
    parser = argparse.ArgumentParser(description='Retrieve pfam xml documents for dbxrefs and convert them into json')
    parser.add_argument('--basic', '-b', action='store_true', help='Include dbxref and description')
    parser.add_argument('--annotation', '-a', action='store_true', help='Include annotation')
    parser.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # No explicit selection means "give me everything".
    if not (args.basic or args.annotation):
        args.basic = True
        args.annotation = True

    resolved = dbxref.resolver.resolve(args.dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        xml_url = entry['locations']['xml'][0]
        logger.debug('URL: %s', xml_url)
        r = requests.get(xml_url)
        logger.debug('Content: %s', r.text)
        root = ET.fromstring(r.text)

        output = {'dbxref': entry['dbxref']}
        for child in root.findall('pfam:entry', ns):
            if args.basic:
                output.update(read_basic(child))
            if args.annotation:
                output.update(read_annotation(child))
        documents.append(output)

    # NOTE(review): the output step is inferred from the parser description
    # ("convert them into json") and the otherwise unused `documents` list
    # -- confirm the intended destination/format.
    print(json.dumps(documents))
def read_basic(entry):
    """Return the basic fields of a ``pfam:entry`` element.

    Args:
        entry: element exposing ``find``; its ``pfam:description`` child is
            looked up using the module-level ``ns`` mapping.

    Returns:
        A dict with the single key ``'description'`` holding the child's
        text with surrounding whitespace stripped.
    """
    description_node = entry.find('pfam:description', ns)
    return {'description': description_node.text.strip()}
def read_annotation(entry):
    """Return the annotation fields of a ``pfam:entry`` element.

    Args:
        entry: element exposing ``attrib`` and ``find``; child lookups use
            the module-level ``ns`` mapping.

    Returns:
        A dict with the keys ``'id'`` and ``'accession'`` (taken from the
        element's attributes), ``'comment'`` (stripped text of the
        ``pfam:comment`` child) and ``'terms'``: a list of
        ``{'id': <go_id attribute>, 'description': <term text>}`` dicts, one
        per ``pfam:term`` found under ``pfam:go_terms/pfam:category``. The
        list is empty when the entry has no ``pfam:go_terms`` child.

    Raises:
        KeyError: if the ``id``/``accession`` attribute (or a term's
            ``go_id``) is missing.
        AttributeError: if the ``pfam:comment`` child is missing or has no
            text.
    """
    annotation = {
        'id': entry.attrib['id'],
        'accession': entry.attrib['accession'],
        'terms': [],
        'comment': entry.find('pfam:comment', ns).text.strip(),
    }

    go_terms = entry.find('pfam:go_terms', ns)
    # Compare against None explicitly: an Element without children is falsy,
    # and find() returns None when the child is absent.
    if go_terms is not None:
        for category in go_terms.findall('pfam:category', ns):
            annotation['terms'].extend(
                {'id': term.attrib['go_id'], 'description': term.text}
                for term in category.findall('pfam:term', ns)
            )
    return annotation
