pfam.py 3.32 KB
Newer Older
Lukas Jelonek's avatar
Lukas Jelonek committed
1
2
3
4
#!/usr/bin/env python3
import dbxref.resolver
import requests
import xml.etree.ElementTree as ET
5
from xml.etree.ElementTree import ParseError
Lukas Jelonek's avatar
Lukas Jelonek committed
6
7
8
9
10
11
12
13
import logging
import json
import argparse
# Uncomment the lines below to adjust logging verbosity while debugging.
#logging.basicConfig(level=logging.DEBUG)
#logging.getLogger().setLevel(logging.WARNING)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

14
# Prefix-to-URI map handed to ElementTree find()/findall() so that paths
# written as 'pfam:<tag>' resolve against the Pfam XML namespace.
ns = {'pfam': 'https://pfam.xfam.org/'}
Lukas Jelonek's avatar
Lukas Jelonek committed
15
16
17
18
19
20
21
22
23
24

def main():
    """Command-line entry point: look up dbxrefs and print the result as JSON.

    Parses ``--basic``/``-b`` and ``--annotation``/``-a`` plus the remaining
    positional arguments (the dbxrefs), converts them with
    ``dbxref.resolver.convert_to_dbxrefs``, passes them to ``retrieve`` and
    writes the returned documents to standard output as one JSON string.
    When neither flag is given, both sections are requested.
    """
    cli = argparse.ArgumentParser(
        description='Retrieve pfam xml documents for dbxrefs and convert them into json')
    cli.add_argument('--basic', '-b', action='store_true',
                     help='Include dbxref and description')
    cli.add_argument('--annotation', '-a', action='store_true',
                     help='Include annotation')
    cli.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    options = cli.parse_args()

    # No explicit selection means "give me everything".
    nothing_selected = not options.basic and not options.annotation
    include_basic = options.basic or nothing_selected
    include_annotation = options.annotation or nothing_selected

    references = dbxref.resolver.convert_to_dbxrefs(options.dbxrefs)
    results = retrieve(references, basic=include_basic, annotation=include_annotation)
    print(json.dumps(results))

def retrieve(dbxrefs, basic=True, annotation=True):
    """Download the Pfam XML record for each dbxref and convert it to a dict.

    Args:
        dbxrefs: dbxrefs to look up; handed to ``dbxref.resolver.resolve``
            with ``check_existence=False``.
        basic: when true, merge the fields returned by ``read_basic`` into
            each document.
        annotation: when true, merge the fields returned by
            ``read_annotation`` into each document.

    Returns:
        A list with one dict per resolved entry that offers an ``'xml'``
        location (entries without one are skipped). Every dict carries
        ``'id'``; on success it also carries the requested fields, otherwise
        a ``'message'`` describing what went wrong.

    Raises:
        KeyError, AttributeError: re-raised after logging when the XML parses
            but does not have the structure the readers expect.
        Exceptions raised by ``requests.get`` are not handled here and
        propagate to the caller.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        if 'xml' not in entry['locations']:
            continue

        xml_url = entry['locations']['xml'][0]
        logger.debug('URL: %s', xml_url)
        r = requests.get(xml_url)
        logger.debug('Content: %s', r.text)

        output = {'id': entry['dbxref']}
        try:
            root = ET.fromstring(r.text)
            # Serialize to text (not bytes) so the extracted message is the
            # real content rather than a slice of a "b'...'" repr with
            # escaped newlines and quotes.
            serialized = ET.tostring(root, encoding='unicode')
            error_open = '<error>'
            error_start = serialized.find(error_open)
            if error_start != -1:
                error_end = serialized.rfind('</error>')
                output['message'] = serialized[error_start + len(error_open):error_end]
            else:
                for child in root.findall('pfam:entry', ns):
                    if basic:
                        output.update(read_basic(child))
                    if annotation:
                        output.update(read_annotation(child))
        except (KeyError, AttributeError):
            logger.warning('Error in retrieving %s', str(entry))
            raise
        except (ParseError, RuntimeError):
            output['message'] = 'an error occurred'
            # The response was not parseable XML; it may be an HTML page.
            # A plain substring test is used because no HTML parser is
            # imported by this module, and this best-effort check must never
            # raise on its own.
            if 'noResultsMessage' in r.text:
                output['message'] = 'no results found; probably invalid ID'
        documents.append(output)
    return documents
Lukas Jelonek's avatar
Lukas Jelonek committed
67
68
69
70
71
72
73

def read_basic(entry):
    """Return the basic fields of a Pfam ``entry`` element.

    Args:
        entry: an ElementTree element whose ``pfam:description`` child holds
            the description text.

    Returns:
        A dict with the single key ``'description'`` mapped to the
        whitespace-stripped description text.

    Raises:
        AttributeError: if the description child is missing or has no text.
    """
    description_node = entry.find('pfam:description', ns)
    raw_text = description_node.text
    return {'description': raw_text.strip()}

def read_annotation(entry):
    """Return the annotation fields of a Pfam ``entry`` element.

    Args:
        entry: an ElementTree element with ``id`` and ``accession``
            attributes and optional ``pfam:comment`` / ``pfam:go_terms``
            children.

    Returns:
        A dict with ``'domain'`` (the ``id`` attribute), ``'accession'``,
        ``'terms'`` (a list of ``{'id', 'description'}`` dicts, one per
        ``pfam:term`` found under ``pfam:go_terms/pfam:category``) and, when
        the entry has a comment with text, ``'comment'``.

    Raises:
        KeyError: if ``entry`` lacks the ``id`` or ``accession`` attribute,
            or a term lacks ``go_id``.
    """
    annotation = {
        'domain': entry.attrib['id'],
        'accession': entry.attrib['accession'],
        'terms': [],
    }

    comment = entry.find('pfam:comment', ns)
    # Compare with None explicitly: an Element without child elements is
    # falsy, so a plain truthiness test would discard text-only comments.
    # The text check keeps an empty <comment/> from raising on .strip().
    if comment is not None and comment.text:
        annotation['comment'] = comment.text.strip()

    go_terms = entry.find('pfam:go_terms', ns)
    if go_terms is not None:
        for category in go_terms.findall('pfam:category', ns):
            for term in category.findall('pfam:term', ns):
                annotation['terms'].append({
                    'id': term.attrib['go_id'],
                    'description': term.text,
                })
    return annotation

95
96
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
  main()