pfam.py 3.26 KB
Newer Older
Lukas Jelonek's avatar
Lukas Jelonek committed
1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env python3
import dbxref.resolver
import requests
import xml.etree.ElementTree as ET
import logging
import json
import argparse
# Uncomment the basicConfig/setLevel lines below to get verbose output while
# debugging this module.
#logging.basicConfig(level=logging.DEBUG)
#logging.getLogger().setLevel(logging.WARNING)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

13
# Prefix -> namespace URI map handed to the ElementTree find()/findall() calls
# in this module so that 'pfam:<tag>' paths can be resolved.
ns = {'pfam': 'https://pfam.xfam.org/'}
Lukas Jelonek's avatar
Lukas Jelonek committed
14
15
16
17
18
19
20
21
22
23

def main():
    """Command-line entry point.

    Parses the command line, converts the positional arguments into dbxrefs,
    retrieves the corresponding documents and prints them to stdout as JSON.
    When neither --basic nor --annotation is given, both are enabled.
    """
    cli = argparse.ArgumentParser(description='Retrieve pfam xml documents for dbxrefs and convert them into json')
    cli.add_argument('--basic', '-b', action='store_true', help='Include dbxref and description')
    cli.add_argument('--annotation', '-a', action='store_true', help='Include annotation')
    cli.add_argument('dbxrefs', nargs=argparse.REMAINDER)
    options = cli.parse_args()

    # No selection flag at all means "give me everything".
    if not options.basic and not options.annotation:
        options.basic = options.annotation = True

    references = dbxref.resolver.convert_to_dbxrefs(options.dbxrefs)
    result = retrieve(references, basic=options.basic, annotation=options.annotation)
    print(json.dumps(result))

def retrieve(dbxrefs, basic=True, annotation=True):
    """Download the XML document of every dbxref and convert it to a dict.

    Each dbxref is resolved via ``dbxref.resolver.resolve``; entries that have
    no 'xml' location are skipped. For the remaining entries the first XML
    location is fetched and parsed.

    Args:
        dbxrefs: dbxrefs to look up, as accepted by ``dbxref.resolver.resolve``.
        basic: if true, merge the result of ``read_basic`` into each document.
        annotation: if true, merge the result of ``read_annotation`` into each
            document.

    Returns:
        A list of dicts, one per entry with an 'xml' location. Every dict has
        an 'id' key holding the entry's dbxref. If the returned XML contains an
        ``<error>`` element, its content is stored under 'message'; otherwise
        the fields of each ``pfam:entry`` child of the root are merged in.

    Raises:
        KeyError, AttributeError: re-raised (after logging a warning) when an
            expected attribute or element is missing from the document.
        Exceptions from ``requests.get`` and ``ET.fromstring`` (e.g. a parse
        error on a non-XML response) are not handled here and propagate.
    """
    resolved = dbxref.resolver.resolve(dbxrefs, check_existence=False)
    documents = []
    for entry in resolved:
        if 'xml' not in entry['locations']:
            continue

        xml_url = entry['locations']['xml'][0]
        logger.debug('URL: %s', xml_url)
        r = requests.get(xml_url)
        logger.debug('Content: %s', r.text)
        root = ET.fromstring(r.text)

        output = {'id': entry['dbxref']}

        try:
            # encoding='unicode' returns a real str. Wrapping the default
            # bytes result in str() would give its repr ("b'...'"), leaking
            # escape sequences into the extracted message.
            tree = ET.tostring(root, encoding='unicode')
            if '<error>' in tree:
                start = tree.find('<error>') + len('<error>')
                output['message'] = tree[start:tree.rfind('</error>')]
            else:
                for child in root.findall('pfam:entry', ns):
                    if basic:
                        output.update(read_basic(child))
                    if annotation:
                        output.update(read_annotation(child))
        except (KeyError, AttributeError):
            logger.warning('Error in retrieving %s', str(entry))
            raise
        except RuntimeError:
            output['message'] = 'an error occurred'
            # The "no results" page is recognised by the id of its message
            # element; a plain substring test avoids needing an HTML parser
            # that this module does not import.
            if 'noResultsMessage' in r.text:
                output['message'] = 'no results found; probably invalid ID'
        documents.append(output)
    return documents
Lukas Jelonek's avatar
Lukas Jelonek committed
65
66
67
68
69
70
71

def read_basic(entry):
    """Return the basic fields of a pfam entry element.

    Looks up the entry's ``pfam:description`` child and returns its text,
    stripped of surrounding whitespace, as ``{'description': <text>}``.
    Raises AttributeError when the element or its text is missing.
    """
    description_node = entry.find('pfam:description', ns)
    return {'description': description_node.text.strip()}

def read_annotation(entry):
    """Return the annotation fields of a pfam entry element.

    Args:
        entry: the ``pfam:entry`` element to read.

    Returns:
        A dict with:
          * 'domain': the entry's ``id`` attribute,
          * 'accession': the entry's ``accession`` attribute,
          * 'terms': a list of ``{'id': <go_id attribute>, 'description':
            <term text>}`` dicts, one per ``pfam:term`` found below
            ``pfam:go_terms/pfam:category`` (empty if there are none),
          * 'comment': the stripped text of the ``pfam:comment`` child; only
            present when that element exists and has text.

    Raises:
        KeyError: if the entry lacks the ``id`` or ``accession`` attribute, or
            a term lacks ``go_id``.
    """
    annotation = {
        'domain': entry.attrib['id'],
        'accession': entry.attrib['accession'],
        'terms': [],
    }

    # Compare against None explicitly: an Element without child elements is
    # falsy, so a plain truth test would skip a comment that only holds text.
    comment = entry.find('pfam:comment', ns)
    if comment is not None and comment.text is not None:
        annotation['comment'] = comment.text.strip()

    go_terms = entry.find('pfam:go_terms', ns)
    if go_terms is not None:
        for category in go_terms.findall('pfam:category', ns):
            for term in category.findall('pfam:term', ns):
                annotation['terms'].append({
                    'id': term.attrib['go_id'],
                    'description': term.text,
                })

    return annotation

93
94
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
  main()