convert_blast_swissprot.py 2.19 KB
Newer Older
1
2
3
#!/usr/bin/python3
import sys
import json
4
import argparse
5

6
7
8
9
10
11
def _parse_fields_header(line):
    """Return a mapping of column name -> column index for a '# Fields:' line.

    The column names are the comma-separated entries that follow the
    '# Fields: ' prefix, in the order they appear on the line.
    """
    names = line.replace('# Fields: ', '').split(', ')
    return {name: idx for idx, name in enumerate(names)}


def _build_result(columns, header):
    """Build one result dict from the tab-separated columns of a hit row.

    Only 'subject id' is mandatory; every other field is emitted only when
    the corresponding column name(s) are present in ``header``.
    """
    subject = columns[header['subject id']]
    # Subject ids of the form 'db|ACCESSION|NAME' carry the accession in the
    # second '|'-separated field; ids without '|' are used unchanged instead
    # of failing with an IndexError.
    parts = subject.split('|')
    accession = parts[1] if len(parts) > 1 else subject

    result = {"dbxref": "UniProtKB/Swiss-Prot:" + accession}
    if '% identity' in header:
        result["percent_identity"] = float(columns[header['% identity']])
    if 'q. start' in header and 'q. end' in header:
        result['qloc'] = columns[header['q. start']] + '-' + columns[header['q. end']]
    if 's. start' in header and 's. end' in header:
        result['sloc'] = columns[header['s. start']] + '-' + columns[header['s. end']]
    if 'evalue' in header:
        result['evalue'] = float(columns[header['evalue']])
    if 'BTOP' in header:
        result['btop'] = columns[header['BTOP']]
    return result


def convert_blast_results(lines):
    """Convert the lines of a tabular-with-comments blast result to documents.

    Args:
        lines: iterable of text lines (e.g. an open file). Lines starting
            with '#' are treated as comments; '# Fields:', '# BLASTP' and
            '# Database' comments update the current column layout and tool
            description. Every other non-blank line is a tab-separated hit.

    Returns:
        A dict keyed by the first column of each hit row. Each value has the
        shape ``{"id": ..., "computations": [{"tool": ..., "results": [...]}]}``
        where ``tool`` is the tool description in effect when the first hit
        for that id was read.

    Raises:
        ValueError: if a hit row appears before any '# Fields:' comment, or
            the current '# Fields:' comment has no 'subject id' column.
    """
    documents = {}
    header = None
    tool = None

    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no data; skipping them avoids creating a
            # bogus document with an empty id.
            continue

        if line.startswith('#'):
            if line.startswith('# Fields:'):
                header = _parse_fields_header(line)
            elif line.startswith('# BLASTP'):
                tokens = line.split()
                tool = {'name': 'blastp',
                        'version': tokens[2]}
            elif line.startswith('# Database'):
                tokens = line.split()
                if tool is None:
                    # No '# BLASTP' line seen yet: still record the database
                    # rather than failing on tool being None.
                    tool = {}
                tool['database'] = tokens[2]
            continue

        if header is None:
            raise ValueError(
                "line {}: data row encountered before any '# Fields:' header".format(line_number))
        if 'subject id' not in header:
            raise ValueError(
                "line {}: the '# Fields:' header has no 'subject id' column".format(line_number))

        columns = line.split("\t")
        query_id = columns[0]
        if query_id not in documents:
            documents[query_id] = {"id": query_id, "computations": [{'tool': tool, 'results': []}]}
        results = documents[query_id]['computations'][0]['results']
        results.append(_build_result(columns, header))

    return documents


def main(argv=None):
    """Parse the command line, convert the blast result file and write JSON.

    Args:
        argv: optional list of argument strings; when None, argparse reads
            the process command line.
    """
    parser = argparse.ArgumentParser(description='Convert blast results to json documents')
    parser.add_argument('--result', '-r', required=True, help='The blast result file in outfmt 7 format (tsv with headers)')
    parser.add_argument('--output', '-o', required=True, help='The converted results json file')
    args = parser.parse_args(argv)

    with open(args.result) as f:
        documents = convert_blast_results(f)

    with open(args.output, 'w') as o:
        json.dump(documents, o)


if __name__ == '__main__':
    main()