convert_ghostx.py 2.13 KB
Newer Older
1
2
3
4
5
6
7
8
#!/usr/bin/python3
import sys
import json
import argparse

def _build_parser():
    """Create the command-line parser for the ghostx-to-JSON converter."""
    parser = argparse.ArgumentParser(description='Convert ghostx results to json documents')
    parser.add_argument('--result', '-r', required=True, help='The ghostx result directory')
    parser.add_argument('--output', '-o', required=True, help='The converted results json file')
    parser.add_argument('--dbxref', '-d', required=True, help='The dbxref prefix that will be prepended to the accession or id')
    parser.add_argument('--acc_split', '-s', required=False, help='The delimiter used to split the subject id (optional)')
    parser.add_argument('--acc_position', '-p', type=int, required=False, help='The position of the accession after splitting the subject_id (optional)')
    return parser


def _extract_accession(subject_id, acc_split=None, acc_position=None):
    """Return the accession part of a subject id.

    Without a delimiter the subject id is returned unchanged. With a
    delimiter the id is split and the part at ``acc_position`` is returned;
    the first part is used when no position was given.

    Raises:
        IndexError: if ``acc_position`` is outside the split subject id.
    """
    if not acc_split:
        return subject_id
    # Compare against None explicitly: position 0 is a valid, falsy index.
    position = 0 if acc_position is None else acc_position
    return subject_id.split(acc_split)[position]


def _convert(lines, tool, dbxref, acc_split=None, acc_position=None):
    """Group tab-separated ghostx result lines into one document per query id.

    Args:
        lines: iterable of result lines (e.g. an open ``results.tsv``).
        tool: tool description stored with every document's computation.
        dbxref: prefix joined to the accession with ``:``.
        acc_split: optional delimiter used to split the subject id.
        acc_position: optional index of the accession in the split subject id.

    Returns:
        A dict mapping each query id to
        ``{"id": ..., "computations": [{"tool": ..., "results": [...]}]}``.

    Raises:
        ValueError: if a non-blank line has fewer than 11 columns, or its
            e-value / percent identity column is not a number.
    """
    documents = {}
    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            # Blank (e.g. trailing) lines carry no hit.
            continue
        fields = line.split("\t")
        if len(fields) < 11:
            raise ValueError(
                "line {}: expected at least 11 tab-separated columns, got {}".format(
                    line_number, len(fields)))

        # ghostx may contain the full fasta header as a query
        # remove everything after the actual id
        query_id = fields[0].split()[0]
        if query_id not in documents:
            documents[query_id] = {"id": query_id, "computations": [{'tool': tool, 'results': []}]}

        accession = _extract_accession(fields[1], acc_split, acc_position)
        # Coordinates are written as the strings found in the file; only the
        # e-value and the percent identity are converted to numbers.
        hit = {
            'query': {
                'start': fields[6],
                'end': fields[7],
            },
            'target': {
                'dbxref': dbxref + ':' + accession,
                'start': fields[8],
                'end': fields[9],
                'evalue': float(fields[10]),
                'percent_identity': float(fields[2]),
            },
        }
        documents[query_id]['computations'][0]['results'].append(hit)
    return documents


def main(argv=None):
    """Convert ``<result>/results.tsv`` to a JSON file of per-query documents.

    Reads the tool description from ``<result>/info.json``, converts every
    line of ``<result>/results.tsv`` and writes the documents to ``--output``.

    Args:
        argv: argument list to parse; ``None`` lets argparse use the
            process command line.
    """
    args = _build_parser().parse_args(argv)

    info_filename = args.result + "/info.json"
    result_filename = args.result + "/results.tsv"

    # read tool info
    with open(info_filename) as f:
        tool = json.load(f)

    with open(result_filename) as f:
        documents = _convert(f, tool, args.dbxref, args.acc_split, args.acc_position)

    with open(args.output, 'w') as o:
        json.dump(documents, o)


if __name__ == "__main__":
    main()