Fix bug: hmmer and ghostx converters omit query sequences that have no results

7a52f072 · Lukas Jelonek · cda62f3e · 7a52f072 · 7a52f072
Commit 7a52f072 authored 6 years ago by Lukas Jelonek
--- a/scripts/convert_ghostx.py
+++ b/scripts/convert_ghostx.py
@@ -4,6 +4,7 @@ import sys
 import json
 import argparse
 from os import path
+import glob
 import subprocess

 parser = argparse.ArgumentParser(description='Convert ghostx results to json documents')
@@ -55,6 +56,20 @@ with open(result_filename) as f:

        results.append(result)

+# add empty-result entries for query ids (read from *.ids files) that are missing from documents
+queries = []
+path = "*.ids"
+for filename in glob.glob(path):
+    with open(filename) as f:
+        for line in f:
+            if line.startswith('>'):
+                queries.append(line.split()[0].strip().lstrip('>'))
+            else:
+                queries.append(line.split()[0].strip())
+for query_id in queries:
+    if not query_id in documents:
+        documents[query_id] = {"id": query_id, "computations": [{'tool': tool, 'results':[]}]}
+
 output_filename = args.output
 with open(output_filename, 'w') as o:
    json.dump(documents, o)

--- a/scripts/convert_hmmer.py
+++ b/scripts/convert_hmmer.py
@@ -12,12 +12,14 @@ args = parser.parse_args()

 # Provide a list of all query sequence names for conversion process
 queries = []
-path = "*_enum_headers.tsv"
+path = "*.ids"
 for filename in glob.glob(path):
    with open(filename) as f:
        for line in f:
            if line.startswith('>'):
                queries.append(line.split()[0].strip().lstrip('>'))
+            else:
+                queries.append(line.split()[0].strip())

 filename = args.result + "/domtblout.tsv"
 documents = {}