Commit 5a3cc5ad authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Merge branch 'fasta-header-decomplexify' into 'develop'

Fasta header decomplexify

See merge request SOaAS/psot!6
parents ebb26a4a ad087cc8
......@@ -16,6 +16,7 @@ In order to run PSOT on your machine you need:
* signalp
* ghostx
* tmhmm
* targetp
* the bioinformatic databases you want to use
......
#!/usr/bin/python3
import sys
import json
import argparse
from os import path
import subprocess
parser = argparse.ArgumentParser(description='Convert ghostx results to json documents')
parser.add_argument('--result', '-r', required=True, help='The ghostx result directory')
......@@ -55,3 +58,7 @@ with open(result_filename) as f:
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
# Replace sequences' enumerated ids with their original ids
restore_seq_ids_tool = path.dirname(__file__) + '/restore_seq_id_from_enumeration.py'
subprocess.run([restore_seq_ids_tool, '-j', output_filename, '-e', args.result + '/enum_headers.tsv'])
#!/usr/bin/python3
import sys
import json
import argparse
from os import path
import subprocess
parser = argparse.ArgumentParser(description = 'Convert targetp results to json documents')
parser.add_argument('--result', '-r', required = True, help = 'The targetp result file')
parser.add_argument('--output', '-o', required = True, help = 'The converted results json file')
parser = argparse.ArgumentParser(description='Convert targetp results to json document')
parser.add_argument('--result', '-r', required=True, help='The targetp results directory')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()
loc_dict = {"C": "chloroplast", "M": "mitochondrion", "S": "secretory pathway", "_": "other", "*": "undetermined"}
filename = args.result
filename = args.result + '/results.txt'
documents = {}
with open(filename) as f:
tool = None
field_index = {"Name": None, "cTP": None, "mTP": None, "SP": None, "Loc": None}
field_index = {"cTP": None, "mTP": None, "SP": None, "Loc": None}
is_datasection = False
for line in f:
......@@ -27,12 +31,10 @@ with open(filename) as f:
results = documents[split[0]]['computations'][0]["results"][0]
for field in field_index:
if field_index[field] is not None:
if field in ["cTP", "mTP", "SP"]:
results[field.lower()] = float(split[field_index[field]])
elif field == "Loc":
if field == "Loc":
results[field.lower()] = loc_dict[split[field_index[field]]]
else:
results[field.lower()] = split[field_index[field]]
results[field.lower()] = float(split[field_index[field]])
else:
if line.startswith('Name '):
split = line.split()
......@@ -50,5 +52,10 @@ with open(filename) as f:
tool['organism_group'] = 'non-plant'
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
# Replace sequences' enumerated ids with their original ids
restore_seq_ids_tool = path.dirname(__file__) + '/restore_seq_id_from_enumeration.py'
subprocess.run([restore_seq_ids_tool, '-j', output_filename, '-e', args.result + '/enum_headers.tsv'])
#!/usr/bin/python3
import argparse
import fileinput

# Rewrite a fasta file in place, replacing every header with a running
# number (1, 2, 3, ...) so downstream tools never have to cope with long
# or complex header lines. The number -> original-header mapping is kept
# in a tab-separated file so the ids can be restored later.
parser = argparse.ArgumentParser(description='Replaces fasta headers with unique numbers and saves a dictionary of both in tsv format. Caution: The original fasta file gets replaced in the process.')
parser.add_argument('--fasta', '-f', required=True, help='The fasta file')
parser.add_argument('--enum-headers', '-e', required=True, help='File to store enumerated headers in tsv format')
args = parser.parse_args()

enumerated_headers = {}
counter = 1
# fileinput with inplace=True redirects print() back into the fasta file.
with fileinput.FileInput(args.fasta, inplace=True) as fasta_in:
    for record_line in fasta_in:
        if not record_line.startswith(">"):
            # Sequence data passes through untouched (keep original newline).
            print(record_line, end='')
            continue
        # Remember the original header (without '>' and trailing whitespace)
        # and emit the bare number in its place.
        enumerated_headers[counter] = record_line.strip().lstrip('>')
        print(">{}".format(counter))
        counter += 1

# Persist the mapping: one "<number>\t<original header>" row per sequence.
with open(args.enum_headers, 'w') as mapping_out:
    mapping_out.writelines(
        "{}\t{}\n".format(number, header)
        for number, header in enumerated_headers.items()
    )
#!/usr/bin/python3
import json
import argparse

# Undo the header enumeration: rewrite a results json file in place so that
# each document is keyed by (and carries as "id") the first token of its
# original fasta header instead of the enumeration number.
parser = argparse.ArgumentParser(description='Replace enumerated id of sequences with original identifier. Caution: The original json file gets replaced in the process.')
parser.add_argument('--json', '-j', required=True, help='The results json file')
parser.add_argument('--enum-headers', '-e', required=True, help='The enumerated original headers in tsv format')
args = parser.parse_args()

# Load the enumerated documents produced by a converter script.
with open(args.json) as json_in:
    enumerated_docs = json.load(json_in)

# Build number -> original id from the tsv mapping; the id is the first
# whitespace-separated token of the original header.
id_by_number = {}
with open(args.enum_headers) as mapping_in:
    for row in mapping_in:
        number, original_header = row.strip().split('\t', 1)
        id_by_number[number] = original_header.split()[0]

# Re-key every document by its original id and patch the embedded "id" field.
restored_docs = {}
for number, document in enumerated_docs.items():
    original_id = id_by_number[number]
    document["id"] = original_id
    restored_docs[original_id] = document

# Overwrite the input json with the restored documents.
with open(args.json, 'w') as json_out:
    json.dump(restored_docs, json_out)
#!/usr/bin/env python3
import env
import argparse
import re
from os import system,makedirs
from os import system,makedirs,path
from psot import config
import subprocess
import json
......@@ -12,9 +13,15 @@ ghostx_tool = config.load_config()['tools'].get('ghostx', 'ghostx')
parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json and results.tsv.')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json, results.tsv and enum_headers.tsv.')
args = parser.parse_args()
makedirs(args.output, exist_ok=True)
# Swap fasta headers for unique numbers to save ghostx from dealing with complex headers
reduce_headers_tool = path.dirname(__file__) + '/reduce_fasta_headers_to_enumeration.py'
subprocess.run([reduce_headers_tool, "-f", args.fasta, "-e", args.output + '/enum_headers.tsv'])
# Aproach:
# directory for output
# info.json -> Tool info
......@@ -26,11 +33,10 @@ toolconfig = {
}
# find version
output = subprocess.run([ghostx_tool], stderr=subprocess.PIPE)
text =output.stderr.decode('ascii')
text = output.stderr.decode('ascii')
result = re.search('version (.*)', text)
toolconfig['version'] = result.group(1)
makedirs(args.output, exist_ok=True)
with open(args.output + '/info.json', 'w') as f:
json.dump(toolconfig, f)
system(ghostx_tool + " aln -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
#!/usr/bin/env python3
import env
import argparse
from psot import config
from os import system
from os import system,makedirs,path
import subprocess
targetp_tool = config.load_config()['tools'].get('targetp', 'targetp')
org_flags = {'plant': '-P', 'non-plant': '-N'}
parser = argparse.ArgumentParser(description = 'Determine subcellular locations of eukaryotic amino acid sequences')
parser.add_argument('--fasta', '-f', required = True, help = 'A fasta file with amino acid sequences')
parser.add_argument('--organism_group', choices = org_flags.keys(), required = True, help = 'Define wether to use plant/non-plant networks')
parser.add_argument('--output', required = True, help = 'The output file')
parser = argparse.ArgumentParser(description='Determine subcellular locations of eukaryotic amino acid sequences')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with amino acid sequences')
parser.add_argument('--organism_group', choices=org_flags.keys(), required=True, help='Define wether to use plant/non-plant networks')
parser.add_argument('--output', required=True, help='The result directory. Will contain results.txt and enum_headers.tsv.')
args = parser.parse_args()
system(targetp_tool + " " + org_flags[args.organism_group] + " " + args.fasta + " > " + args.output)
makedirs(args.output, exist_ok=True)
# Swap fasta headers for unique numbers to avoid truncation
reduce_headers_tool = path.dirname(__file__) + '/reduce_fasta_headers_to_enumeration.py'
subprocess.run([reduce_headers_tool, "-f", args.fasta, "-e", args.output + '/enum_headers.tsv'])
results_file = args.output + '/results.txt'
system(targetp_tool + " " + org_flags[args.organism_group] + " " + args.fasta + " > " + results_file)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment