Commit c1d124d9 authored by lmueller

merge develop into tmhmm

parents 458a3c11 5a3cc5ad
[0.2]
* Aligned output format for homology tools
* Implemented cluster execution with SGE
* Changed module manifest format to allow parameters for converters
* Hidden files in modules directory are ignored
[0.1]
* Implemented psot runner script
......
......@@ -18,6 +18,7 @@ Supported bioinformatic tools:
* blastp
* signalp
* ghostx
* targetp
Install nextflow:
......
tools:
signalp: '/vol/biotools/bin/signalp'
ghostx: '/vol/biotools/bin/ghostx'
targetp: '/vol/software/bin/targetp'
tmhmm: '/vol/biotools/bin/tmhmm'
......@@ -16,6 +16,7 @@ In order to run PSOT on your machine you need:
* signalp
* ghostx
* tmhmm
* targetp
* the bioinformatic databases you want to use
......
......@@ -11,8 +11,11 @@ info: 'hmmscan analysis against PFAM-A'
analysis:
script: 'run_hmmer.py'
parameters:
database: '/vol/biodb/pfam/Pfam-A.hmm'
database: '/vol/biodb/pfam-31/Pfam-A.hmm'
evalue: 1e-10
execution:
cluster:
chunksize: 200
# The name of the result-to-json converter script. Must take one parameter: the
# result file from the analysis script
......
# Module manifest for the targetp analysis
# The name of the module. Needed for the list-analyses option, for custom
# configurations and for custom profiles
name: 'targetp'
# Short description of the analysis
info: 'predict subcellular peptide location'
# The configuration of the script for the analysis step.
analysis:
# script must take a --fasta parameter
script: 'run_targetp.py'
# specify additional default configuration here
parameters:
# The configuration of the script for the json conversion step.
converter:
# script must take a --result parameter, which is the result from the analysis step
script: 'convert_targetp.py'
# specify additional default configuration here
parameters:
......@@ -5,3 +5,5 @@ modules:
blastp_swissprot:
alignment:
hmmer_pfam_a:
targetp:
organism_group: 'non-plant'
......@@ -6,3 +6,5 @@ modules:
blastp_swissprot:
ghostx_swissprot:
hmmer_pfam_a:
targetp:
organism_group: 'non-plant'
......@@ -25,6 +25,7 @@ def main():
analyze_parser.add_argument('--config', '-c', help='The config to use')
analyze_parser.add_argument('--debug', '-d', action='store_true', help='Debug mode, computation directory will not be removed after computation')
analyze_parser.add_argument('--execution_dir', '-e', help='Use the specified execution directory and do not delete it after the computation')
analyze_parser.add_argument('--use_cluster', '-C', action='store_true', help='Use compute cluster for execution')
analyze_parser.set_defaults(func=analyze)
args = parser.parse_args()
......@@ -53,6 +54,7 @@ def cleanup(execution):
def generate_execution(config, args):
execution = {}
execution['debug'] = args.debug
execution['use_cluster'] = args.use_cluster
execution['mode'] = 'live' if args.live else 'complete'
execution['bin_path'] = config['app']['bin_path']
execution['script_path'] = os.path.abspath(os.path.join(os.path.dirname(config['app']['bin_path']), 'scripts'))
......@@ -62,7 +64,10 @@ def generate_execution(config, args):
if args.execution_dir:
execution['directory'] = os.path.abspath(args.execution_dir)
else:
execution['directory'] = tempfile.mkdtemp()
if args.use_cluster:
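# Assumption: /vol/sge-tmp lies on a filesystem shared with the SGE compute nodes, so cluster jobs can reach the work files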
execution['directory'] = tempfile.mkdtemp(dir='/vol/sge-tmp')
else:
execution['directory'] = tempfile.mkdtemp()
execution['modules'] = generate_execution_modules_for_profile(config, args.profile)
return execution
......
......@@ -17,8 +17,10 @@ def flatten(d, parent_key='', sep='_'):
analysis_template = Template ('''
process ${id} {
executor '${executor}'
${clusterOptions}
input:
file fasta from for_${id}
file fasta from for_${id}${chunks}
output:
file "$${fasta}.${id}.results" into ${id}_results
......@@ -89,7 +91,7 @@ process fetch_dbxrefs {
"""
}
''')
input_template = Template(''' file ${id}_result from ${id}_json''')
input_template = Template(''' file ${id}_result from ${id}_json.collect()''')
join_jsons_template = Template('''
process join_documents {
......@@ -155,7 +157,17 @@ Channel.fromPath(params.fasta).set{fasta}''')
fragments.append('fasta.into{'+';'.join(target_channels)+';}')
for m in modules:
fragments.append(analysis_template.substitute(flatten(m)))
config = flatten(m)
if execution['use_cluster']:
config['executor'] = 'sge'
config['chunks'] = ".splitFasta(by:300, file:'input')"
config['clusterOptions'] = "clusterOptions='-S /bin/bash'"
else:
config['executor'] = 'local'
config['chunks'] = ''
config['clusterOptions'] = ''
fragments.append(analysis_template.substitute(config))
if execution['mode'] == 'live':
fragments.append(convert_live_template.substitute(flatten(m)))
copy = deepcopy(m)
......
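# --- Illustration only, not part of the diff: a minimal sketch of how the cluster/local
# --- settings above fill the Nextflow process template. The module id 'hmmer_pfam_a' and
# --- the shortened template are assumptions made for this example.
from string import Template

example_template = Template("process ${id} {\n"
                            "    executor '${executor}'\n"
                            "    ${clusterOptions}\n"
                            "    input:\n"
                            "    file fasta from for_${id}${chunks}\n"
                            "}")
example_config = {'id': 'hmmer_pfam_a'}
use_cluster = True
if use_cluster:
    # SGE execution: split the input fasta into 300-sequence chunks and pass scheduler options
    example_config.update(executor='sge',
                          chunks=".splitFasta(by:300, file:'input')",
                          clusterOptions="clusterOptions='-S /bin/bash'")
else:
    example_config.update(executor='local', chunks='', clusterOptions='')
print(example_template.substitute(example_config))
# Rendered output with use_cluster=True:
#   process hmmer_pfam_a {
#       executor 'sge'
#       clusterOptions='-S /bin/bash'
#       input:
#       file fasta from for_hmmer_pfam_a.splitFasta(by:300, file:'input')
#   }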
#!/usr/bin/python3
import sys
import json
import argparse
from os import path
import subprocess
parser = argparse.ArgumentParser(description='Convert ghostx results to json documents')
parser.add_argument('--result', '-r', required=True, help='The ghostx result directory')
......@@ -55,3 +58,7 @@ with open(result_filename) as f:
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
# Replace sequences' enumerated ids with their original ids
restore_seq_ids_tool = path.dirname(__file__) + '/restore_seq_id_from_enumeration.py'
subprocess.run([restore_seq_ids_tool, '-j', output_filename, '-e', args.result + '/enum_headers.tsv'])
......@@ -9,6 +9,11 @@ parser.add_argument('--output', '-o', required=True, help='The converted results
parser.add_argument('--dbxref', '-d', required=True, help='The dbxref prefix that will be prepended to the accession or id')
args = parser.parse_args()
query_file = args.result + "/queries.json"
queries = []
with open(query_file) as f:
queries = json.load(f)
filename = args.result + "/domtblout.tsv"
documents = {}
tool = {'name': None, 'version': None, 'database': None}
......@@ -30,7 +35,11 @@ with open(filename) as f:
query_id = split[3]
if not query_id in documents:
documents[query_id] = {"id": query_id, "computations": [{'tool': tool, 'results':[]}]}
results = documents[query_id]['computations'][0]["results"]
try:
queries.remove(query_id)
except ValueError:
print('HMMER converter: Query ID "' + query_id + '" not found among initial queries!')
results = documents[query_id]['computations'][0]["results"]
results.append({
'target': {
......@@ -48,6 +57,10 @@ with open(filename) as f:
}
})
for query_id in queries:
if not query_id in documents:
documents[query_id] = {"id": query_id, "computations": [{'tool': tool, 'results':[]}]}
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
#!/usr/bin/python3
import sys
import json
import argparse
from os import path
import subprocess
parser = argparse.ArgumentParser(description='Convert targetp results to json document')
parser.add_argument('--result', '-r', required=True, help='The targetp results directory')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()
loc_dict = {"C": "chloroplast", "M": "mitochondrion", "S": "secretory pathway", "_": "other", "*": "undetermined"}
filename = args.result + '/results.txt'
documents = {}
with open(filename) as f:
tool = None
field_index = {"cTP": None, "mTP": None, "SP": None, "Loc": None}
is_datasection = False
for line in f:
if line.startswith("---------"):
is_datasection = not is_datasection
elif is_datasection:
split = line.split()
if not split[0] in documents:
documents[split[0]] = {"id": split[0], "computations": [{'tool': tool, 'results':[{}]}]}
results = documents[split[0]]['computations'][0]["results"][0]
for field in field_index:
if field_index[field] is not None:
if field == "Loc":
results[field.lower()] = loc_dict[split[field_index[field]]]
else:
results[field.lower()] = float(split[field_index[field]])
else:
if line.startswith('Name '):
split = line.split()
for field in field_index:
try:
field_index[field] = split.index(field)
except ValueError:
print('TargetP Converter: No column "' + field + '" found in table!')
elif line.startswith('### targetp '):
split = line.split()
tool = {'name': 'TargetP',
'version': split[2].replace("v", ""),
'organism_group': 'plant'}
elif ' NON-PLANT ' in line:
tool['organism_group'] = 'non-plant'
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
# Replace sequences' enumerated ids with their original ids
restore_seq_ids_tool = path.dirname(__file__) + '/restore_seq_id_from_enumeration.py'
subprocess.run([restore_seq_ids_tool, '-j', output_filename, '-e', args.result + '/enum_headers.tsv'])
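# Illustration only, not part of the diff: how the column indices are resolved from a
# hypothetical TargetP header line (the real column layout may differ).
example_header = "Name                  Len    cTP   mTP    SP  other  Loc  RC"
example_split = example_header.split()
example_index = {field: example_split.index(field) for field in ("cTP", "mTP", "SP", "Loc")}
# example_index == {'cTP': 2, 'mTP': 3, 'SP': 4, 'Loc': 6}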
......@@ -10,62 +10,62 @@ parser.add_argument('--output', '-o', required=True, help='The converted results
args = parser.parse_args()
def parse_topology(s, l):
elements = []
result = []
flag = False
num = ""
for c in s:
if c == 'i' or c == 'o' or c == '-':
if num != "":
elements.append(int(num))
num = ""
if c != '-':
elements.append(c)
else:
num += c
if num != "":
elements.append(int(num))
for i in range(len(elements)):
if flag:
flag = False
else:
if elements[i] == 'i' or elements[i] == 'o':
location = ""
if elements[i] == 'i':
location = "inside"
else:
location = "outside"
if i == 0:
if len(elements) == 1:
result.append({'start': 1, 'end': l , 'location': location})
else:
result.append({'start': 1, 'end': elements[i+1]-1 , 'location': location})
elif i+1 == len(elements):
result.append({'start': elements[i-1]+1, 'end':l , 'location': location})
else:
result.append({'start': elements[i-1]+1, 'end': elements[i+1]-1 , 'location': location})
else:
result.append({'start': elements[i], 'end': elements[i+1] , 'location': 'membrane'})
flag = True
return (result)
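# Illustration only, not part of the diff: a hypothetical TMHMM topology string expanded
# into explicit regions by parse_topology above.
example_regions = parse_topology("i7-29o44-66i", 100)
# example_regions ==
# [{'start': 1,  'end': 6,   'location': 'inside'},
#  {'start': 7,  'end': 29,  'location': 'membrane'},
#  {'start': 30, 'end': 43,  'location': 'outside'},
#  {'start': 44, 'end': 66,  'location': 'membrane'},
#  {'start': 67, 'end': 100, 'location': 'inside'}]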
filename = args.result
documents = {}
with open(filename) as f:
for line in f:
line_dic = {}
elements = line.rstrip().split("\t")
for i in range(1, len(elements)):
key = elements[i][:elements[i].rfind('=')]
value = elements[i][elements[i].rfind('=')+1:]
if key == 'Topology':
line_dic['topology'] = parse_topology(value, int(elements[i-4][elements[i-4].rfind('=')+1:]))
elif key == 'PredHel':
line_dic[key] = int(value)
elif not key == 'len':
line_dic[key] = float(value)
documents[elements[0]] = {'computations': [{'tool': {'name': 'TMHMM', 'version': '2.0'}, 'results': line_dic}]}
output_filename = args.output
with open(output_filename, 'w') as o:
json.dump(documents, o)
#!/usr/bin/python3
import argparse
import fileinput
parser = argparse.ArgumentParser(description='Replaces fasta headers with unique numbers and saves a dictionary of both in tsv format. Caution: The original fasta file gets replaced in the process.')
parser.add_argument('--fasta', '-f', required=True, help='The fasta file')
parser.add_argument('--enum-headers', '-e', required=True, help='File to store enumerated headers in tsv format')
args = parser.parse_args()
fasta = args.fasta
headers_dict = {}
num = 1
with fileinput.FileInput(fasta, inplace=True) as f:
for line in f:
if line.startswith(">"):
header = line.strip().lstrip('>')
headers_dict[num] = header
print(">{}".format(num))
num += 1
else:
print(line, end='')
enum_headers_file = args.enum_headers
with open(enum_headers_file, 'w') as o:
for key in headers_dict:
o.write("{}\t{}\n".format(key, headers_dict[key]))
#!/usr/bin/python3
import json
import argparse
parser = argparse.ArgumentParser(description='Replace enumerated id of sequences with original identifier. Caution: The original json file gets replaced in the process.')
parser.add_argument('--json', '-j', required=True, help='The results json file')
parser.add_argument('--enum-headers', '-e', required=True, help='The enumerated original headers in tsv format')
args = parser.parse_args()
seq_id_dict = {}
docs_enumerated = {}
with open(args.json) as j:
docs_enumerated = json.load(j)
with open(args.enum_headers) as h:
for line in h:
num, header = line.strip().split('\t', 1)
seq_id_dict[num] = header.split()[0]
documents_restored = {}
for num in docs_enumerated:
seq_id = seq_id_dict[num]
doc = docs_enumerated[num]
doc["id"] = seq_id
documents_restored[seq_id] = doc
with open(args.json, 'w') as o:
json.dump(documents_restored, o)
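# Illustration only, not part of the diff: the round trip between the two helper scripts,
# using hypothetical headers.
# enum_headers.tsv as written by reduce_fasta_headers_to_enumeration.py (tab separated):
#   1   WP_011291394.1 DNA polymerase
#   2   WP_003721133.1 helicase
example_headers = {"1": "WP_011291394.1 DNA polymerase", "2": "WP_003721133.1 helicase"}
# A converter produces documents keyed by the enumerated ids:
example_docs = {"1": {"id": "1", "computations": []}}
# restore_seq_id_from_enumeration.py re-keys them by the first token of the original header:
restored = {}
for num, doc in example_docs.items():
    seq_id = example_headers[num].split()[0]
    doc["id"] = seq_id
    restored[seq_id] = doc
assert restored == {"WP_011291394.1": {"id": "WP_011291394.1", "computations": []}}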
#!/usr/bin/env python3
import env
import argparse
import re
from os import system,makedirs
from os import system,makedirs,path
from psot import config
import subprocess
import json
......@@ -12,9 +13,15 @@ ghostx_tool = config.load_config()['tools'].get('ghostx', 'ghostx')
parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json and results.tsv.')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json, results.tsv and enum_headers.tsv.')
args = parser.parse_args()
makedirs(args.output, exist_ok=True)
# Swap fasta headers for unique numbers so ghostx does not have to deal with complex headers
reduce_headers_tool = path.dirname(__file__) + '/reduce_fasta_headers_to_enumeration.py'
subprocess.run([reduce_headers_tool, "-f", args.fasta, "-e", args.output + '/enum_headers.tsv'])
# Approach:
# directory for output
# info.json -> Tool info
......@@ -26,11 +33,10 @@ toolconfig = {
}
# find version
output = subprocess.run([ghostx_tool], stderr=subprocess.PIPE)
text =output.stderr.decode('ascii')
text = output.stderr.decode('ascii')
result = re.search('version (.*)', text)
toolconfig['version'] = result.group(1)
makedirs(args.output, exist_ok=True)
with open(args.output + '/info.json', 'w') as f:
json.dump(toolconfig, f)
system(ghostx_tool + " aln -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
#!/usr/bin/env python3
import env
import argparse
import json
from psot import config
from os import system, makedirs
......@@ -17,8 +18,20 @@ makedirs(args.output, exist_ok=True)
system(hmmscan_tool +
" -E " + args.evalue +
" -o " +args.output+ "/hmmscan.out " +
" -o " + args.output + "/hmmscan.out " +
" --tblout " + args.output + "/tblout.tsv " +
" --domtblout " + args.output + "/domtblout.tsv " +
" --pfamtblout " + args.output + "/pfamtblout.tsv " +
args.database + " " + args.fasta)
# Provide a list of all query sequence names for the conversion process
queries = []
with open(args.fasta) as f:
for line in f:
if line.startswith('>'):
queries.append(line.split()[0].strip().lstrip('>'))
query_file = args.output + '/queries.json'
with open(query_file, 'w') as o:
json.dump(queries, o)
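# Illustration only, not part of the diff: the header extraction above applied to
# hypothetical fasta lines.
example_lines = [">WP_011291394.1 DNA polymerase", "MSTN", ">WP_003721133.1"]
example_queries = [line.split()[0].strip().lstrip('>') for line in example_lines if line.startswith('>')]
assert example_queries == ["WP_011291394.1", "WP_003721133.1"]
# queries.json would then contain: ["WP_011291394.1", "WP_003721133.1"]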
#!/usr/bin/env python3
import env
import argparse
from psot import config
from os import system,makedirs,path
import subprocess
targetp_tool = config.load_config()['tools'].get('targetp', 'targetp')
org_flags = {'plant': '-P', 'non-plant': '-N'}
parser = argparse.ArgumentParser(description='Determine subcellular locations of eukaryotic amino acid sequences')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with amino acid sequences')
parser.add_argument('--organism_group', choices=org_flags.keys(), required=True, help='Define whether to use plant/non-plant networks')
parser.add_argument('--output', required=True, help='The result directory. Will contain results.txt and enum_headers.tsv.')
args = parser.parse_args()
makedirs(args.output, exist_ok=True)
# Swap fasta headers for unique numbers to avoid truncation
reduce_headers_tool = path.dirname(__file__) + '/reduce_fasta_headers_to_enumeration.py'
subprocess.run([reduce_headers_tool, "-f", args.fasta, "-e", args.output + '/enum_headers.tsv'])
results_file = args.output + '/results.txt'
system(targetp_tool + " " + org_flags[args.organism_group] + " " + args.fasta + " > " + results_file)
......@@ -9,18 +9,6 @@ tmhmm_tool = config.load_config()['tools'].get('tmhmm', 'tmhmm')
parser = argparse.ArgumentParser(description='Find transmembrane helices in amino acid sequences')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
#parser.add_argument('--workdir', '-workdir', help='Working directory')
#parser.add_argument('--wwwdir', '-wwwdir', help='The place where the www server looks for files')
#parser.add_argument('--serverhome', '-serverhome', help='') #
#parser.add_argument('--basedir', '-basedir', help='basis directory for TMHMM package')
#parser.add_argument('--bindir', '-bindir', help='Bin directory (defaults basedir/bin)')
#parser.add_argument('--scrdir', '-scrdir', help='Script directory (defaults basedir/bin)')
#parser.add_argument('--libdir', '-libdir', help='Library directory (defaults basedir/lib)')
#parser.add_argument('--html', '-html', help='Produce HTML output')
#parser.add_argument('--short', '-s', help='Short output format')
#parser.add_argument('--plot', '-p', help='Produce graphics')
#parser.add_argument('--version1', '-v1', help='Use old model (version 1)')
#parser.add_argument('--debugging', '-d', help='') #
parser.add_argument('--output', required=True, help='The output file')
args = parser.parse_args()
......