Commit 0025df27 authored by Marc


reduce_headers* and restore_headers* helper scripts now copy the input file instead of overwriting it; the named scripts got an extra nextflow process and are no longer included in runscripts; nextflow splitFasta now happens at the beginning; NOTE: psot currently does not run in '-i' or '-l' mode
parent 00b599a0
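The common thread in the helper changes below is a copy-then-edit pattern: rather than rewriting the caller's fasta in place, each script now copies it and edits the copy. A minimal generic sketch of the pattern (file names here are illustrative, not taken from the diff):

import fileinput
from shutil import copyfile

copyfile('input.fasta', 'input_normalized.fasta')           # work on a copy
with fileinput.FileInput('input_normalized.fasta', inplace=True) as f:
    for line in f:
        print(line, end='')  # with inplace=True, stdout is redirected into the copy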
databases:
localpath: '/usr/lib/dbs'
containerpath: '/databases'
tools:
# Replace tool paths if they are located elsewhere
signalp: 'signalp'
......
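For orientation, the scripts resolve entries from this file roughly as shown in find_tool_path() further down; a minimal sketch assuming PyYAML and a hypothetical config file name:

import yaml

with open('psot_config.yaml') as f:                  # hypothetical file name
    cfg = yaml.safe_load(f)

db_root = cfg['databases']['localpath']              # '/usr/lib/dbs'
signalp = cfg['tools'].get('signalp', 'signalp')     # bare tool name as fallback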
@@ -11,10 +11,11 @@ info: 'blastp analysis against swissprot'
analysis:
script: 'run_blastp.py'
parameters:
database: '/vol/biodb/uniprot/uniprot_sprot.fasta'
database: '/home/ubuntu/db/uniprot_sprot.fasta'
evalue: 1e-10
use_accession:
container:
docker: 'blastp'
# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
......
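A hedged sketch of how a runner such as run_blastp.py could map the parameters block above onto a BLAST+ command line; the flags are standard blastp options, while the input file name is an assumption:

import subprocess

params = {'database': '/home/ubuntu/db/uniprot_sprot.fasta', 'evalue': '1e-10'}
subprocess.run(['blastp',
                '-query', 'input.fasta',     # assumed input name
                '-db', params['database'],
                '-evalue', params['evalue'],
                '-outfmt', '6'],             # tabular output, one common choice
               check=True)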
@@ -11,7 +11,7 @@ info: 'ghostx analysis against swissprot'
analysis:
script: 'run_ghostx.py'
parameters:
database: '/vol/biodb/ghostx/uniprot_sprot'
database: '/home/ubuntu/db/uniprot_sprot.fasta'
container:
# The name of the result to json converter script. Must take one parameter, the
......
@@ -11,7 +11,7 @@ info: 'hmmscan analysis against PFAM-A'
analysis:
script: 'run_hmmer.py'
parameters:
database: '/vol/biodb/pfam-31/Pfam-A.hmm'
database: '/home/ubuntu/db/Pfam-A.hmm'
evalue: 1e-10
execution:
cluster:
......
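Likewise for the hmmscan module: one plausible invocation using standard HMMER options, with the configured database and e-value cutoff (input and output names assumed):

import subprocess

subprocess.run(['hmmscan',
                '--domtblout', 'results.domtbl',   # assumed output name
                '-E', '1e-10',                     # the configured evalue
                '/home/ubuntu/db/Pfam-A.hmm',
                'input.fasta'],                    # assumed input name
               check=True)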
@@ -3,3 +3,5 @@ info: 'Profile that contains tools for testing and development'
modules:
signalp:
organism: 'euk'
targetp:
organism_group: 'non-plant'
\ No newline at end of file
@@ -4,19 +4,10 @@ import argparse
from os import system,makedirs,path,environ
import subprocess
def find_tool_path():
from psot import config
cfg = config.load_config()
return cfg['tools'].get('targetp', 'targetp'), cfg['helpers_path'] + '/reduce_fasta_headers_to_enumeration.py'
targetp_tool = ''
reduce_headers_tool =''
if 'INSIDE_CONTAINER' in environ:
targetp_tool = 'targetp'
reduce_headers_tool = 'reduce_fasta_headers_to_enumeration.py'
else:
targetp_tool, reduce_headers_tool = find_tool_path()
targetp_tool = 'targetp'
org_flags = {'plant': '-P', 'non-plant': '-N'}
@@ -28,8 +19,5 @@ args = parser.parse_args()
makedirs(args.output, exist_ok=True)
# Swap fasta headers for unique numbers to avoid truncation
subprocess.run([reduce_headers_tool, "-f", args.fasta, "-e", args.output + '/enum_headers.tsv'])
results_file = args.output + '/results.txt'
system(targetp_tool + " " + org_flags[args.organism_group] + " " + args.fasta + " > " + results_file)
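The retained call builds a shell string by concatenation through os.system. Since the script already imports subprocess, a quoting-safe equivalent would be (a sketch, not part of this commit):

with open(results_file, 'w') as out:
    subprocess.run([targetp_tool, org_flags[args.organism_group], args.fasta],
                   stdout=out, check=True)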
@@ -2,17 +2,20 @@
import argparse
import fileinput
from os.path import splitext
from shutil import copyfile
parser = argparse.ArgumentParser(description='Replaces fasta headers with unique numbers and saves a dictionary of both in tsv format. Caution: The original fasta file gets replaced in the process.')
parser = argparse.ArgumentParser(description='Replaces fasta headers with unique numbers and saves a dictionary of both in tsv format.')
parser.add_argument('--fasta', '-f', required=True, help='The fasta file')
parser.add_argument('--enum-headers', '-e', required=True, help='File to store enumerated headers in tsv format')
args = parser.parse_args()
fasta = args.fasta
normalized_fasta = splitext(args.fasta)[0] + '_normalized.fasta'
copyfile(args.fasta, normalized_fasta)
headers_dict = {}
num = 1
with fileinput.FileInput(fasta, inplace=True) as f:
with fileinput.FileInput(normalized_fasta, inplace=True) as f:
for line in f:
if line.startswith(">"):
header = line.strip().lstrip('>')
......
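The hunk ends inside the header branch; the rest of the loop presumably emits the enumerated header and records the mapping before the tsv is written. A hypothetical completion, for illustration only:

            headers_dict[num] = header    # remember the original header
            print('>{}'.format(num))      # inplace edit: replaces the header line
            num += 1
        else:
            print(line, end='')           # sequence lines pass through unchanged

with open(args.enum_headers, 'w') as tsv: # persist the num -> header mapping
    for n, h in headers_dict.items():
        tsv.write('{}\t{}\n'.format(n, h))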
@@ -2,8 +2,9 @@
import json
import argparse
from os.path import splitext
parser = argparse.ArgumentParser(description='Replace enumerated id of sequences with original identifier. Caution: The original json file gets replaced in the process.')
parser = argparse.ArgumentParser(description='Replace enumerated id of sequences with original identifier.')
parser.add_argument('--json', '-j', required=True, help='The results json file')
parser.add_argument('--enum-headers', '-e', required=True, help='The enumerated original headers in tsv format')
args = parser.parse_args()
@@ -27,5 +28,5 @@ for num in docs_enumerated:
doc["id"] = seq_id
documents_restored[seq_id] = doc
with open(args.json, 'w') as o:
with open(splitext(args.json)[0] + '_restored_headers.json', 'w') as o:
json.dump(documents_restored, o)
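For reference, the tsv produced by the reducer could be read back into a lookup on this side roughly like this (the actual parsing code sits above this hunk and is not shown):

headers = {}
with open(args.enum_headers) as tsv:
    for line in tsv:
        num, header = line.rstrip('\n').split('\t', 1)
        headers[num] = header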
@@ -71,6 +71,7 @@ def generate_execution(config, args):
execution['fasta'] = os.path.abspath(args.fasta)
execution['output'] = os.path.abspath(args.output)
execution['install_path'] = config['install_path']
execution['helpers_path'] = config['helpers_path']
execution['docker'] = args.docker
execution['singularity'] = args.singularity
if 'venv' in config:
......
@@ -15,14 +15,30 @@ def flatten(d, parent_key='', sep='_'):
items.append((new_key, v))
return dict(items)
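Only the tail of flatten() is visible in this hunk; its effect is to collapse a nested module config into underscore-joined keys that the templates below can substitute directly, e.g.:

flatten({'id': 'blastp', 'analysis': {'parameters': {'evalue': '1e-10'}}})
# -> {'id': 'blastp', 'analysis_parameters_evalue': '1e-10'}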
normalizing_fasta_template = Template('''
process normalizing_fasta {
input:
file fasta from for_normalization
output:
set file("$${fasta.baseName}_normalized.fasta"), file("$${fasta.baseName}_enum_headers.tsv") into for_analysis
script:
"""
${helpers_path}/reduce_fasta_headers_to_enumeration.py -f $$fasta -e $${fasta.baseName}_enum_headers.tsv
"""
}
''')
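A note on the escaping in these fragments: ${...} placeholders are filled by Python's string.Template, while $$ renders as a literal $ so that Nextflow expressions such as ${fasta.baseName} survive substitution. Only the Python-side keys need to be supplied, e.g.:

# illustrative path; in the real flow the execution dict provides helpers_path
print(normalizing_fasta_template.substitute({'helpers_path': '/opt/psot/helpers'}))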
analysis_template = Template ('''
process ${id} {
input:
file fasta from for_${id}${chunks}
set file(fasta), file(headers) from for_${id}
output:
file "$${fasta}.${id}.results" into ${id}_results
set file("$${fasta}.${id}.results"), file(headers) into ${id}_results
script:
"""
@@ -81,10 +97,10 @@ convert_template = Template ('''
process convert_${id}_to_json {
input:
file result from ${id}_results
set file(result), file(headers) from ${id}_results
output:
file "$${result}.json" into ${id}_json
set file("$${result}.json"), file(headers) into ${id}_restore_headers
script:
"""
@@ -92,6 +108,23 @@ process convert_${id}_to_json {
"""
}
''')
restore_headers_json_template = Template('''
process ${id}_restore_headers_json {
input:
set file(result), file(headers) from ${id}_restore_headers
output:
file "$${result.baseName}_restored_headers.json" into ${id}_json
script:
"""
${helpers_path}/restore_seq_id_from_enumeration.py -j $$result -e $$headers
"""
}
''')
retrieve_informations_template = Template('''
process retrieve_informations_for_${id} {
@@ -164,13 +197,21 @@ analysis_config_template = Template('''
}
'''
)
beforeScript_config_template = Template('''
beforeScript_modul_config_template = Template('''
withName:${process_names}{
${beforeScript}
}
'''
)
beforeScript_norm_config_template = Template('''
withName:normalizing_fasta{
${beforeScript}
}
'''
)
def setup_execution_directory(execution):
directory = execution['directory']
if not os.path.exists(directory):
@@ -201,19 +242,21 @@ def generate_nextflow_script(execution):
modules = execution['modules']
fragments = []
fragments.append('''params.fasta = "'''+execution['fasta']+'''"
Channel.fromPath(params.fasta).set{fasta}''')
fragments.append('''params.fasta = "'''+execution['fasta']+'''"''')
if execution['use_cluster']:
fragments.append('''for_normalization = Channel.fromPath(params.fasta).splitFasta(by:300, file:'input')''')
else:
fragments.append('''for_normalization = Channel.fromPath(params.fasta)''')
fragments.append(normalizing_fasta_template.substitute(execution))
target_channels = ["for_"+m['id'] for m in modules]
fragments.append('fasta.into{'+';'.join(target_channels)+';}')
fragments.append('for_analysis.into{'+';'.join(target_channels)+';}')
for m in modules:
config = flatten(m)
config['output'] = execution['output']
if execution['use_cluster']:
config['chunks'] = ".splitFasta(by:300, file:'input')"
else:
config['chunks'] = ''
fragments.append(analysis_template.substitute(config))
if execution['mode'] == 'live' and not execution['fetch_informations']:
@@ -228,7 +271,10 @@ Channel.fromPath(params.fasta).set{fasta}''')
fragments.append(retrieve_informations_template.substitute(config))
else:
fragments.append(convert_template.substitute(config))
config['helpers_path'] = execution['helpers_path']
fragments.append(restore_headers_json_template.substitute(config))
json_inputs = []
for m in modules:
json_inputs.append(input_template.substitute(m))
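The tail of generate_nextflow_script falls outside the hunk; presumably the collected fragments are joined and written into the execution directory, along these lines (file name assumed):

nextflow_script = '\n'.join(fragments)
with open(os.path.join(execution['directory'], 'main.nf'), 'w') as f:  # assumed name
    f.write(nextflow_script)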
@@ -274,15 +320,18 @@ def generate_nextflow_config(execution):
config['beforeScript'] = "beforeScript = 'export PS1=; source " + execution['venv'] + "/bin/activate'"
if execution['fetch_informations']:
config['process_names'] = "'" + Template('convert_${id}_to_json|retrieve_informations_for_${id}').substitute(config) + "'"
config['process_names'] = "'" + Template('convert_${id}_to_json|${id}_restore_headers_json|retrieve_informations_for_${id}').substitute(config) + "'"
else:
config['process_names'] = Template('convert_${id}_to_json').substitute(config)
config['process_names'] = "'" + Template('convert_${id}_to_json|${id}_restore_headers_json').substitute(config) + "'"
fragments.append(analysis_config_template.substitute(config))
fragments.append(beforeScript_config_template.substitute(config))
fragments.append(beforeScript_modul_config_template.substitute(config))
else:
config['beforeScript'] = ''
fragments.append(analysis_config_template.substitute(config))
if config['beforeScript']:
fragments.append(beforeScript_norm_config_template.substitute(config))
fragments.append('''}''')
......
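Substituting the non-fetching branch for a module with id 'blastp' would render roughly:

print(beforeScript_modul_config_template.substitute({
    'process_names': "'convert_blastp_to_json|blastp_restore_headers_json'",
    'beforeScript': "beforeScript = 'export PS1=; source venv/bin/activate'",  # illustrative venv path
}))
# withName:'convert_blastp_to_json|blastp_restore_headers_json'{
#     beforeScript = 'export PS1=; source venv/bin/activate'
# }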