Commit 63022ed7 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Merge branch 'release/0.1'

parents 4dcde2a9 28457915
[0.1]
* Implemented psot runner script
* Implemented psot configuration
* Implemented nextflow execution
* Added blastp vs swissprot module
* Added ghostx vs swissprot module
* Added signalp module
* Added exemplary profiles
The MIT License (MIT)
Copyright (c) 2017 SOaAS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
import os
import yaml
import copy
def get_install_location():
    """Return the tool's installation root: the parent of this script's directory."""
    return os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
def get_script_location():
    """Return the absolute path of the tool's 'bin' directory."""
    install_dir = get_install_location()
    return os.path.join(install_dir, 'bin')
def get_template_location():
    """Return the absolute path of the tool's 'templates' directory."""
    install_dir = get_install_location()
    return os.path.join(install_dir, 'templates')
def get_modules_location():
    """Return the absolute path of the tool's 'modules' directory."""
    install_dir = get_install_location()
    return os.path.join(install_dir, 'modules')
def get_profiles_location():
    """Return the absolute path of the tool's 'profiles' directory."""
    install_dir = get_install_location()
    return os.path.join(install_dir, 'profiles')
def get_config_file():
    """Return the absolute path of the tool's config.yaml."""
    install_dir = get_install_location()
    return os.path.join(install_dir, 'config.yaml')
def load_config_file():
    """Parse config.yaml from the install location and return its content.

    Returns whatever the YAML document contains (a dict in practice; None
    for an empty file, matching the previous behavior).
    """
    with open(get_config_file()) as f:
        # Fix: yaml.load without an explicit Loader can construct arbitrary
        # Python objects from tagged YAML and is deprecated; the config is
        # plain data, so safe_load is the correct call.
        return yaml.safe_load(f)
def get_module_manifests():
    """Return the absolute paths of all regular files in the modules directory."""
    modules_dir = get_modules_location()
    manifests = []
    for entry in os.listdir(modules_dir):
        candidate = os.path.join(modules_dir, entry)
        if os.path.isfile(candidate):
            manifests.append(candidate)
    return manifests
def create_app_config():
    """Return the application-path section of the aggregated configuration."""
    return {
        'app_path': get_install_location(),
        'bin_path': get_script_location(),
    }
def load_modules():
    """Read every module manifest and return a dict keyed by module name.

    Each manifest is a YAML data file; every returned module is guaranteed
    to have a 'parameters' mapping so downstream merging code does not need
    to special-case its absence.
    """
    modules = {}
    for manifest in get_module_manifests():
        with open(manifest) as f:
            # Fix: yaml.load without an explicit Loader is deprecated and can
            # execute arbitrary constructors; manifests are plain data.
            module = yaml.safe_load(f)
        if 'parameters' not in module:
            module['parameters'] = {}
        modules[module['name']] = module
    return modules
def normalize_profiles(profiles):
    """Normalize profile dicts in place.

    Guarantees every profile has a 'modules' entry and that every listed
    module maps to a (possibly empty) configuration dict instead of None.
    """
    for profile in profiles:
        if 'modules' not in profile:
            profile['modules'] = []
        modules = profile['modules']
        for name in modules:
            if modules[name] is None:
                modules[name] = {}
def merge_modules_in_profiles(config):
    """Merge the module configuration and the profile configuration.

    Replaces each profile's name->overrides mapping with a list of resolved
    module dicts. Profile values take precedence over module defaults; the
    global module definitions stay untouched (deep copies are merged).
    """
    modules = config['modules']
    for profile in config['profiles']:
        resolved = []
        for name, overrides in profile['modules'].items():
            merged = copy.deepcopy(modules[name])
            merged.update(overrides)
            resolved.append(merged)
        profile['modules'] = resolved
def load_config():
    """Build and return the aggregated configuration.

    Order matters here: modules must be attached before profiles are
    normalized and merged, because merge_modules_in_profiles resolves the
    module names listed in each profile against config['modules'].
    """
    config = load_config_file()
    config['modules'] = load_modules()
    config['app'] = create_app_config()
    normalize_profiles(config['profiles'])
    merge_modules_in_profiles(config)
    return config
if __name__ == "__main__":
    # Diagnostic mode: print the resolved configuration pieces.
    import config
    from pprint import pformat
    print('Install location: ' + config.get_install_location())
    # Fix: corrected user-facing typo 'lcoation' -> 'location'.
    print('Script location: ' + config.get_script_location())
    print('Config location: ' + config.get_config_file())
    print('Available module manifests:\n\t' + '\n\t'.join(config.get_module_manifests()) + '\n')
    print('Config content:\n' + pformat(config.load_config_file()) + '\n')
    print('Manifest content:\n' + pformat(config.load_modules()) + '\n')
    print('Aggregated config:\n' + pformat(config.load_config()) + '\n')
#!/usr/bin/python3
# Convert a BLAST tabular report (outfmt 7: tab-separated rows with '#'
# comment headers) into a JSON document keyed by query sequence id.
import sys
import json
import argparse
parser = argparse.ArgumentParser(description='Convert blast results to json documents')
parser.add_argument('--result', '-r', required=True, help='The blast result file in outfmt 7 format (tsv with headers)')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()
filename = args.result
# query id -> {"id": ..., "computations": [{"tool": ..., "results": [...]}]}
documents = {}
with open(filename) as f:
    header = None  # column name -> index, parsed from the '# Fields:' line
    tool = None    # tool metadata from the '# BLASTP' / '# Database' lines
    for line in f:
        line = line.strip()
        if line.startswith('#'):
            if line.startswith('# Fields:'):
                header_entries = line.replace('# Fields: ','').split(', ')
                header = {}
                for idx, key in enumerate(header_entries):
                    header[key] = idx
            if line.startswith('# BLASTP'):
                # third whitespace token is taken as the version string
                split = line.split()
                tool = {'name': 'blastp',
                        'version': split[2]}
            if line.startswith('# Database'):
                # NOTE(review): assumes the '# BLASTP' line appeared before
                # this one; tool is still None (AttributeError) otherwise.
                split = line.split()
                tool['database'] = split[2]
        else:
            # data row: one hit, columns in the order announced by '# Fields:'
            split = line.split("\t")
            if not split[0] in documents:
                documents[split[0]] = {"id": split[0], "computations": [{'tool':tool, 'results':[]}]}
            results = documents[split[0]]['computations'][0]['results']
            result = {}
            # subject id is expected as 'db|ACCESSION|name'; keep the accession
            result["dbxref"] = "UniProtKB/Swiss-Prot:"+split[header['subject id']].split("|")[1]
            # remaining fields are optional: only emitted when announced in the header
            if '% identity' in header:
                result["percent_identity"] = float(split[header['% identity']])
            if 'q. start' in header and 'q. end' in header:
                result['qloc'] = split[header['q. start']] + '-' + split[header['q. end']]
            if 's. start' in header and 's. end' in header:
                result['sloc'] = split[header['s. start']] + '-' + split[header['s. end']]
            if 'evalue' in header:
                result['evalue'] = float(split[header['evalue']])
            if 'BTOP' in header:
                result['btop'] = split[header['BTOP']]
            results.append(result)
output_filename = args.output
with open(output_filename, 'w') as o:
    json.dump(documents, o)
#!/usr/bin/python3
# Convert a ghostx result directory (info.json + results.tsv, as written by
# the ghostx wrapper script) into a JSON document keyed by query sequence id.
import sys
import json
import argparse
parser = argparse.ArgumentParser(description='Convert ghostx results to json documents')
parser.add_argument('--result', '-r', required=True, help='The ghostx result directory')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()
result_directory = args.result
info_filename = args.result + "/info.json"
result_filename = args.result + "/results.tsv"
# query id -> {"id": ..., "computations": [{"tool": ..., "results": [...]}]}
documents = {}
tool = None
# read tool info
with open(info_filename) as f:
    tool = json.load(f)
with open(result_filename) as f:
    for line in f:
        line = line.strip()
        split = line.split("\t")
        if not split[0] in documents:
            documents[split[0]] = {"id": split[0], "computations": [{'tool':tool, 'results':[]}]}
        results = documents[split[0]]['computations'][0]['results']
        result = {}
        # column layout assumed from the indices used here: 0 query, 1 subject,
        # 2 %identity, 6/7 query start/end, 8/9 subject start/end, 10 evalue
        # -- TODO confirm against the ghostx output format documentation.
        # subject id expected as 'db|ACCESSION|name'; keep the accession
        result["dbxref"] = "UniProtKB/Swiss-Prot:"+split[1].split("|")[1]
        result["percent_identity"] = float(split[2])
        result['qloc'] = split[6] + '-' + split[7]
        result['sloc'] = split[8] + '-' + split[9]
        result['evalue'] = float(split[10])
        results.append(result)
output_filename = args.output
with open(output_filename, 'w') as o:
    json.dump(documents, o)
#!/usr/bin/python3
# Convert a SignalP report into a JSON document keyed by sequence id.
import sys
import json
import argparse
parser = argparse.ArgumentParser(description='Convert signalp results to json documents')
parser.add_argument('--result', '-r', required=True, help='The signalp result file')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()
filename = args.result
# sequence id -> {"id": ..., "computations": [{"tool": ..., "results": [...]}]}
documents = {}
with open(filename) as f:
    tool = None  # filled from the '# SignalP' header line
    for line in f:
        if not line.startswith("#"):
            # data row; whitespace-separated, column 0 is the sequence id
            split = line.split()
            if not split[0] in documents:
                # NOTE(review): if a data row precedes the '# SignalP' header
                # line, tool is still None here.
                documents[split[0]] = {"id": split[0], "computations": [{'tool': tool, 'results':[]}]}
            results = documents[split[0]]['computations'][0]["results"]
            if split[9] == "Y":
                # assumed column meaning: [2] cleavage position, [8] score,
                # [9] Y/N decision -- TODO confirm against the SignalP
                # version/format actually used
                results.append({
                    'signalpeptide': True,
                    'score': float(split[8]),
                    'start':1,
                    'end': int(split[2])-1
                })
            else:
                results.append({'signalpeptide': False})
        else:
            if line.startswith('# SignalP'):
                # assumes a header token like 'SignalP-4.1' followed by the
                # prediction mode, e.g. 'euk'
                split = line.split()
                tool = {'name': 'SignalP',
                        'version': split[1].split('-')[1],
                        'mode': split[2]}
output_filename = args.output
with open(output_filename, 'w') as o:
    json.dump(documents, o)
#!/usr/bin/env python3
# Merge several result-JSON files into one document: computations belonging
# to the same sequence id are concatenated under a single entry.
import json
import argparse

parser = argparse.ArgumentParser(description='Join json documents')
parser.add_argument('jsons', metavar='N', nargs='+', help='json documents')
parser.add_argument('--output', '-o', required=True, help='The name of the output document')
args = parser.parse_args()

joined = {}
for path in args.jsons:
    with open(path) as handle:
        document = json.load(handle)
    for key in document:
        entry = joined.setdefault(key, {'id': key, 'computations': []})
        entry['computations'].extend(document[key]['computations'])

with open(args.output, 'w') as out:
    json.dump(joined, out)
from string import Template
import os.path
import os
# Nextflow process templates. string.Template placeholders (${...}) are
# filled from the module/execution dicts; '$$' escapes a literal '$' so
# that nextflow's own variables (e.g. $fasta) survive the substitution.

# One analysis process per module: runs the module's analysis_script on the
# input fasta channel and emits a raw results file.
analysis_template = Template ('''
process ${id} {
input:
file fasta from for_${id}
output:
file "$${fasta}.${id}.results" into ${id}_results
script:
"""
${analysis_script} --fasta $$fasta --output $${fasta}.${id}.results ${params}
"""
}
''')
# Converts one module's raw result file into the common json document format.
convert_template = Template ('''
process convert_${id}_to_json {
input:
file result from ${id}_results
output:
file "$${result}.json" into ${id}_json
script:
"""
${converter_script} --result $$result --output $${result}.json
"""
}
''')
# One input declaration per module for the join process below.
input_template = Template(''' file ${id}_result from ${id}_json''')
# Collects all per-module json files and joins them into one document.
join_jsons_template = Template('''
process join_documents {
input:
${inputs}
output:
file "joined.json" into joined_json
script:
"""
join_json_files.py --output joined.json *.json
"""
}
''')
# Splits the joined document into one json file per sequence and publishes
# them to the configured output directory.
split_jsons_template = Template('''
process split_documents {
publishDir "${output}", mode: 'copy'
input:
file "input/json.json" from joined_json
output:
file "*.json" into result_documents
script:
"""
split_json_into_separate_files.py --json 'input/json.json' --output .
"""
}
''')
def setup_execution_directory(execution):
    """Prepare the nextflow execution directory.

    Creates the directory if needed, writes the generated main.nf into it
    and symlinks the tool's bin directory so the pipeline scripts are found.
    """
    directory = execution['directory']
    if not os.path.exists(directory):
        os.mkdir(directory)
    if not os.path.isdir(directory):
        # Fix: the previous bare exit() terminated with status 0 and no
        # message, silently signalling success on a fatal setup error.
        raise SystemExit("Execution directory '" + directory + "' is not a directory")
    nextflow_script = generate_nextflow_script(execution)
    with open(directory + '/main.nf', 'w') as script_file:
        script_file.write(nextflow_script)
    if not os.path.exists(directory + '/bin'):
        os.symlink(execution['bin_path'], directory + '/bin')
def execute_analysis(execution):
    """Run the generated nextflow pipeline for the given execution plan."""
    # Run from inside the execution directory so nextflow's work files land
    # there; restore the previous working directory afterwards.
    old_cwd = os.getcwd()
    os.chdir(execution['directory'])
    # NOTE(review): command is built by string concatenation and run through
    # a shell -- paths containing spaces or shell metacharacters will break;
    # consider subprocess.run with an argument list.
    os.system('nextflow run ' + execution['directory'] + '/main.nf --fasta ' + execution['fasta'] + ' --output ' + execution['output'])
    os.chdir(old_cwd)
def generate_nextflow_script(execution):
    """Assemble the complete nextflow pipeline script for an execution plan."""
    modules = execution['modules']
    fragments = ['''params.fasta = "example/proteins.fas"
Channel.fromPath(params.fasta).set{fasta}''']
    # fan the fasta channel out into one channel per analysis module
    channel_names = ['for_' + module['id'] for module in modules]
    fragments.append('fasta.into{' + ';'.join(channel_names) + ';}')
    # one analysis process plus one converter process per module
    for module in modules:
        fragments.append(analysis_template.substitute(module))
        fragments.append(convert_template.substitute(module))
    # join all per-module json channels, then split into per-sequence files
    input_lines = [input_template.substitute(module) for module in modules]
    fragments.append(join_jsons_template.substitute({'inputs': '\n'.join(input_lines)}))
    fragments.append(split_jsons_template.substitute(execution))
    return '\n'.join(fragments)
#!/usr/bin/env python3
import argparse
import os
from config import load_config
import copy
import shutil
from nextflow import setup_execution_directory, execute_analysis
import tempfile
def main():
    """Entry point: build the CLI, parse arguments and dispatch a subcommand."""
    parser = argparse.ArgumentParser(description='Make bioinformatic observations on aminoacid sequences')
    subparsers = parser.add_subparsers()
    info_parser = subparsers.add_parser('info')
    info_parser.add_argument('--listanalyses', '-l', action='store_true', help='Show available analysis steps')
    info_parser.set_defaults(func=info)
    analyze_parser = subparsers.add_parser('analyze')
    analyze_parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
    analyze_parser.add_argument('--output', '-o', required=True, help='The output directory for the json documents')
    analyze_parser.add_argument('--profile', '-p', default='fast', help='The profile to use')
    analyze_parser.add_argument('--live', '-l', action='store_true', help='Report results as they are computed, not only at the end of the computation')
    analyze_parser.add_argument('--config', '-c', help='The config to use')
    analyze_parser.add_argument('--debug', '-d', action='store_true', help='Debug mode, computation directory will not be removed after computation')
    analyze_parser.add_argument('--execution_dir', '-e', help='Use the specified execution directory and do not delete it after the computation')
    analyze_parser.set_defaults(func=analyze)
    args = parser.parse_args()
    # Fix: when no subcommand is given, argparse leaves 'func' unset and the
    # dispatch crashed with AttributeError; print usage and exit instead.
    if not hasattr(args, 'func'):
        parser.print_help()
        parser.exit(2)
    config = load_config()
    args.func(args, config)
def info(args, config):
    """Handle the 'info' subcommand: list profiles and available modules."""
    show_analyses(config)
def analyze(args, config):
    """Handle the 'analyze' subcommand: plan, run and clean up one analysis."""
    plan = generate_execution(config, args)
    if args.debug:
        print(plan)
    setup_execution_directory(plan)
    execute_analysis(plan)
    cleanup(plan)
def cleanup(execution):
    """Remove the execution directory, unless debug mode asked to keep it."""
    if execution['debug']:
        return
    shutil.rmtree(execution['directory'])
def generate_execution(config, args):
    """Build the execution-plan dict for one analysis run from CLI args and config."""
    plan = {
        'debug': args.debug,
        'mode': 'live' if args.live else 'complete',
        'bin_path': config['app']['bin_path'],
        'fasta': os.path.abspath(args.fasta),
        'output': os.path.abspath(args.output),
    }
    # user-supplied directory is kept after the run; a temp dir is cleaned up
    if args.execution_dir:
        plan['directory'] = args.execution_dir
    else:
        plan['directory'] = tempfile.mkdtemp()
    plan['modules'] = generate_execution_modules_for_profile(config, args.profile)
    return plan
def generate_execution_modules_for_profile(config, profile):
    """Return deep copies of the named profile's modules, prepared for execution.

    Each module gets an 'id' (used to name nextflow channels/processes) and a
    'params' command-line string rendered from its parameters.
    Raises ValueError when the profile name is unknown.
    """
    # Fix: an unknown profile name previously died with a bare IndexError
    # from [...][0]; fail with a clear message instead.
    matches = [x for x in config['profiles'] if x['name'] == profile]
    if not matches:
        available = ', '.join(p['name'] for p in config['profiles'])
        raise ValueError("Unknown profile '" + profile + "'. Available profiles: " + available)
    modules = copy.deepcopy(matches[0]['modules'])
    # generate unique ids for each module
    for module in modules:
        module['id'] = module['name']
        module['params'] = generate_params_string(module['parameters'])
    return modules
def generate_params_string(options):
    """Render a parameter mapping as a command-line argument string.

    Truthy values become "--key 'value'"; falsy values (e.g. None for
    flag-style parameters) become a bare "--key". Returns '' when the
    mapping is empty or None.
    """
    if not options:
        return ''
    parts = []
    for key, value in options.items():
        if value:
            parts.append('--' + key + " '" + value + "'")
        else:
            parts.append('--' + key)
    return ' '.join(parts)
def show_analyses(config):
    """Print the available profiles (with their modules) and all known modules.

    config['modules'] is a dict mapping module name -> manifest dict (see
    load_modules), so the module listing iterates its values.
    """
    print('Profiles:')
    for profile in config['profiles']:
        print(' {0:<20} - {1}'.format(profile['name'], profile['info']))
        if 'modules' in profile:
            for module in profile['modules']:
                print(' {0:<20}'.format(module))
    print()
    print('Available modules for custom profile:')
    # Fix: config['modules'] is a name-keyed dict; iterating it directly
    # yields the key strings, and indexing a str with 'name' raised
    # TypeError. Iterate the manifest dicts instead.
    for module in config['modules'].values():
        print(' {0:<20} - {1}'.format(module['name'], module['info']))
# Script entry point.
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# Wrapper around blastp: search a protein fasta against a database and write
# a tabular (outfmt 7) report for the blast-to-json converter.
import argparse
import config
from os import system

# The blastp binary can be overridden via the 'tools' section of config.yaml.
blastp_tool = config.load_config()['tools'].get('blastp', 'blastp')

parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result file')
parser.add_argument('--evalue', '-e', default='0.0001', help='Evalue cutoff')
parser.add_argument('--alignment', '-a', action='store_true', help='Include alignments in btop format')
args = parser.parse_args()

# Tabular output with comment lines; the converter parses the '# Fields:' header.
format = '7 qseqid sseqid pident qstart qend sstart send evalue bitscore'
if args.alignment:
    format += ' btop'
# Fix: --evalue was parsed but never handed to blastp, so the declared cutoff
# had no effect; pass it through via blastp's -evalue option.
# NOTE(review): command is built by concatenation and run through a shell --
# paths with spaces/metacharacters will break; num_threads is hard-coded to 7.
system(blastp_tool + " -db " + args.database + " -num_threads 7 -evalue " + args.evalue + " -outfmt '" + format + "' -out " + args.output + " -query " + args.fasta)
#!/usr/bin/env python3
# Wrapper around ghostx: search a protein fasta against a database, writing a
# result directory with tool metadata (info.json) and hits (results.tsv).
import argparse
import config
import re
from os import system,makedirs
import subprocess
import json
# The ghostx binary can be overridden via the 'tools' section of config.yaml.
ghostx_tool = config.load_config()['tools'].get('ghostx', 'ghostx')
parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json and results.tsv.')
args = parser.parse_args()
# Approach:
# directory for output
# info.json -> Tool info
# result.json -> Tool result
toolconfig = {
    'name': 'ghostx',
    'database': args.database
}
# find version: running ghostx without arguments prints a banner to stderr
# that is expected to contain a 'version ...' line
output = subprocess.run([ghostx_tool], stderr=subprocess.PIPE)
text = output.stderr.decode('ascii')
result = re.search('version (.*)', text)
# NOTE(review): result is None (AttributeError on the next line) if the
# banner has no 'version' line -- confirm against the installed ghostx.
toolconfig['version'] = result.group(1)
makedirs(args.output, exist_ok=True)
with open(args.output + '/info.json', 'w') as f:
    json.dump(toolconfig, f)
# NOTE(review): command built by string concatenation and run via a shell;
# paths with spaces or shell metacharacters will break.
system(ghostx_tool + " aln -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
#!/usr/bin/env python3
# Wrapper around signalp: predict signal peptides in a protein fasta and
# store the raw report for the signalp-to-json converter.
import argparse
import config
from os import system
# The signalp binary can be overridden via the 'tools' section of config.yaml.
signalp_tool = config.load_config()['tools'].get('signalp', 'signalp')
parser = argparse.ArgumentParser(description='Find signal peptides in amino acid sequences')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--organism', '-o', choices=['euk', 'gram+', 'gram-'], default='euk', help='The organism type')
parser.add_argument('--output', required=True, help='The output file')
args = parser.parse_args()
# NOTE(review): the report is captured via shell redirection and the command
# is built by concatenation -- paths with spaces/metacharacters will break.
system(signalp_tool + " -t " + args.organism + " " + args.fasta + " > " + args.output)
#!/usr/bin/env python3
# Split a JSON object of documents into one file per top-level key, written
# into the given output directory.
import json
import argparse
import os
import os.path
parser = argparse.ArgumentParser(description='Split a json file with multiple documents into single files per document')
parser.add_argument('--json', '-j', required=True, help='json documents')
parser.add_argument('--output', '-o', required=True, help='The name of the output directory')
args = parser.parse_args()
# check if output directory exists and create it if not
output_dir = os.path.abspath(args.output)
if os.path.exists(output_dir) and not os.path.isdir(output_dir):
    print("Output " + output_dir + " exists and is not a directory")
    exit(1)
if not os.path.exists(output_dir):
    # NOTE(review): os.mkdir does not create missing parent directories
    os.mkdir(output_dir)
file = args.json
with open(file) as f:
    doc = json.load(f)
    for k in doc:
        # one file per document, named after its key (assumed filesystem-safe)
        output_filename = k + '.json'
        with open(output_dir + '/' + output_filename, 'w') as o:
            json.dump(doc[k], o)