Commit c4d8ffab authored by Lukas Jelonek

Implement transformation step support

Implement create blastdbs
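With this commit, `prepare` first ensures the raw download exists and then runs the tool-specific transformation recipe in a temporary directory; `--force_rebuild` replaces an existing build. A likely invocation, assuming the `prepare` subcommand takes the database and tool as positional arguments (as `args.database` and `args.tool` suggest):

    dbman prepare swissprot blast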

parent 40b823f2
@@ -6,6 +6,8 @@ import re
 import json
 import datetime
 import logging
+import sys
+import os

 def download(url):
     logging.info("Downloading " + url)
@@ -39,7 +41,36 @@ def create_metadata(name,
         'version': version
     }
     metadata.update(other)
-    with open("metadata.json", "w") as metadata_file:
+    save_metadata(metadata)
+
+def save_metadata(metadata, path="./metadata.json"):
+    with open(path, "w") as metadata_file:
         logging.info("Writing metadata " + json.dumps(metadata))
         json.dump(metadata, metadata_file)
+
+def load_metadata(path="./metadata.json"):
+    with open(path) as f:
+        return json.load(f)
+
+def run_external_tool(path, params=[]):
+    """Runs the external tool and captures stdout, stderr and exitcode in files"""
+    import subprocess
+    with open(".stdout", "w") as out, open(".stderr", "w") as err, open(".exitcode", "w") as exit:
+        if params:
+            path = [path]
+            path.extend(params)
+        logging.debug("Executing '%s'", path)
+        cp = subprocess.run(path, stdout=out, stderr=err)
+        print(cp.returncode, file=exit)
+        return cp.returncode
+
+def to_absolute_paths(metadata):
+    if 'location' in metadata and 'parts' in metadata:
+        location = metadata['location']
+        for p in metadata['parts']:
+            if 'files' in p:
+                abs_files = map(lambda f: os.path.join(location, f), p['files'])
+                p['files'] = list(abs_files)
+    return metadata
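For illustration, a small sketch (not part of the commit) of what `to_absolute_paths` does; the sample metadata values are made up:

    meta = {'location': '/dbs/swissprot_2018-03_download',
            'parts': [{'files': ['sp.fasta'], 'tags': ['fasta', 'protein']}]}
    to_absolute_paths(meta)
    # meta['parts'][0]['files'] is now ['/dbs/swissprot_2018-03_download/sp.fasta']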
@@ -8,8 +8,10 @@ import pathlib
 import prettytable
 import shutil
 import pkg_resources
+import dbman.helper as h

 def main():
+    logging.basicConfig(level=logging.DEBUG)
     parser = argparse.ArgumentParser(description='Prepare, compile and distribute biological databases', prog='dbman')
     subparsers = parser.add_subparsers(title='Subcommands', description='Available subcommands')
@@ -19,6 +21,7 @@ def main():
     prepare_parser.add_argument('-d', '--directory', dest='dbdir', help='Override the databases root directory', type=str)
     prepare_parser.add_argument('-v', '--version', help='Override the version of the database', type=str)
     prepare_parser.add_argument('--force_download', help='Force download of the database. Will replace existing downloads', action='store_true')
+    prepare_parser.add_argument('--force_rebuild', help='Force rebuild of the database. Will replace existing builds', action='store_true')
     prepare_parser.add_argument('--keep_temp', help='Keep temporary data on failure', action='store_true')
     prepare_parser.set_defaults(func=prepare)
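The `database` and `tool` values consumed as `args.database` and `args.tool` in `prepare` are not declared in this hunk; presumably they are positional arguments of the `prepare` subcommand, along the lines of:

    # hypothetical sketch; the actual declarations sit outside this hunk
    prepare_parser.add_argument('database', help='Name of the database', type=str)
    prepare_parser.add_argument('tool', help='Tool to prepare the database for', type=str)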
@@ -79,20 +82,39 @@ def list_local_databases(args):
     print(table)

 def prepare(args):
     logging.info("Preparing db '%s' for tool '%s'", args.database, args.tool)
-    if not _exists(_local_databases(args), args.database, "download") or args.force_download:
+    logging.debug("Local databases:\n %s", _localdatabases_string(args))
+    download_recipe = _get_recipe(_recipes(args), args.database, "download")
+    if not download_recipe:
+        logging.warning("Recipe '%s - %s' does not exist", args.database, "download")
+        return
+    transform_recipe = _get_recipe(_recipes(args), args.database, args.tool)
+    if not transform_recipe:
+        logging.warning("Recipe '%s - %s' does not exist", args.database, args.tool)
+        return
+    db_info = _getdb(_local_databases(args), args.database, "download")
+    if not db_info or args.force_download:
         logging.info("Database '%s' not found. Downloading it.", args.database)
-        recipe = _get_recipe(_recipes(args), args.database, "download")
-        logging.debug("Found recipe: %s", recipe)
-        run_in_tempdir(func = lambda: _run_external_tool(recipe['script'], recipe['params'] if 'params' in recipe else None),
+        db_info = run_in_tempdir(func = lambda: h.run_external_tool(download_recipe['script'], download_recipe['params'] if 'params' in download_recipe else None),
                        success=_rename_directory_after_metadata,
                        fail=_delete_directory if not args.keep_temp else lambda x: print())
     else:
         logging.info("Database '%s - %s' found. Download not required.", args.database, db_info['version'])
-    # if not download compile the database
-    if args.tool == 'download':
-        return
+    if args.tool != 'download':
+        logging.info("Preparing '%s' for '%s'", args.database, args.tool)
+        if not _exists(_local_databases(args), args.database, args.tool, db_info['version']) or args.force_rebuild:
+            logging.info("Transforming '%s' for '%s'", args.database, args.tool)
+            run_in_tempdir(func = lambda: run_transform(db_info, transform_recipe),
+                           success=_rename_directory_after_metadata,
+                           fail=_delete_directory if not args.keep_temp else lambda x: print())
+        else:
+            logging.warning("Database already exists for '%s - %s - %s'. Skipping transformation.", args.database, args.tool, db_info['version'])
+    return
+
+def run_transform(source_metadata, recipe):
+    h.save_metadata(h.to_absolute_paths(source_metadata), "./.source_metadata.json")
+    return h.run_external_tool(recipe['script'], recipe['params'] if 'params' in recipe else None)
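`run_transform` fixes the contract between dbman and transformation recipes: the source database's metadata, with file paths made absolute, is saved to `./.source_metadata.json` in the recipe's working directory, and the recipe is expected to read it and write a `metadata.json` describing its own output. A minimal recipe skeleton under that contract (hypothetical tool name; mirrors `create_blast_dbs.py` below):

    import dbman.helper as h
    metadata = h.load_metadata('./.source_metadata.json')
    # ... derive the transformed files in the current working directory ...
    h.create_metadata(metadata['name'], 'mytool', metadata['description'], metadata['version'], other={'parts': []})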

 def _get_recipe(recipes, database, tool):
     for recipe in recipes:
@@ -113,22 +135,12 @@ def _rename_directory_after_metadata(path):
     newname = metadata['name'] + "_" + metadata['version'] + '_' + metadata['tool']
     oldpath = pathlib.Path(path)
     newpath = oldpath.parent.parent.joinpath(newname)
+    if newpath.exists():
+        import shutil
+        shutil.rmtree(str(newpath))
     logging.debug("Renaming '%s' to '%s'", oldpath, newpath)
     oldpath.rename(newpath)
+    return metadata
-
-def _run_external_tool(path, params=[]):
-    """Runs the external tool and captures stdout, stderr and exitcode in files"""
-    import subprocess
-    with open(".stdout", "w") as out:
-        with open(".stderr", "w") as err:
-            with open(".exitcode", "w") as exit:
-                if params:
-                    path = [path]
-                    path.extend(params)
-                logging.debug("Executing '%s'", path)
-                cp = subprocess.run(path, stderr=err, stdout=out)
-                print(cp.returncode, file=exit)
-                return cp.returncode

 def run_in_tempdir(func=None, success=None, fail=None):
     dbtmpdir = os.path.join(_databases_dir(), "tmp")
@@ -142,20 +154,26 @@ def run_in_tempdir(func=None, success=None, fail=None):
     os.chdir(olddir)
     if ret == 0:
         logging.debug("External process succeeded")
-        success(tmpdir)
+        return success(tmpdir)
     else:
         logging.debug("External process failed")
         fail(tmpdir)
-        return tmpdir
+        return None
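On success, `run_in_tempdir` now propagates the success callback's return value, and `_rename_directory_after_metadata` returns the metadata of the renamed directory; `prepare` above relies on this:

    db_info = run_in_tempdir(func=..., success=_rename_directory_after_metadata, fail=...)
    # db_info: metadata dict of the finished database, or None if the tool failed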
 def _recipes(args):
     recipes = {
-        'card': {'download': {'script': os.path.abspath(pkg_resources.resource_filename(__name__, 'recipes/download_card.py'))}},
-        'swissprot': {'download': {'script': os.path.abspath(pkg_resources.resource_filename(__name__, 'recipes/download_uniprotkb.py')), 'params': ['--database', 'swissprot', '--type', 'fasta']}}
+        'card': {'download': {'script': _pkgres('recipes/download_card.py')},
+                 'blast': {'script': _pkgres('recipes/create_blast_dbs.py')}
+                 },
+        'swissprot': {'download': {'script': _pkgres('recipes/download_uniprotkb.py'), 'params': ['--database', 'swissprot', '--type', 'fasta']},
+                      'blast': {'script': _pkgres('recipes/create_blast_dbs.py')},
+                      }
     }
     return recipes
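Registering a further database or transformation follows the same pattern. A hypothetical 'pfam' entry (recipe script name made up) would look like:

    'pfam': {'download': {'script': _pkgres('recipes/download_pfam.py')},
             'blast': {'script': _pkgres('recipes/create_blast_dbs.py')}}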
+
+def _pkgres(name):
+    return os.path.abspath(pkg_resources.resource_filename(__name__, name))

 def _getdb(databases, database, tool=None, version=None):
     dbs = _getdbs(databases, database, tool, version)
     if len(dbs) > 0:
@@ -202,7 +220,6 @@ def _local_databases(args):
     if os.path.isdir(dbdir):
         # find and read all metadata.json files in direct subdirectories of dbdir
         subdirectories = [f.path for f in os.scandir(dbdir) if f.is_dir() and os.path.isfile(os.path.join(f.path, "metadata.json"))]
-        logging.debug("Database subdirectories: %s", str(subdirectories))
         for dir in subdirectories:
             metadata_file = dir + "/metadata.json"
             if os.path.isfile(metadata_file):
@@ -210,9 +227,12 @@
                 m = json.load(f)
                 m['location'] = os.path.abspath(dir)
                 local_databases.append(m)
-    logging.debug("local databases: %s", str(local_databases))
     return local_databases

+def _localdatabases_string(args=None):
+    dbs_string = "\n ".join(map(lambda x: ", ".join([x['name'], x['tool'], x['version']]), _local_databases(args)))
+    return dbs_string
+
 def _databases_dir(args=None):
     '''get the configured local databases directory'''
     if args and 'dbdir' in args and args.dbdir:
@@ -223,5 +243,4 @@ def _databases_dir(args=None):
     return "dbs"

 if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG)
     main()
New file: recipes/create_blast_dbs.py

#!/usr/bin/env python
import dbman.helper as h
import subprocess
import sys
import os
from pathlib import Path
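# This recipe runs inside a fresh temporary directory created by dbman's
# run_in_tempdir; run_transform has already written the source database's
# metadata, with absolute file paths, to ./.source_metadata.json.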
metadata = h.load_metadata('./.source_metadata.json')
fasta_parts = [p for p in metadata['parts'] if 'fasta' in p['tags']]
new_parts = []
for part in fasta_parts:
    dbtype = None
    if 'protein' in part['tags']:
        dbtype = 'prot'
    if 'nucleotide' in part['tags']:
        dbtype = 'nucl'
    for file in part['files']:
        name = Path(file).stem
        if 'gzip' in part['tags']:
            command = 'zcat {} | makeblastdb -dbtype {} -title "{}" -out "{}" -in -'.format(file, dbtype, name, name)
        else:
            command = 'cat {} | makeblastdb -dbtype {} -title "{}" -out "{}" -in -'.format(file, dbtype, name, name)
        cp = subprocess.run(command, shell=True)
        if cp.returncode != 0:
            sys.exit(cp.returncode)
        files = [f.name for f in os.scandir('.') if f.name.startswith(name + '.') and not f.name.endswith('.fasta')]
        tags = ['blastdb']
        tags.extend(part['tags'])
        tags.remove('fasta')
        new_parts.append({'files': files, 'tags': tags})
h.create_metadata(metadata['name'], 'blast', metadata['description'], metadata['version'], other={'parts': new_parts})
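For reference, the part shapes this recipe consumes and produces, with illustrative values:

    # illustrative input part (from .source_metadata.json):
    #   {'files': ['swissprot.fasta'], 'tags': ['fasta', 'protein']}
    # resulting output part (exact blastdb extensions depend on the makeblastdb version):
    #   {'files': ['swissprot.phr', 'swissprot.pin', 'swissprot.psq'],
    #    'tags': ['blastdb', 'protein']}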