Skip to content
Snippets Groups Projects
Commit 2b71cd2d authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Add uniprotkb download; add parameters to external processes; change default directory name to dbs

parent 9eb4fd32
No related branches found
No related tags found
No related merge requests found
import wget import wget
import tarfile import tarfile
import gzip
import shutil
import re
import json import json
import datetime import datetime
import logging import logging
...@@ -12,14 +15,21 @@ def download(url): ...@@ -12,14 +15,21 @@ def download(url):
def extract(filename): def extract(filename):
logging.info("Extracting " + filename) logging.info("Extracting " + filename)
with tarfile.open(filename, 'r') as tar: if '.tar.' in filename:
tar.extractall() with tarfile.open(filename, 'r') as tar:
tar.extractall()
elif filename.endswith(".gz"):
with gzip.open(filename, 'rb') as gin, open(re.sub("\.gz$", "", filename), 'wb') as bout:
shutil.copyfileobj(gin, bout)
else:
raise Exception("Compression not supported")
def create_metadata(name, def create_metadata(name,
tool, tool,
description=None, description=None,
creation_date=datetime.datetime.now().isoformat(),
version=datetime.date.today().isoformat(), version=datetime.date.today().isoformat(),
other={},
creation_date=datetime.datetime.now().isoformat(),
): ):
metadata = { metadata = {
'name': name, 'name': name,
...@@ -28,6 +38,7 @@ def create_metadata(name, ...@@ -28,6 +38,7 @@ def create_metadata(name,
'creation_date': creation_date, 'creation_date': creation_date,
'version': version 'version': version
} }
metadata.update(other)
with open("metadata.json", "w") as metadata_file: with open("metadata.json", "w") as metadata_file:
logging.info("Writing metadata " + json.dumps(metadata)) logging.info("Writing metadata " + json.dumps(metadata))
......
...@@ -19,6 +19,8 @@ def main(): ...@@ -19,6 +19,8 @@ def main():
prepare_parser.add_argument('-d', '--directory', dest='dbdir', help='Override the databases root directory', type=str) prepare_parser.add_argument('-d', '--directory', dest='dbdir', help='Override the databases root directory', type=str)
prepare_parser.add_argument('-v', '--version', help='Override the version of the database', type=str) prepare_parser.add_argument('-v', '--version', help='Override the version of the database', type=str)
prepare_parser.add_argument('--force_download', help='Force download of the database. Will replace existing downloads', action='store_true') prepare_parser.add_argument('--force_download', help='Force download of the database. Will replace existing downloads', action='store_true')
prepare_parser.add_argument('--keep_temp', help='Keep temporary data on failure', action='store_true')
prepare_parser.set_defaults(func=prepare) prepare_parser.set_defaults(func=prepare)
list_local_databases_parser = subparsers.add_parser('list_local_databases', help='List the locally available databases') list_local_databases_parser = subparsers.add_parser('list_local_databases', help='List the locally available databases')
...@@ -82,9 +84,9 @@ def prepare(args): ...@@ -82,9 +84,9 @@ def prepare(args):
logging.info("Database '%s' not found. Downloading it.", args.database) logging.info("Database '%s' not found. Downloading it.", args.database)
recipe = _get_recipe(_recipes(args), args.database, "download") recipe = _get_recipe(_recipes(args), args.database, "download")
logging.debug("Found recipe: %s", recipe) logging.debug("Found recipe: %s", recipe)
run_in_tempdir(func = lambda: _run_external_tool(recipe['script']), run_in_tempdir(func = lambda: _run_external_tool(recipe['script'], recipe['params'] if 'params' in recipe else None),
success=_rename_directory_after_metadata, success=_rename_directory_after_metadata,
fail=_delete_directory) fail=_delete_directory if not args.keep_temp else lambda x: print())
# if not download compile the database # if not download compile the database
...@@ -114,13 +116,17 @@ def _rename_directory_after_metadata(path): ...@@ -114,13 +116,17 @@ def _rename_directory_after_metadata(path):
logging.debug("Renaming '%s' to '%s'", oldpath, newpath) logging.debug("Renaming '%s' to '%s'", oldpath, newpath)
oldpath.rename(newpath) oldpath.rename(newpath)
def _run_external_tool(path): def _run_external_tool(path, params=[]):
"""Runs the external tool and captures stdout, stderr and exitcode in files""" """Runs the external tool and captures stdout, stderr and exitcode in files"""
import subprocess import subprocess
with open(".stdout", "w") as out: with open(".stdout", "w") as out:
with open(".stderr", "w") as err: with open(".stderr", "w") as err:
with open(".exitcode", "w") as exit: with open(".exitcode", "w") as exit:
cp = subprocess.run(path, stderr=err, stdout=out, shell=True) if params:
path = [path]
path.extend(params)
logging.debug("Executing '%s'", path)
cp = subprocess.run(path, stderr=err, stdout=out)
print(cp.returncode, file=exit) print(cp.returncode, file=exit)
return cp.returncode return cp.returncode
...@@ -145,7 +151,8 @@ def run_in_tempdir(func=None, success=None, fail=None): ...@@ -145,7 +151,8 @@ def run_in_tempdir(func=None, success=None, fail=None):
def _recipes(args): def _recipes(args):
recipes = { recipes = {
'card': {'download': {'script': os.path.abspath(pkg_resources.resource_filename(__name__, 'recipes/download_card.py'))}} 'card': {'download': {'script': os.path.abspath(pkg_resources.resource_filename(__name__, 'recipes/download_card.py'))}},
'swissprot': {'download': {'script': os.path.abspath(pkg_resources.resource_filename(__name__, 'recipes/download_uniprotkb.py')), 'params': ['--database', 'swissprot', '--type', 'fasta']}}
} }
return recipes return recipes
...@@ -213,7 +220,7 @@ def _databases_dir(args=None): ...@@ -213,7 +220,7 @@ def _databases_dir(args=None):
elif "DBMAN_DBDIR" in os.environ: elif "DBMAN_DBDIR" in os.environ:
return os.environ["DBMAN_DBDIR"] return os.environ["DBMAN_DBDIR"]
else: else:
return "local_databases" return "dbs"
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
......
#!/usr/bin/env python3
import dbman.helper as h
import re
import argparse
import logging
# Log at INFO so the helper's download/extraction progress messages are visible.
logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser(description='Download UniProtKB-database files')
# 'choices' makes argparse reject unknown values up front with a clear usage
# error, instead of failing later with a KeyError when the selection is looked
# up in the data table below.
parser.add_argument('-d', '--database', required=True,
                    choices=['swissprot', 'trembl'],
                    help='Database, one of [swissprot, trembl]')
parser.add_argument('-t', '--type', required=True,
                    choices=['xml', 'fasta', 'flatfile'],
                    help='Database type, one of [xml, fasta, flatfile]')
args = parser.parse_args()
# Base URL of the "complete" knowledgebase section of the current UniProt release.
url_prefix = "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/"

# Both UniProtKB flavours ship the same three formats; define the
# type -> file-suffix map once and give each entry its own copy.
_SUFFIXES = {
    'xml': '.xml.gz',
    'fasta': '.fasta.gz',
    'flatfile': '.dat.gz',
}

# Per-database download configuration: human-readable description,
# remote file name prefix, and the supported format suffixes.
data = {
    'swissprot': {
        'description': 'UniProt/Swiss-Prot',
        'fileprefix': 'uniprot_sprot',
        'types': dict(_SUFFIXES),
    },
    'trembl': {
        'description': 'UniProt/TrEMBL',
        'fileprefix': 'uniprot_trembl',
        'types': dict(_SUFFIXES),
    },
}
# Configuration entry for the requested database flavour.
entry = data[args.database]

# Database name records both the flavour and the format, e.g. 'swissprot_fasta'.
dbname = args.database + '_' + args.type
# BUGFIX: the description was hard-coded to "UniProtKB/Swiss-Prot", which was
# wrong when '--database trembl' was selected; take it from the chosen entry.
dbdescription = entry['description']
# Full download URLs for the data file and the release-notes file.
url = url_prefix + entry['fileprefix'] + entry['types'][args.type]
version_url = url_prefix + "reldate.txt"
# The UniProt release version and date are published in the reldate.txt file
# that sits alongside the data files; parse them from there.
def extract_version_info(file, description=None):
    """Parse a UniProt reldate.txt file for the release version and date.

    Scans *file* line by line for a header of the form
    ``<description> Release <version> of <date>``.

    :param file: path of the downloaded reldate.txt file
    :param description: database description to match (e.g.
        'UniProt/Swiss-Prot'); defaults to the selected database's
        description from the module-level ``entry``
    :return: dict with 'version' and 'release_date' keys, or an empty
        dict when no line matched
    """
    if description is None:
        description = entry['description']
    # Raw string avoids the invalid-escape warning the old "\d" produced;
    # re.escape keeps the description a literal match, compiled once.
    pattern = re.compile(re.escape(description) + r" Release (\d+_\d+) of (.+)")
    info = {}
    with open(file) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                info['version'] = match.group(1)
                info['release_date'] = match.group(2)
    return info
# Fetch the database archive and the accompanying release-notes file.
archive_file = h.download(url)
release_file = h.download(version_url)

# Assemble the extra metadata: release info plus a description of each
# downloaded file ("parts") with searchable tags, then write metadata.json.
extra = extract_version_info(release_file)
extra['parts'] = [
    {
        'files': [archive_file],
        'tags': [args.type, 'protein', 'compressed', 'gzip'],
    },
    {
        'files': [release_file],
        'tags': ['text', 'info', 'version'],
    },
]
h.create_metadata(dbname, "download", dbdescription, other=extra)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment