Commit 2b71cd2d authored by Lukas Jelonek
Browse files

Add UniProtKB download; add parameters to external processes; change default directory name to dbs

parent 9eb4fd32
import wget
import tarfile
import gzip
import shutil
import re
import json
import datetime
import logging
......@@ -12,14 +15,21 @@ def download(url):
def extract(filename):
    """Unpack a downloaded file according to its name.

    Files whose name contains ``.tar.`` are opened as tar archives and
    unpacked into the current working directory. Other files ending in
    ``.gz`` are decompressed into a file with the same name minus the
    ``.gz`` suffix.

    Args:
        filename: path of the compressed file.

    Raises:
        Exception: if the name matches neither supported format.
    """
    logging.info("Extracting " + filename)
    if '.tar.' in filename:
        # mode 'r' lets tarfile detect the compression transparently
        with tarfile.open(filename, 'r') as tar:
            # NOTE(review): extractall() trusts the member paths stored in
            # the archive; only use this on archives from trusted sources.
            tar.extractall()
    elif filename.endswith(".gz"):
        # raw string: "\." in a normal string literal is an invalid escape
        target = re.sub(r"\.gz$", "", filename)
        with gzip.open(filename, 'rb') as gin, open(target, 'wb') as bout:
            shutil.copyfileobj(gin, bout)
    else:
        raise Exception("Compression not supported")
def create_metadata(name,
tool,
description=None,
creation_date=datetime.datetime.now().isoformat(),
version=datetime.date.today().isoformat(),
other={},
creation_date=datetime.datetime.now().isoformat(),
):
metadata = {
'name': name,
......@@ -28,6 +38,7 @@ def create_metadata(name,
'creation_date': creation_date,
'version': version
}
metadata.update(other)
with open("metadata.json", "w") as metadata_file:
logging.info("Writing metadata " + json.dumps(metadata))
......
......@@ -19,6 +19,8 @@ def main():
prepare_parser.add_argument('-d', '--directory', dest='dbdir', help='Override the databases root directory', type=str)
prepare_parser.add_argument('-v', '--version', help='Override the version of the database', type=str)
prepare_parser.add_argument('--force_download', help='Force download of the database. Will replace existing downloads', action='store_true')
prepare_parser.add_argument('--keep_temp', help='Keep temporary data on failure', action='store_true')
prepare_parser.set_defaults(func=prepare)
list_local_databases_parser = subparsers.add_parser('list_local_databases', help='List the locally available databases')
......@@ -82,9 +84,9 @@ def prepare(args):
logging.info("Database '%s' not found. Downloading it.", args.database)
recipe = _get_recipe(_recipes(args), args.database, "download")
logging.debug("Found recipe: %s", recipe)
run_in_tempdir(func = lambda: _run_external_tool(recipe['script']),
run_in_tempdir(func = lambda: _run_external_tool(recipe['script'], recipe['params'] if 'params' in recipe else None),
success=_rename_directory_after_metadata,
fail=_delete_directory)
fail=_delete_directory if not args.keep_temp else lambda x: print())
# if not download compile the database
......@@ -114,13 +116,17 @@ def _rename_directory_after_metadata(path):
logging.debug("Renaming '%s' to '%s'", oldpath, newpath)
oldpath.rename(newpath)
def _run_external_tool(path):
def _run_external_tool(path, params=[]):
"""Runs the external tool and captures stdout, stderr and exitcode in files"""
import subprocess
with open(".stdout", "w") as out:
with open(".stderr", "w") as err:
with open(".exitcode", "w") as exit:
cp = subprocess.run(path, stderr=err, stdout=out, shell=True)
if params:
path = [path]
path.extend(params)
logging.debug("Executing '%s'", path)
cp = subprocess.run(path, stderr=err, stdout=out)
print(cp.returncode, file=exit)
return cp.returncode
......@@ -145,7 +151,8 @@ def run_in_tempdir(func=None, success=None, fail=None):
def _recipes(args):
    """Return the built-in recipes.

    The result maps a database name to its recipes by type (currently only
    'download'). Each recipe holds the absolute path of the recipe 'script'
    and, optionally, a 'params' list of command line arguments for it.

    Args:
        args: accepted for interface compatibility; not used here.
    """
    def _script(filename):
        # resolve a recipe script shipped inside this package
        return os.path.abspath(
            pkg_resources.resource_filename(__name__, 'recipes/' + filename))

    recipes = {
        'card': {
            'download': {'script': _script('download_card.py')},
        },
        'swissprot': {
            'download': {
                'script': _script('download_uniprotkb.py'),
                'params': ['--database', 'swissprot', '--type', 'fasta'],
            },
        },
    }
    return recipes
......@@ -213,7 +220,7 @@ def _databases_dir(args=None):
elif "DBMAN_DBDIR" in os.environ:
return os.environ["DBMAN_DBDIR"]
else:
return "local_databases"
return "dbs"
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
......
#!/usr/bin/env python3
import dbman.helper as h
import re
import argparse
import logging
logging.basicConfig(level=logging.INFO)
url_prefix = "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/"

# file suffix per download type; identical for every UniProtKB section
_type_suffixes = {
    'xml': '.xml.gz',
    'fasta': '.fasta.gz',
    'flatfile': '.dat.gz'
}

# Per-database settings. 'description' is used as the metadata description
# and as the line prefix searched for in the release file.
# NOTE(review): the release file is presumed to name the sections
# "UniProtKB/Swiss-Prot" and "UniProtKB/TrEMBL" - confirm against reldate.txt.
data = {
    'swissprot': {
        'description': 'UniProtKB/Swiss-Prot',
        'fileprefix': 'uniprot_sprot',
        'types': dict(_type_suffixes)
    },
    'trembl': {
        'description': 'UniProtKB/TrEMBL',
        'fileprefix': 'uniprot_trembl',
        'types': dict(_type_suffixes)
    }
}

# `choices` makes argparse reject unknown values with a usage message
# instead of failing later with a KeyError on the lookups below.
parser = argparse.ArgumentParser(description='Download UniProtKB-database files')
parser.add_argument('-d', '--database', required=True, choices=sorted(data),
                    help='Database, one of [swissprot, trembl]')
parser.add_argument('-t', '--type', required=True, choices=sorted(_type_suffixes),
                    help='Database type, one of [xml, fasta, flatfile]')
args = parser.parse_args()

entry = data[args.database]
dbname = args.database + '_' + args.type
# describe the database that was actually requested, not always Swiss-Prot
dbdescription = entry['description']
url = url_prefix + entry['fileprefix'] + entry['types'][args.type]
version_url = url_prefix + "reldate.txt"
# the release version and date are read from the reldate.txt file that is
# downloaded from the same directory as the database file
def extract_version_info(file, description=None):
    """Read the release version and date for one database from a text file.

    Every line is searched for ``<description> Release <version> of <date>``
    where ``<version>`` has the form ``digits_digits``. If several lines
    match, the last one wins.

    Args:
        file: path of the release file to scan.
        description: literal text expected in front of ``Release``;
            defaults to the description of the selected database entry.

    Returns:
        A dict with the keys 'version' and 'release_date', or an empty dict
        if no line matched.
    """
    if description is None:
        description = entry['description']
    # The description is literal text, so escape it; compile once instead
    # of rebuilding the pattern for every line.
    pattern = re.compile(re.escape(description) + r" Release (\d+_\d+) of (.+)")
    info = {}
    with open(file) as handle:
        for line in handle:
            match = pattern.search(line)
            if match:
                info['version'] = match.group(1)
                info['release_date'] = match.group(2)
    return info
# fetch the database file first, then the file carrying the release info
archive_path = h.download(url)
release_info_path = h.download(version_url)

# describe both downloaded files for the metadata
database_part = {
    'files': [archive_path],
    'tags': [args.type, 'protein', 'compressed', 'gzip']
}
release_part = {
    'files': [release_info_path],
    'tags': ['text', 'info', 'version']
}

# version details come first, the file list is added under 'parts'
extra_metadata = extract_version_info(release_info_path)
extra_metadata['parts'] = [database_part, release_part]

# write metadata file
h.create_metadata(dbname, "download", dbdescription, other=extra_metadata)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment