Commit 5e30edc8 authored by Lukas Jelonek's avatar Lukas Jelonek
Browse files

Add bachelor thesis code by Christian Fankep and start over\n Reason: large...

Add bachelor thesis code by Christian Fankep and start over\n Reason: large databases were added to his repository and it is easier to start over at this point
parents
Database Manager
================
This tool provides prepared databases for several bioinformatics tools.
The prepared databases can be used on the working computer or stored on a cloud server (Amazon Web Services S3)
so that other computers can download the data from there. It is also possible to delete unwanted
databases from the working computer and/or from the cloud.
Supported Databases with associated tools:
* Uniprot-Swissprot [Blast, Ghostx]
* CARD [Blast, Ghostx]
* Pfam [hmmer]
Installation
------------
Prerequisites:
* Python (Version >= 3.7)
* Git
Install for user::
pip install git+https://git.computational.bio.uni-giessen.de/cfankep/psotdb.git
Install for developer::
#checkout repository
git clone git+https://git.computational.bio.uni-giessen.de/cfankep/psotdb.git
#install
pip3 install -e .
Using Database Manager
----------------------
For the general help use::
dbman --help
Checkout which databases are available::
# in the S3 directory
dbman list_remote_databases
# in the local directory
dbman list_local_databases
Checkout which databases with associated transformations are available::
dbman list_recipes
Prepare databases::
# check the available optional parameters
dbman prepare -h
# run the standard preparation
dbman prepare example/database example/tool
Transfer databases from the working computer to s3 Cloud::
# check the available optional parameters
dbman upload -h
# run the standard transfer
dbman upload example/database example/tool*
Transfer databases from s3 Cloud to working computer::
# check the available optional parameters
dbman download -h
# run the standard download
dbman download example/database example/tool*
Delete undesirable databases::
# from the local directory
dbman delete example/database example/tool* local
# from the s3 cloud directory
dbman delete example/database example/tool* s3
Replace the standard directory used to save the data::
# change local directory with the environment variable
export DBMAN_DBDIR = example/path
# change remote directory with the environment variable
export DBMAN_S3DIR = example/path
The standard directories (local and remote) can also be changed with optional parameters.
(*) For upload, download and delete of the raw database, enter 'raw' instead of 'example/tool'.
#!/home/theia/conda/bin/python
import argparse
import subprocess
import os
import shutil
import re
import wget
from datetime import date
import tarfile
import json
import pkg_resources
def myparser():
    """Build the dbman command-line interface and parse sys.argv.

    Returns the parsed namespace; ``func`` holds the selected subcommand
    handler and ``parser`` the top-level parser (used by the help fallback).
    """
    # The -d/-s/-v options are identical across subcommands, so they are
    # added through small local helpers instead of being repeated inline.
    def _add_directory_option(sub):
        sub.add_argument('-d', '--directory', dest='dir', help='change the local directory to save/to get the data.', type=str)

    def _add_store_option(sub):
        sub.add_argument('-s', '--s3store', dest='store', help='change the "S3" storage of the data ')

    def _add_version_option(sub):
        sub.add_argument('-v', '--version', help='version of the needed database. Standard will be the current release')

    parser = argparse.ArgumentParser(description='Download, convert and upload databases to cloud server', prog='Database Manager')
    parser.set_defaults(func=help)
    subparsers = parser.add_subparsers(title='Subcommands', description='Authorized Subcommands', help='Additional help')

    prepare_parser = subparsers.add_parser('prepare', help='Databank download from the Website, extraction and transformation for another Programm')
    prepare_parser.add_argument('database', help='Database which have to be prepared', type=str)
    prepare_parser.add_argument('tool', help='Programm/Tool for the post analysis', type=str,)
    _add_directory_option(prepare_parser)
    _add_version_option(prepare_parser)
    prepare_parser.set_defaults(func=prepare)

    upload_parser = subparsers.add_parser('upload', help='Databank copy from the local directory to the web storage')
    upload_parser.add_argument('database', help='database to be transfered from the local directory', type=str)
    upload_parser.add_argument('tool', type=str, help='database type which have to be transfered."raw" instead of tool, if raw files are needed to be processed')
    _add_store_option(upload_parser)
    _add_directory_option(upload_parser)
    _add_version_option(upload_parser)
    upload_parser.set_defaults(func=upload)

    download_parser = subparsers.add_parser('download', help='Datenbank copy from the web storage to the working computer')
    download_parser.add_argument('database', help='database to be transfered from "S3"', type=str)
    download_parser.add_argument('tool', type=str, help='database type which have to be transfered. "raw" instead of tool, if raw files are needed to be processed')
    _add_store_option(download_parser)
    _add_directory_option(download_parser)
    _add_version_option(download_parser)
    download_parser.set_defaults(func=download)

    delete_parser = subparsers.add_parser('delete', help='delete existing files from local directory or from "S3"')
    delete_parser.add_argument('database', help='database which have to be delete')
    delete_parser.add_argument('tool', type=str, help='database type which have to be deleted."raw" instead of tool, if raw files are needed to be processed')
    delete_parser.add_argument('place', choices=['local', 's3'], help='defined the place where the database have to be delete')
    _add_directory_option(delete_parser)
    _add_store_option(delete_parser)
    _add_version_option(delete_parser)
    delete_parser.set_defaults(func=delete)

    list_local_databases_parser = subparsers.add_parser('list_local_databases', help='print the list of local databases with some features')
    _add_directory_option(list_local_databases_parser)
    list_local_databases_parser.set_defaults(func=list_local_databases)

    list_remote_databases_parser = subparsers.add_parser('list_remote_databases', help='print the list of remote databases with some features')
    _add_directory_option(list_remote_databases_parser)
    _add_store_option(list_remote_databases_parser)
    list_remote_databases_parser.set_defaults(func=list_remote_databases)

    list_recipes_parser = subparsers.add_parser('list_recipes', help='print databases with the possible Tool/s')
    list_recipes_parser.set_defaults(func=list_recipes)

    args = parser.parse_args()
    args.parser = parser
    return args
def help(args):
    """Default handler when no subcommand is given: show top-level usage."""
    args.parser.print_help()
def get_local_databases_directory(args):
    """Resolve the local database directory.

    Precedence: the -d/--directory CLI flag, then the DBMAN_DBDIR
    environment variable, then ./local_databases (absolute).
    """
    if args.dir:
        return os.path.abspath(args.dir)
    if "DBMAN_DBDIR" in os.environ:
        return os.environ["DBMAN_DBDIR"]
    return os.path.abspath("local_databases")
def get_remote_databases_directory(args):
    """Resolve the S3 storage URL.

    Precedence: the -s/--s3store CLI flag, then the DBMAN_S3DIR
    environment variable, then the default bucket 's3://db_storage'.
    """
    if args.store:
        return args.store
    if 'DBMAN_S3DIR' in os.environ:
        return os.environ['DBMAN_S3DIR']
    return 's3://db_storage'
def get_raw_directory_name(args, version):
    """Name of the raw-download directory: <database>_<version>_raw."""
    return f'{args.database}_{version}_raw'
def get_tool_directory_name(args, version):
    """Name of the tool-specific directory: <database>_<version>_<tool>."""
    return f'{args.database}_{version}_{args.tool}'
def get_path_tool_directory(args, version):
    """Full local path of the tool-specific database directory."""
    base_dir = get_local_databases_directory(args)
    dir_name = get_tool_directory_name(args, version)
    return path_maker(base_dir, dir_name)
def path_maker(directory, file):
    """Join *directory* and *file* with exactly one '/' separator.

    os.path.join is deliberately not used because *directory* may be an
    's3://...' URL as well as a local path.  An empty *directory* now
    yields '/<file>' instead of raising IndexError on ``directory[-1]``.
    """
    if directory.endswith('/'):
        return directory + file
    return directory + '/' + file
def get_version(args):
    """Determine the database version to use.

    Precedence: the -v/--version CLI flag, then the recipe's version
    callback (when one is configured), then today's date as a fallback.
    """
    if args.version:
        return args.version
    if data[args.database]['version']:
        return data[args.database]['version']()
    return str(date.today())
def get_swissprot_version():
    """Fetch the current Swiss-Prot release tag (e.g. '2020_01') from UniProt.

    Downloads the release-notes file, scans it for the line mentioning
    'Swiss-Prot' and extracts the YYYY_MM tag.  Iterating over the file
    guards against an endless loop at EOF (readline() returns '' forever
    once the file is exhausted).  The temp file is removed even on error.

    Raises:
        ValueError: when no Swiss-Prot release line is found.
    """
    file = wget.download("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt")
    try:
        with open(file) as fh:
            for line in fh:
                if 'Swiss-Prot' in line:
                    match = re.search('[0-9]{4}_[0-9]{2}', line)
                    if match:
                        return match.group(0)
        raise ValueError('No Swiss-Prot release line found in reldate.txt')
    finally:
        os.remove(file)
def get_pfam_version():
    """Fetch the current Pfam release number (e.g. '33.1') from EBI.

    Downloads the release notes, scans for the 'RELEASE' line and extracts
    the NN.N version.  Iterating over the file guards against an endless
    loop at EOF (readline() returns '' forever once exhausted).  The temp
    file is removed even on error.

    Raises:
        ValueError: when no RELEASE line is found.
    """
    file = wget.download("ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/relnotes.txt")
    try:
        with open(file) as fh:
            for line in fh:
                if 'RELEASE' in line:
                    match = re.search(r'[0-9]{2}\.[0-9]', line)
                    if match:
                        return match.group(0)
        raise ValueError('No RELEASE line found in relnotes.txt')
    finally:
        os.remove(file)
def get_card_version():
    """Fetch the current CARD release by downloading and reading card.json.

    Downloads the CARD archive into a temporary 'version' subdirectory,
    extracts it and returns the "_version" field of card.json.  The
    working directory is restored and the temp directory removed even when
    the download or extraction fails (the original left the process in
    'version/' on error).  The local variable no longer shadows the
    module-level ``data`` recipe table.
    """
    cwd = os.getcwd()
    os.mkdir('version')
    try:
        os.chdir('version')
        wget.download("https://card.mcmaster.ca/latest/data")
        with tarfile.open('card-data.tar.bz2') as tar:
            tar.extractall()
        with open("card.json") as f:
            card_metadata = json.load(f)
        return card_metadata["_version"]
    finally:
        os.chdir(cwd)
        shutil.rmtree('version')
def get_local_json_version(args):
    """Resolve the version of a locally prepared database.

    An explicit -v/--version always wins.  Otherwise the local metadata
    file is searched for entries matching database+tool; the most recently
    created match's version is returned, or 'error' when nothing matches.
    """
    if args.version:
        return args.version
    with open(get_path_local_json(args)) as f:
        metadata = json.load(f)
    matches = [entry for entry in metadata
               if entry['name'] == args.database and entry['tool'] == args.tool]
    if not matches:
        return 'error'
    # newest entry wins when several versions of the same database exist
    return max(matches, key=lambda entry: entry['created'])['version']
def get_remote_json_version(args):
    """Resolve the version of a database stored remotely on S3.

    An explicit -v/--version always wins.  Otherwise the remote metadata
    is searched for entries matching database+tool; the most recently
    created match's version is returned, or 'error' when the remote
    metadata is empty or nothing matches.
    """
    if args.version:
        return args.version
    remote_metadata = get_remote_metadata(args)
    matches = [entry for entry in remote_metadata
               if entry['name'] == args.database and entry['tool'] == args.tool]
    if not matches:
        return 'error'
    # newest entry wins when several versions of the same database exist
    return max(matches, key=lambda entry: entry['created'])['version']
def get_path_local_json(args):
    """Path of the local metadata file (dbman_metadata.json)."""
    local_dir = get_local_databases_directory(args)
    return path_maker(local_dir, 'dbman_metadata.json')
def get_path_remote_json(args):
    """S3 path of the remote metadata file (dbman_remote_metadata.json)."""
    remote_dir = get_remote_databases_directory(args)
    return path_maker(remote_dir, 'dbman_remote_metadata.json')
def get_local_metadata(args):
    """Load the local metadata list, creating an empty file on first use."""
    json_path = get_path_local_json(args)
    if os.path.isfile(json_path):
        with open(json_path) as f:
            return json.load(f)
    # first run: initialise the metadata file with an empty list
    with open(json_path, 'w') as f:
        json.dump([], f)
    return []
def save_local_metadata(args, metaliste):
    """Overwrite the local metadata file with *metaliste*."""
    json_path = get_path_local_json(args)
    with open(json_path, 'w') as f:
        json.dump(metaliste, f)
def get_remote_metadata(args):
    """Fetch and parse the remote metadata JSON from S3.

    Returns [] when the remote metadata file does not exist.  The file is
    downloaded into the local database directory via a helper script, read
    and then removed again.
    """
    if get_path_remote_json(args) not in get_remote_files():
        return []
    script = pkg_resources.resource_filename(__name__, "scripts/download_json.sh")
    subprocess.run([script, get_path_remote_json(args), get_local_databases_directory(args)])
    local_copy = path_maker(get_local_databases_directory(args), 'dbman_remote_metadata.json')
    with open(local_copy) as f:
        metadata = json.load(f)
    os.remove(local_copy)
    return metadata
def save_remote_metadata(args, metaliste):
    """Upload *metaliste* as the remote metadata JSON.

    Writes a temporary local copy, pushes it to S3 via a helper script and
    removes the temporary file afterwards.
    """
    local_dir = get_local_databases_directory(args)
    local_copy = path_maker(local_dir, 'dbman_remote_metadata.json')
    with open(local_copy, 'w') as f:
        json.dump(metaliste, f)
    script = pkg_resources.resource_filename(__name__, "scripts/upload_json.sh")
    subprocess.run([script, local_dir, 'dbman_remote_metadata.json', get_remote_databases_directory(args)])
    os.remove(local_copy)
def get_remote_filename(args, version):
    """S3 path of the tarball for this database/tool/version."""
    remote_dir = get_remote_databases_directory(args)
    return path_maker(remote_dir, get_tar_filename(args, version))
def get_remote_files():
    """Return every object path known to s3cmd ('s3cmd la'), whitespace-split."""
    listing = subprocess.run(['s3cmd', 'la'], capture_output=True, text=True)
    return listing.stdout.split()
def get_tar_filename(args, version):
    """Tarball file name for the tool-specific database directory."""
    return '{}.tar.gz'.format(get_tool_directory_name(args, version))
def create_tar_file_and_upload(args, version):
    """Pack the tool directory into a tar.gz and push it to S3 via a helper script."""
    script = pkg_resources.resource_filename(__name__, "scripts/upload_db.sh")
    local_dir = get_local_databases_directory(args)
    tar_name = get_tar_filename(args, version)
    tool_dir = get_tool_directory_name(args, version)
    remote_dir = get_remote_databases_directory(args)
    subprocess.run([script, local_dir, tar_name, tool_dir, remote_dir])
def prepare(args):
    """Download a raw database and convert it for the requested tool.

    Step 1 creates <database>_<version>_raw via the recipe's prepare
    script; step 2 creates <database>_<version>_<tool> via the tool's
    conversion script.  Each step is skipped when its target directory
    already exists.  Every created directory is recorded in the local
    metadata file.  Prints an error listing the valid choices when the
    database or tool name is unknown.
    """
    if args.database in data.keys():
        version = get_version(args)
        print('')
        raw_directory_path = path_maker(get_local_databases_directory(args), get_raw_directory_name(args, version))
        if not os.path.isdir(raw_directory_path):
            os.mkdir(raw_directory_path)
            # recipe-specific shell script downloads/extracts into the raw dir
            subprocess.run([data[args.database]['prepare'], raw_directory_path])
            metadata = get_local_metadata(args)
            details = {'name': args.database, 'tool': 'raw', 'version': version, 'created': str(date.today())}
            metadata.append(details)
            save_local_metadata(args, metadata)
            print("The {} file is in: ".format(args.database) + os.path.abspath(raw_directory_path))
        else:
            print('The {} file already exists in: '.format(args.database) + os.path.abspath(raw_directory_path))
        if args.tool in data[args.database]['tool'].keys():
            tool_dir = get_path_tool_directory(args, version)
            tool_file = path_maker(raw_directory_path, data[args.database]['filename'])
            tool_dir_file = path_maker(tool_dir, data[args.database]['filename'])
            if not os.path.isdir(tool_dir):
                os.mkdir(tool_dir)
                # temporary relative symlink so the conversion script sees the
                # raw file inside the tool directory; removed after conversion
                os.symlink(os.path.relpath(os.path.abspath(tool_file), tool_dir), tool_dir_file)
                subprocess.run([data[args.database]['tool'][args.tool], tool_dir, data[args.database]['filename']])
                print('The {} files are in: '.format(args.tool) + os.path.abspath(tool_dir))
                os.unlink(tool_dir_file)
                metadata = get_local_metadata(args)
                details = {'name': args.database, 'tool': args.tool, 'version': version, 'created': str(date.today())}
                metadata.append(details)
                save_local_metadata(args, metadata)
            else:
                print('The {} files are already exists in: '.format(args.tool) + os.path.abspath(tool_dir))
        else:
            print('Tool error. There are following possibility: {}'.format([tool for tool in data[args.database]['tool'].keys()]))
    else:
        print('Database error. There are following possibility: {}'.format([database for database in data.keys()]))
def upload(args):
    """Pack a locally prepared database as a tar.gz and push it to S3.

    Validates database and tool names ('raw' is accepted as tool), resolves
    the newest matching local version, skips the upload when the tarball
    already exists remotely, and mirrors the matching local metadata entry
    into the remote metadata file.
    """
    if args.database in data.keys():
        if args.tool in data[args.database]['tool'].keys() or args.tool == "raw":
            version = get_local_json_version(args)
            # 'error' when the data to upload was not found local
            if version != 'error':
                if get_remote_filename(args, version) not in get_remote_files():
                    create_tar_file_and_upload(args, version)
                    remote_metadata = get_remote_metadata(args)
                    # copy the matching local metadata entry to the remote index
                    for dictionnary in get_local_metadata(args):
                        if dictionnary['name'] == args.database and dictionnary['tool'] == args.tool and dictionnary['version'] == version:
                            remote_metadata.append(dictionnary)
                    save_remote_metadata(args, remote_metadata)
                else:
                    print('The {} files are already in {}'.format(get_tool_directory_name(args, version), get_remote_databases_directory(args)))
            else:
                print('There is no {} data to upload in {}. Prepare the database first'.format(args.database+' '+args.tool, os.path.abspath(get_local_databases_directory(args))))
        else:
            print('Tool error. There are following possibility: {}'.format([tool for tool in data[args.database]['tool'].keys()]))
    else:
        print('Database error. There are following possibility: {}'.format([database for database in data.keys()]))
def download(args):
    """Fetch a database tarball from S3 and unpack it locally.

    Validates database and tool names ('raw' is accepted as tool), resolves
    the newest matching remote version, skips the download when the tool
    directory already exists locally, and copies the matching remote
    metadata entry into the local metadata file.
    """
    if args.database in data.keys():
        if args.tool in data[args.database]['tool'].keys() or args.tool == "raw":
            version = get_remote_json_version(args)
            # 'error' when no matching entry exists in the remote metadata
            if version != 'error':
                if not os.path.isdir(get_path_tool_directory(args, version)):
                    download_file = path_maker(get_remote_databases_directory(args), get_tar_filename(args, version))
                    # helper script downloads, unpacks and removes the tarball
                    subprocess.run([pkg_resources.resource_filename(__name__, "scripts/download_db.sh"), download_file, get_local_databases_directory(args), get_tar_filename(args, version)])
                    local_metadata = get_local_metadata(args)
                    for dictionnary in get_remote_metadata(args):
                        if dictionnary['name'] == args.database and dictionnary['tool'] == args.tool and dictionnary['version'] == version:
                            local_metadata.append(dictionnary)
                    save_local_metadata(args, local_metadata)
                else:
                    print('{} is already in the local directory {}'.format(get_tool_directory_name(args, version), os.path.abspath(get_local_databases_directory(args))))
            else:
                print('There is no {} files to download in {}:'.format(args.database+' '+args.tool, get_remote_databases_directory(args)))
        else:
            print('Tool error. There are following possibility: {}'.format([tool for tool in data[args.database]['tool'].keys()]))
    else:
        print('Database error. There are following possibility: {}'.format([database for database in data.keys()]))
def delete(args):
    """Delete a prepared database locally or on S3, after confirmation.

    Prompts the user (Y/N); on 'y' removes the tool directory (local) or
    the tarball (s3) for the resolved version and drops the first matching
    entry from the corresponding metadata file.  Prints an error listing
    the valid choices when the database or tool name is unknown.
    """
    if args.database in data.keys():
        if args.tool in data[args.database]['tool'].keys() or args.tool == "raw":
            frage = input('Do you want to continue (Y/N)? ')
            if frage.lower() == 'y':
                if args.place == 'local':
                    version = get_local_json_version(args)
                    dbman_dir = get_local_databases_directory(args)
                    directory_to_delete = path_maker(dbman_dir, get_tool_directory_name(args, version))
                    if os.path.isdir(directory_to_delete):
                        shutil.rmtree(directory_to_delete)
                        metadata = get_local_metadata(args)
                        # remove only the first matching metadata entry
                        for position in range(0, len(metadata)):
                            if metadata[position]['name'] == args.database and metadata[position]['tool'] == args.tool and metadata[position]['version'] == version:
                                del metadata[position]
                                break
                        save_local_metadata(args, metadata)
                        print("The {} files were successfully delete from: ".format(get_tool_directory_name(args, version)) + os.path.abspath(dbman_dir))
                    else:
                        print("The {} files aren't existing in: ".format(get_tool_directory_name(args, version)) + os.path.abspath(dbman_dir))
                elif args.place == 's3':
                    version = get_remote_json_version(args)
                    web_store = get_remote_databases_directory(args)
                    web_file = path_maker(web_store, get_tar_filename(args, version))
                    if web_file in get_remote_files():
                        subprocess.run([pkg_resources.resource_filename(__name__, "scripts/delete_remote_file.sh"), web_file])
                        metadata = get_remote_metadata(args)
                        # remove only the first matching metadata entry
                        for position in range(0, len(metadata)):
                            if metadata[position]['name'] == args.database and metadata[position]['tool'] == args.tool and metadata[position]['version'] == version:
                                del metadata[position]
                                break
                        save_remote_metadata(args, metadata)
                        print("The {} files were successfully delete from: ".format(get_tool_directory_name(args, version)) + web_store)
                    else:
                        print("The {} files aren't existing in: ".format(get_tool_directory_name(args, version)) + web_store)
            else:
                print('delete canceled')
        else:
            print('Tool error. There are following possibility: {}'.format([tool for tool in data[args.database]['tool'].keys()]))
    else:
        print('Database error. There are following possibility: {}'.format([database for database in data.keys()]))
def list_recipes(args):
    """Print every supported database together with its available tools."""
    for database, recipe in data.items():
        print('{}:{}'.format(database, list(recipe['tool'].keys())))
def list_local_databases(args):
    """Print name, tool, version and creation date of each local database."""
    for entry in get_local_metadata(args):
        print('\n{}[{}] Version: {} erstellt am: {}'.format(entry['name'], entry['tool'], entry['version'], entry['created']))
def list_remote_databases(args):
    """Print name, tool, version and creation date of each remote database."""
    for entry in get_remote_metadata(args):
        print('\n{}[{}] Version: {} erstellt am: {}'.format(entry['name'], entry['tool'], entry['version'], entry['created']))
# Recipe table for all supported databases.  Each entry maps a database
# name to:
#   'prepare'  - shell script that downloads and extracts the raw files
#   'tool'     - mapping of tool name -> conversion shell script
#   'filename' - the primary file inside the raw download directory
#   'version'  - callable returning the current upstream release string
data = {'swissprot': {'prepare': pkg_resources.resource_filename(__name__, "scripts/prepare_swissprot.sh"),
                      'tool': {'blast': pkg_resources.resource_filename(__name__, "scripts/blast_db.sh"), 'ghostx': pkg_resources.resource_filename(__name__, "scripts/ghostx_db.sh")},
                      'filename': 'uniprot_sprot.fasta',
                      'version': get_swissprot_version
                      },
        'pfam': {'prepare': pkg_resources.resource_filename(__name__, "scripts/prepare_pfam.sh"),
                 'tool': {'hmmer': pkg_resources.resource_filename(__name__, "scripts/hmmer_pfam.sh")},
                 'filename': 'Pfam-A.hmm',
                 'version': get_pfam_version
                 },
        'card': {'prepare': pkg_resources.resource_filename(__name__, "scripts/prepare_card.sh"),
                 'tool': {'blast': pkg_resources.resource_filename(__name__, "scripts/blast_db.sh"), 'ghostx': pkg_resources.resource_filename(__name__, "scripts/ghostx_db.sh")},
                 'filename': 'protein_fasta_protein_homolog_model.fasta',
                 'version': get_card_version
                 }
        }
def main():
    """CLI entry point: ensure the default database directory exists, then dispatch."""
    if not os.path.isdir('local_databases'):
        os.mkdir('local_databases')
    parsed_args = myparser()
    parsed_args.func(parsed_args)


if __name__ == '__main__':
    main()
#!/bin/bash
# Build a BLAST protein database from a FASTA file.
#   $1 - directory containing the FASTA file
#   $2 - FASTA file name
# Quoting guards against paths with spaces; the cd guard prevents running
# makeblastdb in the wrong directory when the target does not exist.
TOOL_DIRECTORY="$1"
TOOL_FILE="$2"
cd "$TOOL_DIRECTORY" || exit 1
makeblastdb -dbtype prot -in "$TOOL_FILE"
cd -
\ No newline at end of file
#!/bin/bash
# Delete one object from S3.
#   $1 - full s3:// path of the file to delete
# Quoted expansion guards against unexpected characters in the path.
REMOTE_FILE="$1"
s3cmd del "$REMOTE_FILE"
\ No newline at end of file
#!/bin/bash
# Download a database tarball from S3, unpack it and remove the tarball.
#   $1 - full s3:// path of the tarball
#   $2 - local database directory to unpack into
#   $3 - tarball file name
# Quoting guards against paths with spaces; the cd guard prevents
# downloading/unpacking in the wrong directory when $2 does not exist.
REMOTE_TARFILE="$1"
LOCAL_DATABASE_DIRECTORY="$2"
TARFILE="$3"
cd "$LOCAL_DATABASE_DIRECTORY" || exit 1
s3cmd get "$REMOTE_TARFILE"
tar -xzvf "$TARFILE"
rm "$TARFILE"
cd -
\ No newline at end of file
#!/bin/bash
# Download the remote metadata JSON from S3 into the local database directory.
#   $1 - full s3:// path of the metadata file
#   $2 - local database directory
# Quoting guards against paths with spaces; the cd guard prevents the
# download landing in the wrong directory when $2 does not exist.
REMOTE_FILE="$1"
LOCAL_DATABASE_DIRECTORY="$2"
cd "$LOCAL_DATABASE_DIRECTORY" || exit 1
s3cmd get "$REMOTE_FILE"
cd -
\ No newline at end of file
#!/bin/bash
# Build a GHOSTX database from a FASTA file.
#   $1 - directory containing the FASTA file
#   $2 - FASTA file name
# Quoting guards against paths with spaces; the cd guard prevents running
# ghostx in the wrong directory when the target does not exist.
TOOL_DIRECTORY="$1"
RAW_FILE="$2"
cd "$TOOL_DIRECTORY" || exit 1
ghostx db -i "$RAW_FILE" -o ghostx_db
cd -
\ No newline at end of file
#!/bin/bash
# Compress/index an HMM database for hmmscan with hmmpress.
#   $1 - directory containing the HMM file
#   $2 - HMM file name
# Quoting guards against paths with spaces; the cd guard prevents running
# hmmpress in the wrong directory when the target does not exist.
TOOL_DIRECTORY="$1"
RAW_FILE="$2"
cd "$TOOL_DIRECTORY" || exit 1
hmmpress "$RAW_FILE"
cd -
\ No newline at end of file
#!/bin/bash
# Download and extract the CARD database archive into the given directory.
#   $1 - target directory for the raw CARD files
# Quoting guards against paths with spaces; the cd guard prevents the
# download landing in the wrong directory when $1 does not exist.
LOCAL_DATABASE_DIRECTORY="$1"
cd "$LOCAL_DATABASE_DIRECTORY" || exit 1
wget --content-disposition https://card.mcmaster.ca/latest/data
tar xfa card-data.tar.bz2
rm card-data.tar.bz2
cd -
\ No newline at end of file
#!/bin/bash
LOCAL_DATABASE_DIRECTORY=$1