Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

34 commits on source · 34 files changed: +1294 −62
Files

.gitlab-ci.yml

0 → 100644
+96 −0
stages:
    - artifacts

variables:
  CI_REGISTRY_IMAGE: harbor.computational.bio.uni-giessen.de/psos/psot
  VERSIONLABELMETHOD: "OnlyIfThisCommitHasVersion" # options: "OnlyIfThisCommitHasVersion","LastVersionTagInGit"
  ADDITIONALTAGLIST: latest
  IMAGE_LABELS: >
    --label org.opencontainers.image.vendor=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label org.opencontainers.image.authors=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label org.opencontainers.image.revision=$CI_COMMIT_SHA
    --label org.opencontainers.image.source=$CI_PROJECT_URL
    --label org.opencontainers.image.documentation=$CI_PROJECT_URL
    --label org.opencontainers.image.licenses=$CI_PROJECT_URL
    --label org.opencontainers.image.url=$CI_PROJECT_URL
    --label vcs-url=$CI_PROJECT_URL
    --label com.gitlab.ci.user=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label com.gitlab.ci.email=$GITLAB_USER_EMAIL
    --label com.gitlab.ci.tagorbranch=$CI_COMMIT_REF_NAME
    --label com.gitlab.ci.pipelineurl=$CI_PIPELINE_URL
    --label com.gitlab.ci.commiturl=$CI_PROJECT_URL/commit/$CI_COMMIT_SHA
    --label com.gitlab.ci.cijoburl=$CI_JOB_URL
    --label com.gitlab.ci.mrurl=$CI_PROJECT_URL/-/merge_requests/$CI_MERGE_REQUEST_ID

build-and-deploy-docker-image:
  stage: artifacts
  extends: .build-docker-image

get-latest-git-version:
  stage: .pre
  image:
    name: alpine/git
    entrypoint: [""]
  rules:
    - if: '$VERSIONLABELMETHOD == "LastVersionTagInGit"'
  script:
    - |
      echo "the Google kaniko container does not have git and does not have a package manager to install it"
      git clone https://github.com/GoogleContainerTools/kaniko.git
      cd kaniko
      echo "$(git describe --abbrev=0 --tags)" > ../VERSIONTAG.txt
      echo "VERSIONTAG.txt contains $(cat ../VERSIONTAG.txt)"
  artifacts:
    paths:
      - VERSIONTAG.txt

# taken from https://gitlab.com/guided-explorations/containers/kaniko-docker-build/-/blob/master/.gitlab-ci.yml
.build-docker-image:
  image:
    name: gcr.io/kaniko-project/executor:debug
    entrypoint: [""]
  script:
    - |
      echo "Building and shipping image to $CI_REGISTRY_IMAGE"
      #Build date for opencontainers
      BUILDDATE="'$(date '+%FT%T%z' | sed -E -n 's/(\+[0-9]{2})([0-9]{2})$/\1:\2/p')'" #rfc 3339 date
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.created=$BUILDDATE --label build-date=$BUILDDATE"
      #Description for opencontainers
      BUILDTITLE=$(echo $CI_PROJECT_TITLE | tr " " "_")
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.title=$BUILDTITLE --label org.opencontainers.image.description=$BUILDTITLE"
      #Add ref.name for opencontainers
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.ref.name=$CI_REGISTRY_IMAGE:${CI_COMMIT_REF_NAME//\//_}"

      #Build Version Label and Tag from git tag, LastVersionTagInGit was placed by a previous job artifact
      if [[ "$VERSIONLABELMETHOD" == "LastVersionTagInGit" ]]; then VERSIONLABEL=$(cat VERSIONTAG.txt); fi
      if [[ "$VERSIONLABELMETHOD" == "OnlyIfThisCommitHasVersion" ]]; then VERSIONLABEL=$CI_COMMIT_TAG; fi
      if [[ ! -z "$VERSIONLABEL" ]]; then
        IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.version=$VERSIONLABEL"
        ADDITIONALTAGLIST="$ADDITIONALTAGLIST $VERSIONLABEL"
      fi

      ADDITIONALTAGLIST="$ADDITIONALTAGLIST $CI_COMMIT_REF_NAME $CI_COMMIT_SHORT_SHA"
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then ADDITIONALTAGLIST="$ADDITIONALTAGLIST latest"; fi
      if [[ -n "$ADDITIONALTAGLIST" ]]; then
        for TAG in $ADDITIONALTAGLIST; do
          TAG=${TAG//\//_} # replace slashes with underscore in tag
          FORMATTEDTAGLIST="${FORMATTEDTAGLIST} --tag $CI_REGISTRY_IMAGE:$TAG ";
        done;
      fi
      if [[ -n "$ADDITIONALIMAGENAMES" ]]; then
        for TAG in $ADDITIONALIMAGENAMES; do
          FORMATTEDTAGLIST="${FORMATTEDTAGLIST} --tag $TAG ";
        done;
      fi

      #Reformat Docker tags to kaniko's --destination argument:
      FORMATTEDTAGLIST=$(echo "${FORMATTEDTAGLIST}" | sed s/\-\-tag/\-\-destination/g)

      mkdir -p /kaniko/.docker
      PROJECTDIR=$CI_PROJECT_DIR
      if [[ ! -z "$SUB_DIRECTORY" ]]; then
        PROJECTDIR=$CI_PROJECT_DIR/$SUB_DIRECTORY
      fi
      echo "{\"auths\":{\"$CI_REGISTRY\":{\"auth\":\"$(echo -n $CI_REGISTRY_USER:$CI_REGISTRY_PASSWORD | base64 | tr -d '\n')\"}}}" > /kaniko/.docker/config.json
      /kaniko/executor --context $PROJECTDIR --dockerfile $PROJECTDIR/Dockerfile $FORMATTEDTAGLIST $IMAGE_LABELS
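
For illustration, the tag handling above boils down to a simple string transformation. Here is a standalone Python sketch (the registry path and tag list are placeholders, not the project's real values):

# Standalone sketch of the shell tag loop and the sed substitution above.
registry_image = "registry.example.org/psos/psot"  # placeholder
tags = ["latest", "0.4.1", "feature/dbxrefs-list", "abc1234"]

flags = " ".join(
    # ${TAG//\//_}: slashes in branch names become underscores
    "--destination {}:{}".format(registry_image, tag.replace("/", "_"))
    for tag in tags
)
print(flags)
# --destination registry.example.org/psos/psot:latest --destination ...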
+2 −0
Lukas Jelonek <lukas.jelonek@computational.bio.uni-giessen.de>
Marc Weingärtner <marc.weingaertner@bioinfsys.uni-giessen.de>
heiko-mueller <Heiko.Mueller@bioinfsys.uni-giessen.de>
lmueller <lion.mueller@bio.uni-giessen.de>
mweingae <marc.weingaertner@bioinfsys.uni-giessen.de>

Dockerfile

0 → 100644
+21 −0
FROM ubuntu:20.04

ENV DEBIAN_FRONTEND=noninteractive \
    NXF_VERSION=21.10.6
RUN apt-get update && \
    apt-get install -y python3-pip python3-setuptools python3-sphinx docker.io openjdk-11-jdk-headless wget

RUN wget https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VERSION}/nextflow-${NXF_VERSION}-all && \
    chmod +x nextflow-${NXF_VERSION}-all && \
    mv nextflow-${NXF_VERSION}-all /usr/local/bin/ && \
    ln -s /usr/local/bin/nextflow-${NXF_VERSION}-all /usr/local/bin/nextflow

RUN pip3 install --upgrade git+https://git.computational.bio.uni-giessen.de/SOaAS/dbxref.git 

COPY ./ /opt/psot/
RUN pip3 install --upgrade /opt/psot/
ENV PSOT_REPOSITORIES=/opt/psot/default_repo/

# Use the image as follows:
# docker run -it -v /tmp:/tmp -v $PWD:$PWD -v /var/run/docker.sock:/var/run/docker.sock psot-image psot analyze -f $PWD/example/single_sequence.faa -o $PWD/result/ -p fast --docker
CMD psot info
# Module manifest for the hmmscan against ECFexpress groups analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_ecfgroups'

# Short description of the analysis
info: 'hmmscan analysis against ECFexpress groups'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'ecfexpress/ECFgroups.hmm'
        evalue: 1e-10
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
        singularity: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'ECF'
# Module manifest for the hmmscan against ECFexpress subgroups analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_ecfsubgroups'

# Short description of the analysis
info: 'hmmscan analysis against ECFexpress subgroups'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'ecfexpress/ECFsubgroups.hmm'
        evalue: 1e-10
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
        singularity: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'ECF'
# Module manifest for the hmmscan against sORFdb small protein family analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_sorfdb'

# Short description of the analysis
info: 'hmmscan analysis against sORFdb small protein families'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'sorfdb/sorfdb.1.0.hmm'
        ga: 'True'
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'proteogenomicsworkflow/hmmer:3.4'
        singularity: 'proteogenomicsworkflow/hmmer:3.4'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'sORFdb'
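
The module manifests above all follow the same schema (name, info, analysis, converter). As a hedged sketch, reading one with PyYAML could look like this (the filename is hypothetical and PSOT's own loader may differ):

# Illustrative only: load a module manifest like the ones above.
import yaml  # assumes PyYAML is available

with open("hmmer_sorfdb.yml") as f:  # hypothetical filename
    manifest = yaml.safe_load(f)

print(manifest["name"], "-", manifest["info"])
print("analysis script:", manifest["analysis"]["script"])
print("docker image:", manifest["analysis"]["container"]["docker"])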
+26 −0
# Module manifest for the Pepstats analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles.
name: 'pepstats'

# Short description of the analysis.
info: 'Calculates statistics of protein properties'

# The configuration of the script for the analysis step.
analysis:
    # script must take a --fasta parameter
    script: 'run_pepstats.py'
    # specify additional default configuration here
    parameters:
    # run script in a container
    container:
      docker: 'biocontainers/emboss:v6.6.0dfsg-7b1-deb_cv1'
      singularity: 'biocontainers/emboss:v6.6.0dfsg-7b1-deb_cv1'

# The configuration of the script for the json conversion step.
converter:
    # script must take a --result parameter, which is the result from the analysis step
    script: 'convert_pepstats.py'
    # specify additional default configuration here
    parameters:
+11 −0
name: 'bacteria-ecf'
info: 'Profile for ecfexpress calculation'
modules:
  include_sequence:
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
    evalue: 1e-5
  hmmer_ecfgroups:
  hmmer_ecfsubgroups:
  pepstats:
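
Note that a parameter listed under a module in a profile, such as the evalue: 1e-5 under hmmer_pfam_a above, overrides that module's manifest default. Conceptually the merge works like this sketch (both dictionaries and the database path are invented; this is not PSOT's actual merge code):

# Illustrative only: a profile parameter shadowing a manifest default.
manifest_params = {"database": "pfam/Pfam-A.hmm", "evalue": "1e-10"}  # invented
profile_params = {"evalue": "1e-5"}

merged = {**manifest_params, **profile_params}
print(merged)  # {'database': 'pfam/Pfam-A.hmm', 'evalue': '1e-5'}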
name: 'bacteria-gram+'
info: 'Profile for gram-positive bacteria'
modules:
  include_sequence:
  signalp:
    organism: gram+
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
name: 'bacteria-gram-'
info: 'Profile for gram-negative bacteria'
modules:
  include_sequence:
  signalp:
    organism: gram-
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
name: 'bacteria-sorfdb'
info: 'Profile for sORFdb HMM family search'
modules:
  include_sequence:
  hmmer_sorfdb:
@@ -5,3 +5,5 @@ modules:
  signalp:
  blastp_swissprot:
  hmmer_pfam_a:
  tmhmm:
  pepstats:
name: 'complete'
info: 'Profile that uses all available tools'
modules:
  include_sequence:
  signalp:
  ghostx_swissprot:
  hmmer_pfam_a:
  targetp:
    organism_group: 'non-plant'
  tmhmm:
  pepstats:
@@ -8,3 +8,4 @@ modules:
  hmmer_pfam_a:
  targetp:
    organism_group: 'non-plant'
  tmhmm:
name: 'eukaryote-plant'
info: 'Profile for plants'
modules:
  include_sequence:
  signalp:
    organism: euk
  tmhmm:
  targetp:
    organism_group: plant
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
+12 −0
name: 'eukaryote'
info: 'Profile for eukaryotes'
modules:
  include_sequence:
  signalp:
    organism: euk
  tmhmm:
  targetp:
    organism_group: non-plant
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
name: 'fast'
info: 'Profile that contains tools that give a fast result'
modules:
  include_sequence:
  ghostx_swissprot:
  signalp:
    organism: 'euk'
  tmhmm:
  pepstats:
@@ -52,11 +52,11 @@ with open(filename) as f:
            if '% identity' in header:
                result['target']["percent_identity"] = float(split[header['% identity']])
            if 'q. start' in header and 'q. end' in header:
                result['query']['start'] = split[header['q. start']]
                result['query']['end'] = split[header['q. end']]
                result['query']['start'] = int(split[header['q. start']])
                result['query']['end'] = int(split[header['q. end']])
            if 's. start' in header and 's. end' in header:
                result['target']['start'] = split[header['s. start']]
                result['target']['end'] = split[header['s. end']]
                result['target']['start'] = int(split[header['s. start']])
                result['target']['end'] = int(split[header['s. end']])
            if 'evalue' in header:
                result['target']['evalue'] = float(split[header['evalue']])
            if 'BTOP' in header:
@@ -43,15 +43,16 @@ with open(result_filename) as f:
        elif args.acc_split:
            accession = accession.split(args.acc_split)[0]
        result['query'] = {
                'start': split[6],
                'end': split[7]
                'start': int(split[6]),
                'end': int(split[7])
                }
        result['target'] = {
                'dbxref': args.dbxref + ':' + accession,
                'start': split[8],
                'end': split[9],
                'start': int(split[8]),
                'end': int(split[9]),
                'evalue': float(split[10]),
                'percent_identity': float(split[2])
                'percent_identity': float(split[2]),
                'score': float(split[11])
                }

        results.append(result)
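
For orientation, the 0-based indices used above line up with the standard 12-column tabular layout (BLAST outfmt 6, which the GHOSTX tabular output follows as well). A small sketch with an invented row:

# Sketch: standard tabular columns behind the indices above (row invented).
COLUMNS = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
           "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

row = "q1\tsp|P12345|NAME\t98.5\t120\t2\t0\t1\t120\t5\t124\t1e-50\t250.0"
print(dict(zip(COLUMNS, row.split("\t"))))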
@@ -48,21 +48,29 @@ with open(filename) as f:
                    print('HMMER converter: Query ID "' + query_id + '" not found among initial queries!')
            results = documents[query_id]['computations'][0]["results"]

            dbxref = None
            if split[1] == "-":
                dbxref = args.dbxref + ":" + split[0]
            else:
                dbxref = args.dbxref + ":" + split[1]
            results.append({
                'target': {
                    'name': split[0],
                    'dbxref': args.dbxref + ':' + split[1],
                    'length': split[2],
                    'score': float(split[13]),
                    'start': int(split[15]),
                    'end': int(split[16]),
                    'description': split[22].rstrip()
                    'name': split[0], # target name
                    'dbxref': dbxref, 
                    'length': split[2], # tlen
                    'score': float(split[13]), # this domain score
                    'bias': float(split[14]), # this domain bias
                    'evalue': float(split[12]), # this domain i-Evalue
                    'start': int(split[15]), # hmm coord from
                    'end': int(split[16]), # hmm coord to
                    'acc': float(split[21]), # acc
                    'description': split[22].rstrip() # description of target
                    },
                'query': {
                    'start': int(split[17]),
                    'end': int(split[18]),
                    'envelop_start': int(split[19]),
                    'envelop_end': int(split[20])
                    'start': int(split[17]), # ali coord from
                    'end': int(split[18]), # ali coord to
                    'envelop_start': int(split[19]), # env coord from
                    'envelop_end': int(split[20]) # env coord to
                    }
                })
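
The column comments added above follow hmmscan's --domtblout layout. As a reference sketch (field names are from the HMMER user guide; the helper itself is hypothetical, not project code):

# Hypothetical helper: --domtblout fields behind the indices used above.
FIELDS = {
    0: "target name",
    1: "target accession ('-' if absent)",
    2: "tlen",
    12: "i-Evalue (this domain)",
    13: "score (this domain)",
    14: "bias (this domain)",
    15: "hmm from", 16: "hmm to",
    17: "ali from", 18: "ali to",
    19: "env from", 20: "env to",
    21: "acc",
    22: "description of target",
}

def describe(line):
    split = line.split(None, 22)  # maxsplit keeps the description intact
    return {name: split[i] for i, name in FIELDS.items() if i < len(split)}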

+111 −0
#!/usr/bin/env python3

import json
import argparse

parser = argparse.ArgumentParser(description='Convert pepstats results to json documents')
parser.add_argument('--result', '-r', required=True, help='The pepstats result file')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()

filename = args.result
documents = {}
residue = False
propertyv = False
with open(filename) as r:
    # iterate over the results
    for line in r:
        if line.startswith('PEPSTATS'):
            line = line.strip().split()
            seq_id = line[2]
            if seq_id not in documents:
                documents[seq_id] = {
                    "id": seq_id,
                    "computations": []
                }
                computation = {
                    'tool': {'name': 'Pepstats', 'version': 'EMBOSS:6.6.0.0'},
                    'results': []
                }
                result = {}
        # store results for the whole sequence
        elif line.startswith('Molecular'):
            line = line.strip().split()
            molecularw = line[3]
            residues = line[6]
            result['Molecular weight'] = float(molecularw)
            result['Residues'] = float(residues)
        elif line.startswith('Average'):
            line = line.strip().split()
            averagerw = line[4]
            charge = line[7]
            result['Average Residue Weight'] = float(averagerw)
            result['Charge'] = float(charge)
        elif line.startswith('Isoelectric'):
            line = line.strip().split()
            iso = line[3]
            result['Isoelectric point'] = float(iso)
        elif line.startswith('Improbability'):
            line = line.strip().split()
            probabilityib = 1 - float(line[7])
            result['Probability of expression in inclusion bodies'] = probabilityib
        # store per-residue (amino acid) results
        elif residue == False and line.startswith('Residue'):
            residue = True
        elif residue == True:
            line = line.strip().split()
            if line == []:
                residue = False
            else:
                residueS = line[2]
                number = line[3]
                mole = line[4]
                dayhoff = line[5]
                if 'Amino acids' not in result:
                    result['Amino acids'] = []
                amino = {}
                amino['Residue'] = residueS
                amino['Number'] = int(number)
                amino['Mole%'] = float(mole)
                amino['DayhoffStat'] = float(dayhoff)
                result['Amino acids'].append(amino)
        # store results for the amino-acid property classes
        elif propertyv == False and line.startswith('Property'):
            propertyv = True
        elif propertyv == True:
            line = line.strip().split()
            if line == []:
                propertyv = False
                computation['results'].append(result)
                documents[seq_id]['computations'].append(computation)
            else:
                propertyd = line[0]
                residuesd = line[1]
                numberd = line[2]
                moled = line[3]
                if 'Physico-chemical class' not in result:
                    result['Physico-chemical class'] = []
                pcc = {}
                pcc['Property'] = propertyd
                pcc['Residues'] = residuesd
                pcc['Number'] = int(numberd)
                pcc['Mole%'] = float(moled)
                result['Physico-chemical class'].append(pcc)

with open(args.output, 'w') as o:
    json.dump(documents, o)
 No newline at end of file
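
For reference, the structure this converter writes looks roughly like the following (all values invented for illustration):

# Rough shape of the emitted JSON (values invented).
example = {
    "seq1": {
        "id": "seq1",
        "computations": [{
            "tool": {"name": "Pepstats", "version": "EMBOSS:6.6.0.0"},
            "results": [{
                "Molecular weight": 4438.85,
                "Residues": 39.0,
                "Average Residue Weight": 113.8,
                "Charge": 2.5,
                "Isoelectric point": 9.9,
                "Probability of expression in inclusion bodies": 0.256,
                "Amino acids": [{"Residue": "Ala", "Number": 2,
                                 "Mole%": 5.128, "DayhoffStat": 0.597}],
                "Physico-chemical class": [{"Property": "Tiny",
                                            "Residues": "(A+C+G+S+T)",
                                            "Number": 8, "Mole%": 20.513}],
            }],
        }],
    },
}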
@@ -32,9 +32,9 @@ with open(filename) as f:
                for field in field_index:
                    if field_index[field] is not None:
                        if field == "Loc":
                            results[field.lower()] = loc_dict[split[field_index[field]]]
                            results['Localization'] = loc_dict[split[field_index[field]]]
                        else:
                            results[field.lower()] = float(split[field_index[field]])
                            results[field] = float(split[field_index[field]])
        else:
            if line.startswith('Name   '):
                split = line.split()
@@ -8,6 +8,7 @@ ghostx_tool = 'ghostx'
parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--max_hits', '-m', type=int, default=20, help='Maximal number of reported hits')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json, results.tsv and enum_headers.tsv.')
args = parser.parse_args()

@@ -22,4 +23,4 @@ print(" VERSION=\$(ghostx 2>&1 | grep -Eo '[[:digit:]]\\\\.[[:digit:]]\\\\.[[
toolconfig = '{\\\\"name\\\\": \\\\"ghostx\\\\", \\\\"database\\\\":' \
            + '\\\\"' + args.database + '\\\\"' + ', \\\\"version\\\\":\\\\"\$VERSION\\\\" }'
print('    echo ' + toolconfig + ' > ' + args.output + '/info.json')
print('    ' + ghostx_tool + " aln -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
print('    ' + ghostx_tool + " aln -v 5 -b " + str(args.max_hits) + " -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
@@ -11,12 +11,15 @@ parser.add_argument('--fasta', '-f', required=True, help='A fasta file with amin
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result directory')
parser.add_argument('--evalue', '-e', default='0.0001', help='Evalue cutoff')
parser.add_argument('--ga', '-g', default=False, type=bool, 
                    help="Use profile's GA gathering cutoffs to set all thresholding")
args = parser.parse_args()

print('mkdir -p ' + args.output)

cutoff = " --cut_ga " if args.ga else f" -E {args.evalue} "
print(hmmscan_tool + 
        " -E " + args.evalue + 
        cutoff +
        " -o " + args.output + "/hmmscan.out " + 
        " --tblout " + args.output + "/tblout.tsv "  + 
        " --domtblout " + args.output + "/domtblout.tsv " + 
+13 −0
#!/usr/bin/env python3
import argparse

pepstats_tool = 'pepstats'

parser = argparse.ArgumentParser(description='Calculates statistics of protein properties')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file')
parser.add_argument('--output', '-o', required=True, help='An output file')
args = parser.parse_args()

print(pepstats_tool, args.fasta, args.output)
#!/bin/bash
set -uexo pipefail

SCRIPT=$(../scripts/run_targetp.py --fasta data/fasta/proteins.fas --output data/targetp/ --organism_group non-plant)
eval "$SCRIPT"

../scripts/convert_targetp.py --result data/targetp/ --output data/targetp/converted.json

example/ecf.fas

0 → 100644
+4 −0
>tr|Q189L9|Q189L9_PEPD6 Extracytoplasmic function (ECF) sigma factor csfT OS=Peptoclostridium difficile (strain 630) OX=272563 GN=csfT PE=3 SV=1
MDKTTFTNNILESEQTLYRVSKSILGNDQDCEDAVNNAILKAYEKLDSLKEEQYFKTWLI
RIVINECNSLRRKRLKSLSFEDVFKNKKIDEKDDYSDLYTAIQSLPKKIKIPIVLYYIEG
YSVDEVKEILDIPQGTVKSRLSRGRRLLKTKLENTEVII
+10 −10
@@ -13,25 +13,25 @@ def dbxrefs_from_document(document):
    return set(refs)
  return set([])

def combine_with_documents(documents, informations):
def combine_with_documents(documents, data):
  for k in documents:
    combine_with_document2(documents[k], informations)
    combine_with_document2(documents[k], data)

def combine_with_document(document, informations):
def combine_with_document(document, data):
  '''adds an information tag for each result that references a dbxref'''
  if 'computations' in document:
    for c in document['computations']:
      for r in c['results']:
        if 'target' in r and 'dbxref' in r['target']:
          r['informations'] = next(entry for entry in informations if entry['id'] == r['target']['dbxref'])
          r['dbxrefs'] = next(entry for entry in data if entry['id'] == r['target']['dbxref'])

def combine_with_document2(document, informations):
def combine_with_document2(document, data):
    '''adds an information field to the document that contains all referenced
    dbxrefs present in informations within the document'''
    if not 'informations' in document:
        document['informations'] = {}
    dbxrefs present in data within the document'''
    if not 'dbxrefs' in document:
        document['dbxrefs'] = []
    dbxrefs = dbxrefs_from_document(document)
    for info in informations:
    for info in data:
        if info['id'] in dbxrefs:
            document['informations'][info['id']] = info
            document['dbxrefs'].append(info)
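
After this change each referenced entry lands in a dbxrefs list on the document instead of an informations map keyed by id; schematically (values invented):

# Schematic document shape after combine_with_document2 (values invented).
document = {
    "id": "seq1",
    "computations": [
        {"results": [{"target": {"dbxref": "SP:P12345"}}]},
    ],
    # before: document["informations"] = {"SP:P12345": {...}}
    "dbxrefs": [{"id": "SP:P12345", "definition": "example entry"}],
}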
        
@@ -10,6 +10,7 @@ parser.add_argument('--tsv', '-t', required=True, help='The mapping file in tsv
parser.add_argument('--output', '-o', required=True, help='The result json document file')
args = parser.parse_args()

id_to_description = {}
seq_id_dict = {}
docs_enumerated = {}

@@ -20,8 +21,12 @@ with open(args.json) as j:
# read mapping
with open(args.tsv) as h:
    for line in h:
        num, id = line.strip().split('\t')
        split = line.strip().split('\t')
        num, id = (split[0], split[1])
        seq_id_dict[num] = id
        if len(split) == 3:
            description = split[2]
            id_to_description[id] = description

documents_restored = {}

@@ -29,6 +34,8 @@ for num in docs_enumerated:
    seq_id = seq_id_dict[num]
    doc = docs_enumerated[num]
    doc["id"] = seq_id
    if seq_id in id_to_description:
        doc["description"] = id_to_description[seq_id]
    documents_restored[seq_id] = doc

with open(args.output, 'w') as o:
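
The mapping TSV may now carry an optional third column holding the original header description; a minimal standalone sketch of the parsing added above (the sample line is invented):

# Minimal sketch of the optional description column (sample line invented).
line = "1\tseq1\thypothetical protein\n"
split = line.strip().split("\t")
num, seq_id = split[0], split[1]
description = split[2] if len(split) == 3 else None
print(num, seq_id, description)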
+21 −9
@@ -14,26 +14,30 @@ from psot.nextflow import setup_execution_directory, execute_analysis
__version__ = VersionInfo('psot').semantic_version().release_string()

def main():
    parser = argparse.ArgumentParser(description='PSOT - Proteins sequence observation tool\nVersion ' + __version__ +'\nGather informations about proteins.', formatter_class=argparse.RawTextHelpFormatter)
    parser = argparse.ArgumentParser(description='PSOT - Proteins sequence observation tool\nVersion ' + __version__ +'\nGather data about proteins.', formatter_class=argparse.RawTextHelpFormatter)
    parser.set_defaults(func=help)

    subparsers = parser.add_subparsers()
    info_parser = subparsers.add_parser('info')
    info_parser.add_argument('--listanalyses', '-l', action='store_true', help='Show available analysis steps')
    info_parser.add_argument('--list_analyses', '-l', action='store_true', help='Show available analysis steps')
    info_parser.set_defaults(func=info)

    analyze_parser = subparsers.add_parser('analyze')
    executions = analyze_parser.add_mutually_exclusive_group()

    analyze_parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
    analyze_parser.add_argument('--output', '-o', required=True, help='The output directory for the json documents')
    analyze_parser.add_argument('--profile', '-p', default='fast', help='The profile to use')
    analyze_parser.add_argument('--live', '-l', action='store_true', help='Report results as they are computed, not only at the end of the computation. The live results will be available in $output/live.')
    analyze_parser.add_argument('--config', '-c', help='The config to use')
    analyze_parser.add_argument('--fetch_informations', '-i', action='store_true', help='Fetch informations')
    analyze_parser.add_argument('--fetch_dbxrefs', '-F', action='store_true', help='Fetch associated dbxrefs')
    analyze_parser.add_argument('--debug', '-d', action='store_true', help='Debug mode, computation directory will not be removed after computation')
    analyze_parser.add_argument('--execution_dir', '-e', help='Use the specified execution directory and do not delete it after the computation')
    analyze_parser.add_argument('--use_cluster', '-C', action='store_true', help='Use compute cluster for execution')
    executions.add_argument('--k8s', '-k', action='store_true', help='Use kubernetes for execution')
    executions.add_argument('--use_cluster', '-C', action='store_true', help='Use compute cluster for execution')
    analyze_parser.add_argument('--k8s_config', help='The k8s nextflow config to use')
    analyze_parser.add_argument('--download_databases', '-b', action='store_true', help='Download databases if they are not present yet')
    analyze_parser.add_argument('--generate_only', '-g', action='store_true', help='Only generate workflow directory without executing it')

    images = analyze_parser.add_mutually_exclusive_group()
    images.add_argument('--docker', '-D', action='store_true', help='Use docker image for computation')
@@ -66,9 +70,12 @@ def analyze(args, config):
      download_databases(execution)

    setup_execution_directory(execution)
    if not args.generate_only: 
      error_code = execute_analysis(execution)
      cleanup(execution)
      exit(error_code)
    else:
      exit(0)

def cleanup(execution):
    if not execution['debug']:
@@ -77,8 +84,11 @@ def cleanup(execution):
def generate_execution(config, args):
    execution = {}
    execution['debug'] = args.debug
    execution['use_k8s'] = args.k8s
    if args.k8s and 'k8s' in config and 'defaultimage' in config['k8s']:
      execution['k8s_default_image'] = config['k8s']['defaultimage']
    execution['use_cluster'] = args.use_cluster
    execution['fetch_informations'] = args.fetch_informations 
    execution['fetch_dbxrefs'] = args.fetch_dbxrefs 
    execution['mode'] = 'live' if args.live else 'complete'
    execution['fasta'] = os.path.abspath(args.fasta)
    execution['output'] = os.path.abspath(args.output)
@@ -98,6 +108,8 @@ def generate_execution(config, args):
        else:
            execution['directory'] = tempfile.mkdtemp()
    execution['modules'] = generate_execution_modules_for_profile(config, args.profile)
    if args.k8s_config:
      execution['k8s_config'] = args.k8s_config
    return execution
    
def generate_execution_modules_for_profile(config, profile):
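
Taken together, the new options add kubernetes execution and rename the dbxref fetching flag; a hypothetical invocation (file names are placeholders):

# Hypothetical command line using the new flags (paths are placeholders).
cmd = ["psot", "analyze", "-f", "proteins.faa", "-o", "out/", "-p", "fast",
       "--fetch_dbxrefs", "--k8s", "--k8s_config", "k8s.config"]
print(" ".join(cmd))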
+79 −15
@@ -6,13 +6,14 @@ from copy import deepcopy
import shutil
import collections
import sys
from collections.abc import MutableMapping

# taken from https://stackoverflow.com/questions/6027558/flatten-nested-python-dictionaries-compressing-keys
def flatten(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
@@ -162,8 +163,8 @@ process ${id}_restore_ids {
}
''')

retrieve_informations_template = Template('''
process retrieve_informations_for_${id} {
retrieve_dbxrefs_template = Template('''
process retrieve_dbxrefs_for_${id} {

    input:
    file result from ${id}_json_info
@@ -177,8 +178,8 @@ process retrieve_informations_for_${id} {
    """
}
''')
retrieve_informations_live_template = Template('''
process retrieve_informations_for_${id} {
retrieve_dbxrefs_live_template = Template('''
process retrieve_dbxrefs_for_${id} {

    input:
    file result from ${id}_json_info
@@ -234,6 +235,12 @@ analysis_config_template = Template('''
    }
    '''
    )
k8s_analysis_config_template = Template('''
    withName:${id}{
        ${container}
    }
    '''
    )
beforeScript_modul_config_template = Template('''
    withName:${process_name}{
        ${beforeScript}
@@ -242,7 +249,7 @@ beforeScript_modul_config_template = Template('''
    )

beforeScript_norm_config_template = Template('''
    withName:normalizing_fasta{
    withName:normalize_fasta{
        ${beforeScript}
    }
    '''
@@ -321,19 +328,19 @@ def generate_nextflow_script(execution):
        config['cmdline'] = cmdline.stdout.decode('utf-8')

        fragments.append(analysis_template.substitute(config))
        if execution['mode'] == 'live' and not execution['fetch_informations']:
        if execution['mode'] == 'live' and not execution['fetch_dbxrefs']:
            fragments.append(convert_live_template.substitute(config))
            fragments.append(restore_headers_json_live_template.substitute(config))
            fragments.append(live_results_template.substitute(config))
        elif execution['mode'] == 'live' and execution['fetch_informations']:
        elif execution['mode'] == 'live' and execution['fetch_dbxrefs']:
            fragments.append(convert_info_template.substitute(config))
            fragments.append(restore_headers_json_info_template.substitute(config))
            fragments.append(retrieve_informations_live_template.substitute(config))
            fragments.append(retrieve_dbxrefs_live_template.substitute(config))
            fragments.append(live_results_template.substitute(config))
        elif execution['mode'] == 'complete' and execution['fetch_informations']:
        elif execution['mode'] == 'complete' and execution['fetch_dbxrefs']:
          fragments.append(convert_info_template.substitute(config))
          fragments.append(restore_headers_json_info_template.substitute(config))
          fragments.append(retrieve_informations_template.substitute(config))
          fragments.append(retrieve_dbxrefs_template.substitute(config))
        else:
            fragments.append(convert_template.substitute(config))
            fragments.append(restore_headers_json_template.substitute(config))
@@ -350,6 +357,9 @@ def generate_nextflow_script(execution):
    return nextflow_script

def generate_nextflow_config(execution):
    if execution['use_k8s']:
      return generate_k8s_config(execution)

    modules = execution['modules']
    database_path = execution['database_path']
    mount_point = execution['mount_point_for_containers']
@@ -369,14 +379,21 @@ def generate_nextflow_config(execution):
            runOptions = '--bind {dpath}:{mpoint}'
        }}
        '''.format(dpath=database_path, mpoint=mount_point))
    elif execution['use_k8s']:
        fragments.append('''process.executor = 'k8s'
        ''')
        
        
    fragments.append('''process { ''')
    if execution['use_k8s']:
        fragments.append('''  executor = 'k8s'
        ''')
            
    for m in modules:
        config = {}
        config['id'] = m['id']
        
        if execution['docker'] and m['analysis']['container']['docker']:
        if m['analysis']['container']['docker']:
            config['container'] = "container = " + "'" + m['analysis']['container']['docker'] + "'"
            # mount the referenced database at the same location as on the host system
            if 'parameters' in m['analysis'] and 'database' in m['analysis']['parameters'] and m['analysis']['parameters']['database']:
@@ -392,6 +409,9 @@ def generate_nextflow_config(execution):
        if execution['use_cluster']:
            config['executor'] = 'sge'
            config['clusterOptions'] = "clusterOptions = '-S /bin/bash'"
        if execution['use_k8s']:
            config['executor'] = 'k8s'
            config['clusterOptions'] = ""
        else:
            config['executor'] = 'local'
            config['clusterOptions'] = ''
@@ -399,10 +419,10 @@ def generate_nextflow_config(execution):
        if 'venv' in execution:
            config['beforeScript'] = "beforeScript = 'export PS1=; source " + execution['venv'] + "/bin/activate'"
            
            if execution['fetch_informations']:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_headers_json|retrieve_informations_for_${id}').substitute(config).split('|')
            if execution['fetch_dbxrefs']:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_ids|retrieve_dbxrefs_for_${id}').substitute(config).split('|')
            else:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_headers_json').substitute(config).split('|')
                process_names_list = Template('convert_${id}_to_json|${id}_restore_ids').substitute(config).split('|')
                
            fragments.append(analysis_config_template.substitute(config))
            for process in process_names_list:
@@ -420,4 +440,48 @@ def generate_nextflow_config(execution):
    nextflow_config = '\n'.join(fragments)
    return nextflow_config
            
def generate_k8s_config(execution) :
    image = ''
    if "k8s_default_image" in execution:
      image = execution["k8s_default_image"]
    modules = [
      { 'id' : 'normalize_fasta' },
      { 'id' : 'join_documents' },
      { 'id' : 'split_documents' },
    ]

    for m in execution['modules']:
      config = {}
      id = m['id']
      config['id'] = id
      if m['analysis']['container']['docker']:
        config['container'] = "container = " + "'" + m['analysis']['container']['docker'] + "'"
      modules.append(config)
      modules.append({'id': 'convert_' + id + '_to_json'})
      modules.append({'id': id + '_restore_ids'})
      modules.append({'id': 'retrieve_dbxrefs_for_' + id})
      if execution['mode'] == 'live':
        modules.append({'id': 'generate_'+ id + '_live_results'})
    
    for m in modules:
      if not 'container' in m:
        m['container'] = "container = " + '"' + image + '"'

    fragments = []
    if 'k8s_config' in execution:
      with open(execution['k8s_config']) as f:
        for l in f:
          fragments.append(l.rstrip())
    fragments.append('''process { ''')
    if execution['use_k8s']:
        fragments.append('''  executor = 'k8s'
        ''')
            
    for m in modules:
        fragments.append(k8s_analysis_config_template.substitute(m))
    
    fragments.append('''}''')
    
    nextflow_config = '\n'.join(fragments)
    return nextflow_config
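
The emitted configuration is a plain Nextflow config; the text produced by generate_k8s_config should look roughly like this (image and module names are placeholders, and the exact whitespace follows the templates above):

# Rough shape of the generated k8s Nextflow config (placeholders only).
expected = """
process {
  executor = 'k8s'

    withName:normalize_fasta{
        container = "default/image:tag"
    }
    withName:hmmer_pfam_a{
        container = 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
    }
}
"""
print(expected)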
+0 −3
from setuptools import setup
# this is only necessary when not using setuptools/distribute
from sphinx.setup_command import BuildDoc
cmdclass = {'build_sphinx': BuildDoc}

setup(
  setup_requires=['pbr'],