Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

34 commits on source · 34 files changed: +1294 −62
Files

.gitlab-ci.yml

0 → 100644
+96 −0
stages:
    - artifacts

variables:
  CI_REGISTRY_IMAGE: harbor.computational.bio.uni-giessen.de/psos/psot
  VERSIONLABELMETHOD: "OnlyIfThisCommitHasVersion" # options: "OnlyIfThisCommitHasVersion","LastVersionTagInGit"
  ADDITIONALTAGLIST: latest
  IMAGE_LABELS: >
    --label org.opencontainers.image.vendor=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label org.opencontainers.image.authors=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label org.opencontainers.image.revision=$CI_COMMIT_SHA
    --label org.opencontainers.image.source=$CI_PROJECT_URL
    --label org.opencontainers.image.documentation=$CI_PROJECT_URL
    --label org.opencontainers.image.licenses=$CI_PROJECT_URL
    --label org.opencontainers.image.url=$CI_PROJECT_URL
    --label vcs-url=$CI_PROJECT_URL
    --label com.gitlab.ci.user=$CI_SERVER_URL/$GITLAB_USER_LOGIN
    --label com.gitlab.ci.email=$GITLAB_USER_EMAIL
    --label com.gitlab.ci.tagorbranch=$CI_COMMIT_REF_NAME
    --label com.gitlab.ci.pipelineurl=$CI_PIPELINE_URL
    --label com.gitlab.ci.commiturl=$CI_PROJECT_URL/commit/$CI_COMMIT_SHA
    --label com.gitlab.ci.cijoburl=$CI_JOB_URL
    --label com.gitlab.ci.mrurl=$CI_PROJECT_URL/-/merge_requests/$CI_MERGE_REQUEST_ID

build-and-deploy-docker-image:
  stage: artifacts
  extends: .build-docker-image

get-latest-git-version:
  stage: .pre
  image:
    name: alpine/git
    entrypoint: [""]
  rules:
    - if: '$VERSIONLABELMETHOD == "LastVersionTagInGit"'
  script:
    - |
      echo "the Google kaniko container does not have git and does not have a package manager to install it"
      git clone https://github.com/GoogleContainerTools/kaniko.git
      cd kaniko
      echo "$(git describe --abbrev=0 --tags)" > ../VERSIONTAG.txt
      echo "VERSIONTAG.txt contains $(cat ../VERSIONTAG.txt)"
  artifacts:
    paths:
      - VERSIONTAG.txt

# taken from https://gitlab.com/guided-explorations/containers/kaniko-docker-build/-/blob/master/.gitlab-ci.yml
.build-docker-image:
  image:
    name: gcr.io/kaniko-project/executor:debug
    entrypoint: [""]
  script:
    - |
      echo "Building and shipping image to $CI_REGISTRY_IMAGE"
      #Build date for opencontainers
      BUILDDATE="'$(date '+%FT%T%z' | sed -E -n 's/(\+[0-9]{2})([0-9]{2})$/\1:\2/p')'" #rfc 3339 date
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.created=$BUILDDATE --label build-date=$BUILDDATE"
      #Description for opencontainers
      BUILDTITLE=$(echo $CI_PROJECT_TITLE | tr " " "_")
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.title=$BUILDTITLE --label org.opencontainers.image.description=$BUILDTITLE"
      #Add ref.name for opencontainers
      IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.ref.name=$CI_REGISTRY_IMAGE:${CI_COMMIT_REF_NAME//\//_}"

      #Build Version Label and Tag from git tag, LastVersionTagInGit was placed by a previous job artifact
      if [[ "$VERSIONLABELMETHOD" == "LastVersionTagInGit" ]]; then VERSIONLABEL=$(cat VERSIONTAG.txt); fi
      if [[ "$VERSIONLABELMETHOD" == "OnlyIfThisCommitHasVersion" ]]; then VERSIONLABEL=$CI_COMMIT_TAG; fi
      if [[ ! -z "$VERSIONLABEL" ]]; then
        IMAGE_LABELS="$IMAGE_LABELS --label org.opencontainers.image.version=$VERSIONLABEL"
        ADDITIONALTAGLIST="$ADDITIONALTAGLIST $VERSIONLABEL"
      fi

      ADDITIONALTAGLIST="$ADDITIONALTAGLIST $CI_COMMIT_REF_NAME $CI_COMMIT_SHORT_SHA"
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then ADDITIONALTAGLIST="$ADDITIONALTAGLIST latest"; fi
      if [[ -n "$ADDITIONALTAGLIST" ]]; then
        for TAG in $ADDITIONALTAGLIST; do
          TAG=${TAG//\//_} # replace slashes with underscore in tag
          FORMATTEDTAGLIST="${FORMATTEDTAGLIST} --tag $CI_REGISTRY_IMAGE:$TAG ";
        done;
      fi
      if [[ -n "$ADDITIONALIMAGENAMES" ]]; then
        for TAG in $ADDITIONALIMAGENAMES; do
          FORMATTEDTAGLIST="${FORMATTEDTAGLIST} --tag $TAG ";
        done;
      fi

      #Reformat Docker tags to kaniko's --destination argument:
      FORMATTEDTAGLIST=$(echo "${FORMATTEDTAGLIST}" | sed s/\-\-tag/\-\-destination/g)

      mkdir -p /kaniko/.docker
      PROJECTDIR=$CI_PROJECT_DIR
      if [[ ! -z "$SUB_DIRECTORY" ]]; then
        PROJECTDIR=$CI_PROJECT_DIR/$SUB_DIRECTORY
      fi
      echo "{\"auths\":{\"$CI_REGISTRY\":{\"auth\":\"$(echo -n $CI_REGISTRY_USER:$CI_REGISTRY_PASSWORD | base64 | tr -d '\n')\"}}}" > /kaniko/.docker/config.json
      /kaniko/executor --context $PROJECTDIR --dockerfile $PROJECTDIR/Dockerfile $FORMATTEDTAGLIST $IMAGE_LABELS
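
For illustration, the tag handling above boils down to a simple string transformation. Here is a standalone Python sketch (the registry path and tag list are placeholders, not the project's real values):

# Standalone sketch of the shell tag loop and the sed substitution above.
registry_image = "registry.example.org/psos/psot"  # placeholder
tags = ["latest", "0.4.1", "feature/dbxrefs-list", "abc1234"]

flags = " ".join(
    # ${TAG//\//_}: slashes in branch names become underscores
    "--destination {}:{}".format(registry_image, tag.replace("/", "_"))
    for tag in tags
)
print(flags)
# --destination registry.example.org/psos/psot:latest --destination ...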
+2 −0
Lukas Jelonek <lukas.jelonek@computational.bio.uni-giessen.de>
Marc Weingärtner <marc.weingaertner@bioinfsys.uni-giessen.de>
heiko-mueller <Heiko.Mueller@bioinfsys.uni-giessen.de>
lmueller <lion.mueller@bio.uni-giessen.de>
mweingae <marc.weingaertner@bioinfsys.uni-giessen.de>

Dockerfile

0 → 100644
+21 −0
FROM ubuntu:20.04

ENV DEBIAN_FRONTEND=noninteractive \
    NXF_VERSION=21.10.6
RUN apt-get update && \
    apt-get install -y python3-pip python3-setuptools python3-sphinx docker.io openjdk-11-jdk-headless wget

RUN wget https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VERSION}/nextflow-${NXF_VERSION}-all && \
    chmod +x nextflow-${NXF_VERSION}-all && \
    mv nextflow-${NXF_VERSION}-all /usr/local/bin/ && \
    ln -s /usr/local/bin/nextflow-${NXF_VERSION}-all /usr/local/bin/nextflow

RUN pip3 install --upgrade git+https://git.computational.bio.uni-giessen.de/SOaAS/dbxref.git 

COPY ./ /opt/psot/
RUN pip3 install --upgrade /opt/psot/
ENV PSOT_REPOSITORIES=/opt/psot/default_repo/

# Use the image as follows:
# docker run -it -v /tmp:/tmp -v $PWD:$PWD -v /var/run/docker.sock:/var/run/docker.sock psot-image psot analyze -f $PWD/example/single_sequence.faa -o $PWD/result/ -p fast --docker
CMD psot info
# Module manifest for the hmmscan against ECFexpress groups analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_ecfgroups'

# Short description of the analysis
info: 'hmmscan analysis against ECFexpress groups'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'ecfexpress/ECFgroups.hmm'
        evalue: 1e-10
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
        singularity: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'ECF'
# Module manifest for the hmmscan against ECFexpress subgroups analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_ecfsubgroups'

# Short description of the analysis
info: 'hmmscan analysis against ECFexpress subgroups'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'ecfexpress/ECFsubgroups.hmm'
        evalue: 1e-10
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
        singularity: 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'ECF'
# Module manifest for the hmmscan against sORFdb small protein family analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles
name: 'hmmer_sorfdb'

# Short description of the analysis
info: 'hmmscan analysis against sORFdb small protein families'

# The name of the script for the analysis step. Must take a --fasta parameter
analysis:
    script: 'run_hmmer.py'
    parameters:
        database: 'sorfdb/sorfdb.1.0.hmm'
        ga: 'True'
    execution:
        cluster:
            chunksize: 200
    container:
        docker: 'proteogenomicsworkflow/hmmer:3.4'
        singularity: 'proteogenomicsworkflow/hmmer:3.4'

# The name of the result to json converter script. Must take one parameter, the
# result file from the analysis_script
converter:
    script: 'convert_hmmer.py'
    parameters:
        dbxref: 'sORFdb'
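
The module manifests above all follow the same schema (name, info, analysis, converter). As a hedged sketch, reading one with PyYAML could look like this (the filename is hypothetical and PSOT's own loader may differ):

# Illustrative only: load a module manifest like the ones above.
import yaml  # assumes PyYAML is available

with open("hmmer_sorfdb.yml") as f:  # hypothetical filename
    manifest = yaml.safe_load(f)

print(manifest["name"], "-", manifest["info"])
print("analysis script:", manifest["analysis"]["script"])
print("docker image:", manifest["analysis"]["container"]["docker"])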
+26 −0
# Module manifest for the Pepstats analysis

# The name of the module. Is needed for the list-analyses option, for custom
# configurations and custom profiles.
name: 'pepstats'

# Short description of the analysis.
info: 'Calculates statistics of protein properties'

# The configuration of the script for the analysis step.
analysis:
    # script must take a --fasta parameter
    script: 'run_pepstats.py'
    # specify additional default configuration here
    parameters:
    # run script in a container
    container:
      docker: 'biocontainers/emboss:v6.6.0dfsg-7b1-deb_cv1'
      singularity: 'biocontainers/emboss:v6.6.0dfsg-7b1-deb_cv1'

# The configuration of the script for the json conversion step.
converter:
    # script must take a --result parameter, which is the result from the analysis step
    script: 'convert_pepstats.py'
    # specify additional default configuration here
    parameters:
+11 −0
name: 'bacteria-ecf'
info: 'Profile for ecfexpress calculation'
modules:
  include_sequence:
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
    evalue: 1e-5
  hmmer_ecfgroups:
  hmmer_ecfsubgroups:
  pepstats:
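
Note that a parameter listed under a module in a profile, such as the evalue: 1e-5 under hmmer_pfam_a above, overrides that module's manifest default. Conceptually the merge works like this sketch (both dictionaries and the database path are invented; this is not PSOT's actual merge code):

# Illustrative only: a profile parameter shadowing a manifest default.
manifest_params = {"database": "pfam/Pfam-A.hmm", "evalue": "1e-10"}  # invented
profile_params = {"evalue": "1e-5"}

merged = {**manifest_params, **profile_params}
print(merged)  # {'database': 'pfam/Pfam-A.hmm', 'evalue': '1e-5'}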
name: 'bacteria-gram+'
info: 'Profile for gram-positive bacteria'
modules:
  include_sequence:
  signalp:
    organism: gram+
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
name: 'bacteria-gram-'
info: 'Profile for gram-negative bacteria'
modules:
  include_sequence:
  signalp:
    organism: gram-
  tmhmm:
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
name: 'bacteria-sorfdb'
info: 'Profile for sORFdb HMM family search'
modules:
  include_sequence:
  hmmer_sorfdb:
@@ -5,3 +5,5 @@ modules:
  signalp:
  blastp_swissprot:
  hmmer_pfam_a:
  tmhmm:
  pepstats:
name: 'complete'
info: 'Profile that uses all available tools'
modules:
  include_sequence:
  signalp:
  ghostx_swissprot:
  hmmer_pfam_a:
  targetp:
    organism_group: 'non-plant'
  tmhmm:
  pepstats:
@@ -8,3 +8,4 @@ modules:
  hmmer_pfam_a:
  targetp:
    organism_group: 'non-plant'
  tmhmm:
name: 'eukaryote-plant'
info: 'Profile for plants'
modules:
  include_sequence:
  signalp:
    organism: euk
  tmhmm:
  targetp:
    organism_group: plant
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
+12 −0
name: 'eukaryote'
info: 'Profile for eukaryotes'
modules:
  include_sequence:
  signalp:
    organism: euk
  tmhmm:
  targetp:
    organism_group: non-plant
  ghostx_swissprot:
  hmmer_pfam_a:
  pepstats:
name: 'fast'
info: 'Profile that contains tools that give a fast result'
modules:
  include_sequence:
  ghostx_swissprot:
  signalp:
    organism: 'euk'
  tmhmm:
  pepstats:
@@ -52,11 +52,11 @@ with open(filename) as f:
            if '% identity' in header:
                result['target']["percent_identity"] = float(split[header['% identity']])
            if 'q. start' in header and 'q. end' in header:
                result['query']['start'] = split[header['q. start']]
                result['query']['end'] = split[header['q. end']]
                result['query']['start'] = int(split[header['q. start']])
                result['query']['end'] = int(split[header['q. end']])
            if 's. start' in header and 's. end' in header:
                result['target']['start'] = split[header['s. start']]
                result['target']['end'] = split[header['s. end']]
                result['target']['start'] = int(split[header['s. start']])
                result['target']['end'] = int(split[header['s. end']])
            if 'evalue' in header:
                result['target']['evalue'] = float(split[header['evalue']])
            if 'BTOP' in header:
@@ -43,15 +43,16 @@ with open(result_filename) as f:
        elif args.acc_split:
            accession = accession.split(args.acc_split)[0]
        result['query'] = {
                'start': split[6],
                'end': split[7]
                'start': int(split[6]),
                'end': int(split[7])
                }
        result['target'] = {
                'dbxref': args.dbxref + ':' + accession,
                'start': split[8],
                'end': split[9],
                'start': int(split[8]),
                'end': int(split[9]),
                'evalue': float(split[10]),
                'percent_identity': float(split[2])
                'percent_identity': float(split[2]),
                'score': float(split[11])
                }

        results.append(result)
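
For orientation, the 0-based indices used above line up with the standard 12-column tabular layout (BLAST outfmt 6, which the GHOSTX tabular output follows as well). A small sketch with an invented row:

# Sketch: standard tabular columns behind the indices above (row invented).
COLUMNS = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
           "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

row = "q1\tsp|P12345|NAME\t98.5\t120\t2\t0\t1\t120\t5\t124\t1e-50\t250.0"
print(dict(zip(COLUMNS, row.split("\t"))))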
@@ -48,21 +48,29 @@ with open(filename) as f:
                    print('HMMER converter: Query ID "' + query_id + '" not found among initial queries!')
            results = documents[query_id]['computations'][0]["results"]

            dbxref = None
            if split[1] == "-":
                dbxref = args.dbxref + ":" + split[0]
            else:
                dbxref = args.dbxref + ":" + split[1]
            results.append({
                'target': {
                    'name': split[0],
                    'dbxref': args.dbxref + ':' + split[1],
                    'length': split[2],
                    'score': float(split[13]),
                    'start': int(split[15]),
                    'end': int(split[16]),
                    'description': split[22].rstrip()
                    'name': split[0], # target name
                    'dbxref': dbxref, 
                    'length': split[2], # tlen
                    'score': float(split[13]), # this domain score
                    'bias': float(split[14]), # this domain bias
                    'evalue': float(split[12]), # this domain i-Evalue
                    'start': int(split[15]), # hmm coord from
                    'end': int(split[16]), # hmm coord to
                    'acc': float(split[21]), # acc
                    'description': split[22].rstrip() # description of target
                    },
                'query': {
                    'start': int(split[17]),
                    'end': int(split[18]),
                    'envelop_start': int(split[19]),
                    'envelop_end': int(split[20])
                    'start': int(split[17]), # ali coord from
                    'end': int(split[18]), # ali coord to
                    'envelop_start': int(split[19]), # env coord from
                    'envelop_end': int(split[20]) # env coord to
                    }
                })
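
The column comments added above follow hmmscan's --domtblout layout. As a reference sketch (field names are from the HMMER user guide; the helper itself is hypothetical, not project code):

# Hypothetical helper: --domtblout fields behind the indices used above.
FIELDS = {
    0: "target name",
    1: "target accession ('-' if absent)",
    2: "tlen",
    12: "i-Evalue (this domain)",
    13: "score (this domain)",
    14: "bias (this domain)",
    15: "hmm from", 16: "hmm to",
    17: "ali from", 18: "ali to",
    19: "env from", 20: "env to",
    21: "acc",
    22: "description of target",
}

def describe(line):
    split = line.split(None, 22)  # maxsplit keeps the description intact
    return {name: split[i] for i, name in FIELDS.items() if i < len(split)}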

+111 −0
#!/usr/bin/env python3

import json
import argparse

parser = argparse.ArgumentParser(description='Convert pepstats results to json documents')
parser.add_argument('--result', '-r', required=True, help='The pepstats result file')
parser.add_argument('--output', '-o', required=True, help='The converted results json file')
args = parser.parse_args()

filename = args.result
documents = {}
residue = False
propertyv = False
with open(filename) as r:
    # iterate over the results
    for line in r:
        if line.startswith('PEPSTATS'):
            line = line.strip().split()
            seq_id = line[2]
            if seq_id not in documents:
                documents[seq_id] = {
                    "id": seq_id,
                    "computations": []
                }
                computation = {
                    'tool': {'name': 'Pepstats', 'version': 'EMBOSS:6.6.0.0'},
                    'results': []
                }
                result = {}
        # store results for the whole sequence
        elif line.startswith('Molecular'):
            line = line.strip().split()
            molecularw = line[3]
            residues = line[6]
            result['Molecular weight'] = float(molecularw)
            result['Residues'] = float(residues)
        elif line.startswith('Average'):
            line = line.strip().split()
            averagerw = line[4]
            charge = line[7]
            result['Average Residue Weight'] = float(averagerw)
            result['Charge'] = float(charge)
        elif line.startswith('Isoelectric'):
            line = line.strip().split()
            iso = line[3]
            result['Isoelectric point'] = float(iso)
        elif line.startswith('Improbability'):
            line = line.strip().split()
            probabilityib = 1 - float(line[7])
            result['Probability of expression in inclusion bodies'] = probabilityib
        # store per-residue (amino acid) results
        elif residue == False and line.startswith('Residue'):
            residue = True
        elif residue == True:
            line = line.strip().split()
            if line == []:
                residue = False
            else:
                residueS = line[2]
                number = line[3]
                mole = line[4]
                dayhoff = line[5]
                if 'Amino acids' not in result:
                    result['Amino acids'] = []
                amino = {}
                amino['Residue'] = residueS
                amino['Number'] = int(number)
                amino['Mole%'] = float(mole)
                amino['DayhoffStat'] = float(dayhoff)
                result['Amino acids'].append(amino)
        # store results for the amino-acid property classes
        elif propertyv == False and line.startswith('Property'):
            propertyv = True
        elif propertyv == True:
            line = line.strip().split()
            if line == []:
                propertyv = False
                computation['results'].append(result)
                documents[seq_id]['computations'].append(computation)
            else:
                propertyd = line[0]
                residuesd = line[1]
                numberd = line[2]
                moled = line[3]
                if 'Physico-chemical class' not in result:
                    result['Physico-chemical class'] = []
                pcc = {}
                pcc['Property'] = propertyd
                pcc['Residues'] = residuesd
                pcc['Number'] = int(numberd)
                pcc['Mole%'] = float(moled)
                result['Physico-chemical class'].append(pcc)

with open(args.output, 'w') as o:
    json.dump(documents, o)
 No newline at end of file
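
For reference, the structure this converter writes looks roughly like the following (all values invented for illustration):

# Rough shape of the emitted JSON (values invented).
example = {
    "seq1": {
        "id": "seq1",
        "computations": [{
            "tool": {"name": "Pepstats", "version": "EMBOSS:6.6.0.0"},
            "results": [{
                "Molecular weight": 4438.85,
                "Residues": 39.0,
                "Average Residue Weight": 113.8,
                "Charge": 2.5,
                "Isoelectric point": 9.9,
                "Probability of expression in inclusion bodies": 0.256,
                "Amino acids": [{"Residue": "Ala", "Number": 2,
                                 "Mole%": 5.128, "DayhoffStat": 0.597}],
                "Physico-chemical class": [{"Property": "Tiny",
                                            "Residues": "(A+C+G+S+T)",
                                            "Number": 8, "Mole%": 20.513}],
            }],
        }],
    },
}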
@@ -32,9 +32,9 @@ with open(filename) as f:
                for field in field_index:
                    if field_index[field] is not None:
                        if field == "Loc":
                            results[field.lower()] = loc_dict[split[field_index[field]]]
                            results['Localization'] = loc_dict[split[field_index[field]]]
                        else:
                            results[field.lower()] = float(split[field_index[field]])
                            results[field] = float(split[field_index[field]])
        else:
            if line.startswith('Name   '):
                split = line.split()
@@ -8,6 +8,7 @@ ghostx_tool = 'ghostx'
parser = argparse.ArgumentParser(description='Identify homologues in the swissprot database')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--max_hits', '-m', type=int, default=20, help='Maximal number of reported hits')
parser.add_argument('--output', '-o', required=True, help='The result directory. Will contain info.json, results.tsv and enum_headers.tsv.')
args = parser.parse_args()

@@ -22,4 +23,4 @@ print(" VERSION=\$(ghostx 2>&1 | grep -Eo '[[:digit:]]\\\\.[[:digit:]]\\\\.[[
toolconfig = '{\\\\"name\\\\": \\\\"ghostx\\\\", \\\\"database\\\\":' \
            + '\\\\"' + args.database + '\\\\"' + ', \\\\"version\\\\":\\\\"\$VERSION\\\\" }'
print('    echo ' + toolconfig + ' > ' + args.output + '/info.json')
print('    ' + ghostx_tool + " aln -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
print('    ' + ghostx_tool + " aln -v 5 -b " + str(args.max_hits) + " -d " + args.database + " -o " + args.output + "/results.tsv -i " + args.fasta)
@@ -11,12 +11,15 @@ parser.add_argument('--fasta', '-f', required=True, help='A fasta file with amin
parser.add_argument('--database', '-d', required=True, help='Database to search in')
parser.add_argument('--output', '-o', required=True, help='The result directory')
parser.add_argument('--evalue', '-e', default='0.0001', help='Evalue cutoff')
parser.add_argument('--ga', '-g', default=False, type=bool, 
                    help="Use profile's GA gathering cutoffs to set all thresholding")
args = parser.parse_args()

print('mkdir -p ' + args.output)

cutoff = " --cut_ga " if args.ga else f" -E {args.evalue} "
print(hmmscan_tool + 
        " -E " + args.evalue + 
        cutoff +
        " -o " + args.output + "/hmmscan.out " + 
        " --tblout " + args.output + "/tblout.tsv "  + 
        " --domtblout " + args.output + "/domtblout.tsv " + 
+13 −0
#!/usr/bin/env python3
import argparse

pepstats_tool = 'pepstats'

parser = argparse.ArgumentParser(description='Calculates statistics of protein properties')
parser.add_argument('--fasta', '-f', required=True, help='A fasta file')
parser.add_argument('--output', '-o', required=True, help='An output file')
args = parser.parse_args()

print(pepstats_tool, args.fasta, args.output)
#!/bin/bash
set -uexo pipefail

SCRIPT=$(../scripts/run_targetp.py --fasta data/fasta/proteins.fas --output data/targetp/ --organism_group non-plant)
eval "$SCRIPT"

../scripts/convert_targetp.py --result data/targetp/ --output data/targetp/converted.json

example/ecf.fas

0 → 100644
+4 −0
>tr|Q189L9|Q189L9_PEPD6 Extracytoplasmic function (ECF) sigma factor csfT OS=Peptoclostridium difficile (strain 630) OX=272563 GN=csfT PE=3 SV=1
MDKTTFTNNILESEQTLYRVSKSILGNDQDCEDAVNNAILKAYEKLDSLKEEQYFKTWLI
RIVINECNSLRRKRLKSLSFEDVFKNKKIDEKDDYSDLYTAIQSLPKKIKIPIVLYYIEG
YSVDEVKEILDIPQGTVKSRLSRGRRLLKTKLENTEVII
+10 −10
@@ -13,25 +13,25 @@ def dbxrefs_from_document(document):
    return set(refs)
  return set([])

def combine_with_documents(documents, informations):
def combine_with_documents(documents, data):
  for k in documents:
    combine_with_document2(documents[k], informations)
    combine_with_document2(documents[k], data)

def combine_with_document(document, informations):
def combine_with_document(document, data):
  '''adds an information tag for each result that references a dbxref'''
  if 'computations' in document:
    for c in document['computations']:
      for r in c['results']:
        if 'target' in r and 'dbxref' in r['target']:
          r['informations'] = next(entry for entry in informations if entry['id'] == r['target']['dbxref'])
          r['dbxrefs'] = next(entry for entry in data if entry['id'] == r['target']['dbxref'])

def combine_with_document2(document, informations):
def combine_with_document2(document, data):
    '''adds an information field to the document that contains all referenced
    dbxrefs present in informations within the document'''
    if not 'informations' in document:
        document['informations'] = {}
    dbxrefs present in data within the document'''
    if not 'dbxrefs' in document:
        document['dbxrefs'] = []
    dbxrefs = dbxrefs_from_document(document)
    for info in informations:
    for info in data:
        if info['id'] in dbxrefs:
            document['informations'][info['id']] = info
            document['dbxrefs'].append(info)
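
After this change each referenced entry lands in a dbxrefs list on the document instead of an informations map keyed by id; schematically (values invented):

# Schematic document shape after combine_with_document2 (values invented).
document = {
    "id": "seq1",
    "computations": [
        {"results": [{"target": {"dbxref": "SP:P12345"}}]},
    ],
    # before: document["informations"] = {"SP:P12345": {...}}
    "dbxrefs": [{"id": "SP:P12345", "definition": "example entry"}],
}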
        
@@ -10,6 +10,7 @@ parser.add_argument('--tsv', '-t', required=True, help='The mapping file in tsv
parser.add_argument('--output', '-o', required=True, help='The result json document file')
args = parser.parse_args()

id_to_description = {}
seq_id_dict = {}
docs_enumerated = {}

@@ -20,8 +21,12 @@ with open(args.json) as j:
# read mapping
with open(args.tsv) as h:
    for line in h:
        num, id = line.strip().split('\t')
        split = line.strip().split('\t')
        num, id = (split[0], split[1])
        seq_id_dict[num] = id
        if len(split) == 3:
            description = split[2]
            id_to_description[id] = description

documents_restored = {}

@@ -29,6 +34,8 @@ for num in docs_enumerated:
    seq_id = seq_id_dict[num]
    doc = docs_enumerated[num]
    doc["id"] = seq_id
    if seq_id in id_to_description:
        doc["description"] = id_to_description[seq_id]
    documents_restored[seq_id] = doc

with open(args.output, 'w') as o:
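
The mapping TSV may now carry an optional third column holding the original header description; a minimal standalone sketch of the parsing added above (the sample line is invented):

# Minimal sketch of the optional description column (sample line invented).
line = "1\tseq1\thypothetical protein\n"
split = line.strip().split("\t")
num, seq_id = split[0], split[1]
description = split[2] if len(split) == 3 else None
print(num, seq_id, description)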
+21 −9
@@ -14,26 +14,30 @@ from psot.nextflow import setup_execution_directory, execute_analysis
__version__ = VersionInfo('psot').semantic_version().release_string()

def main():
    parser = argparse.ArgumentParser(description='PSOT - Proteins sequence observation tool\nVersion ' + __version__ +'\nGather informations about proteins.', formatter_class=argparse.RawTextHelpFormatter)
    parser = argparse.ArgumentParser(description='PSOT - Proteins sequence observation tool\nVersion ' + __version__ +'\nGather data about proteins.', formatter_class=argparse.RawTextHelpFormatter)
    parser.set_defaults(func=help)

    subparsers = parser.add_subparsers()
    info_parser = subparsers.add_parser('info')
    info_parser.add_argument('--listanalyses', '-l', action='store_true', help='Show available analysis steps')
    info_parser.add_argument('--list_analyses', '-l', action='store_true', help='Show available analysis steps')
    info_parser.set_defaults(func=info)

    analyze_parser = subparsers.add_parser('analyze')
    executions = analyze_parser.add_mutually_exclusive_group()

    analyze_parser.add_argument('--fasta', '-f', required=True, help='A fasta file with aminoacid sequences')
    analyze_parser.add_argument('--output', '-o', required=True, help='The output directory for the json documents')
    analyze_parser.add_argument('--profile', '-p', default='fast', help='The profile to use')
    analyze_parser.add_argument('--live', '-l', action='store_true', help='Report results as they are computed, not only at the end of the computation. The live results will be available in $output/live.')
    analyze_parser.add_argument('--config', '-c', help='The config to use')
    analyze_parser.add_argument('--fetch_informations', '-i', action='store_true', help='Fetch informations')
    analyze_parser.add_argument('--fetch_dbxrefs', '-F', action='store_true', help='Fetch associated dbxrefs')
    analyze_parser.add_argument('--debug', '-d', action='store_true', help='Debug mode, computation directory will not be removed after computation')
    analyze_parser.add_argument('--execution_dir', '-e', help='Use the specified execution directory and do not delete it after the computation')
    analyze_parser.add_argument('--use_cluster', '-C', action='store_true', help='Use compute cluster for execution')
    executions.add_argument('--k8s', '-k', action='store_true', help='Use kubernetes for execution')
    executions.add_argument('--use_cluster', '-C', action='store_true', help='Use compute cluster for execution')
    analyze_parser.add_argument('--k8s_config', help='The k8s nextflow config to use')
    analyze_parser.add_argument('--download_databases', '-b', action='store_true', help='Download databases if they are not present yet')
    analyze_parser.add_argument('--generate_only', '-g', action='store_true', help='Only generate workflow directory without executing it')

    images = analyze_parser.add_mutually_exclusive_group()
    images.add_argument('--docker', '-D', action='store_true', help='Use docker image for computation')
@@ -66,9 +70,12 @@ def analyze(args, config):
      download_databases(execution)

    setup_execution_directory(execution)
    if not args.generate_only: 
      error_code = execute_analysis(execution)
      cleanup(execution)
      exit(error_code)
    else:
      exit(0)

def cleanup(execution):
    if not execution['debug']:
@@ -77,8 +84,11 @@ def cleanup(execution):
def generate_execution(config, args):
    execution = {}
    execution['debug'] = args.debug
    execution['use_k8s'] = args.k8s
    if args.k8s and 'k8s' in config and 'defaultimage' in config['k8s']:
      execution['k8s_default_image'] = config['k8s']['defaultimage']
    execution['use_cluster'] = args.use_cluster
    execution['fetch_informations'] = args.fetch_informations 
    execution['fetch_dbxrefs'] = args.fetch_dbxrefs 
    execution['mode'] = 'live' if args.live else 'complete'
    execution['fasta'] = os.path.abspath(args.fasta)
    execution['output'] = os.path.abspath(args.output)
@@ -98,6 +108,8 @@ def generate_execution(config, args):
        else:
            execution['directory'] = tempfile.mkdtemp()
    execution['modules'] = generate_execution_modules_for_profile(config, args.profile)
    if args.k8s_config:
      execution['k8s_config'] = args.k8s_config
    return execution
    
def generate_execution_modules_for_profile(config, profile):
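
Taken together, the new options add kubernetes execution and rename the dbxref fetching flag; a hypothetical invocation (file names are placeholders):

# Hypothetical command line using the new flags (paths are placeholders).
cmd = ["psot", "analyze", "-f", "proteins.faa", "-o", "out/", "-p", "fast",
       "--fetch_dbxrefs", "--k8s", "--k8s_config", "k8s.config"]
print(" ".join(cmd))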
+79 −15
@@ -6,13 +6,14 @@ from copy import deepcopy
import shutil
import collections
import sys
from collections.abc import MutableMapping

# taken from https://stackoverflow.com/questions/6027558/flatten-nested-python-dictionaries-compressing-keys
def flatten(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
@@ -162,8 +163,8 @@ process ${id}_restore_ids {
}
''')

retrieve_informations_template = Template('''
process retrieve_informations_for_${id} {
retrieve_dbxrefs_template = Template('''
process retrieve_dbxrefs_for_${id} {

    input:
    file result from ${id}_json_info
@@ -177,8 +178,8 @@ process retrieve_informations_for_${id} {
    """
}
''')
retrieve_informations_live_template = Template('''
process retrieve_informations_for_${id} {
retrieve_dbxrefs_live_template = Template('''
process retrieve_dbxrefs_for_${id} {

    input:
    file result from ${id}_json_info
@@ -234,6 +235,12 @@ analysis_config_template = Template('''
    }
    '''
    )
k8s_analysis_config_template = Template('''
    withName:${id}{
        ${container}
    }
    '''
    )
beforeScript_modul_config_template = Template('''
    withName:${process_name}{
        ${beforeScript}
@@ -242,7 +249,7 @@ beforeScript_modul_config_template = Template('''
    )

beforeScript_norm_config_template = Template('''
    withName:normalizing_fasta{
    withName:normalize_fasta{
        ${beforeScript}
    }
    '''
@@ -321,19 +328,19 @@ def generate_nextflow_script(execution):
        config['cmdline'] = cmdline.stdout.decode('utf-8')

        fragments.append(analysis_template.substitute(config))
        if execution['mode'] == 'live' and not execution['fetch_informations']:
        if execution['mode'] == 'live' and not execution['fetch_dbxrefs']:
            fragments.append(convert_live_template.substitute(config))
            fragments.append(restore_headers_json_live_template.substitute(config))
            fragments.append(live_results_template.substitute(config))
        elif execution['mode'] == 'live' and execution['fetch_informations']:
        elif execution['mode'] == 'live' and execution['fetch_dbxrefs']:
            fragments.append(convert_info_template.substitute(config))
            fragments.append(restore_headers_json_info_template.substitute(config))
            fragments.append(retrieve_informations_live_template.substitute(config))
            fragments.append(retrieve_dbxrefs_live_template.substitute(config))
            fragments.append(live_results_template.substitute(config))
        elif execution['mode'] == 'complete' and execution['fetch_informations']:
        elif execution['mode'] == 'complete' and execution['fetch_dbxrefs']:
          fragments.append(convert_info_template.substitute(config))
          fragments.append(restore_headers_json_info_template.substitute(config))
          fragments.append(retrieve_informations_template.substitute(config))
          fragments.append(retrieve_dbxrefs_template.substitute(config))
        else:
            fragments.append(convert_template.substitute(config))
            fragments.append(restore_headers_json_template.substitute(config))
@@ -350,6 +357,9 @@ def generate_nextflow_script(execution):
    return nextflow_script

def generate_nextflow_config(execution):
    if execution['use_k8s']:
      return generate_k8s_config(execution)

    modules = execution['modules']
    database_path = execution['database_path']
    mount_point = execution['mount_point_for_containers']
@@ -369,14 +379,21 @@ def generate_nextflow_config(execution):
            runOptions = '--bind {dpath}:{mpoint}'
        }}
        '''.format(dpath=database_path, mpoint=mount_point))
    elif execution['use_k8s']:
        fragments.append('''process.executor = 'k8s'
        ''')
        
        
    fragments.append('''process { ''')
    if execution['use_k8s']:
        fragments.append('''  executor = 'k8s'
        ''')
            
    for m in modules:
        config = {}
        config['id'] = m['id']
        
        if execution['docker'] and m['analysis']['container']['docker']:
        if m['analysis']['container']['docker']:
            config['container'] = "container = " + "'" + m['analysis']['container']['docker'] + "'"
            # mount the referenced database at the same location as on the host system
            if 'parameters' in m['analysis'] and 'database' in m['analysis']['parameters'] and m['analysis']['parameters']['database']:
@@ -392,6 +409,9 @@ def generate_nextflow_config(execution):
        if execution['use_cluster']:
            config['executor'] = 'sge'
            config['clusterOptions'] = "clusterOptions = '-S /bin/bash'"
        if execution['use_k8s']:
            config['executor'] = 'k8s'
            config['clusterOptions'] = ""
        else:
            config['executor'] = 'local'
            config['clusterOptions'] = ''
@@ -399,10 +419,10 @@ def generate_nextflow_config(execution):
        if 'venv' in execution:
            config['beforeScript'] = "beforeScript = 'export PS1=; source " + execution['venv'] + "/bin/activate'"
            
            if execution['fetch_informations']:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_headers_json|retrieve_informations_for_${id}').substitute(config).split('|')
            if execution['fetch_dbxrefs']:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_ids|retrieve_dbxrefs_for_${id}').substitute(config).split('|')
            else:
                process_names_list = Template('convert_${id}_to_json|${id}_restore_headers_json').substitute(config).split('|')
                process_names_list = Template('convert_${id}_to_json|${id}_restore_ids').substitute(config).split('|')
                
            fragments.append(analysis_config_template.substitute(config))
            for process in process_names_list:
@@ -420,4 +440,48 @@ def generate_nextflow_config(execution):
    nextflow_config = '\n'.join(fragments)
    return nextflow_config
            
def generate_k8s_config(execution) :
    image = ''
    if "k8s_default_image" in execution:
      image = execution["k8s_default_image"]
    modules = [
      { 'id' : 'normalize_fasta' },
      { 'id' : 'join_documents' },
      { 'id' : 'split_documents' },
    ]

    for m in execution['modules']:
      config = {}
      id = m['id']
      config['id'] = id
      if m['analysis']['container']['docker']:
        config['container'] = "container = " + "'" + m['analysis']['container']['docker'] + "'"
      modules.append(config)
      modules.append({'id': 'convert_' + id + '_to_json'})
      modules.append({'id': id + '_restore_ids'})
      modules.append({'id': 'retrieve_dbxrefs_for_' + id})
      if execution['mode'] == 'live':
        modules.append({'id': 'generate_'+ id + '_live_results'})
    
    for m in modules:
      if not 'container' in m:
        m['container'] = "container = " + '"' + image + '"'

    fragments = []
    if 'k8s_config' in execution:
      with open(execution['k8s_config']) as f:
        for l in f:
          fragments.append(l.rstrip())
    fragments.append('''process { ''')
    if execution['use_k8s']:
        fragments.append('''  executor = 'k8s'
        ''')
            
    for m in modules:
        fragments.append(k8s_analysis_config_template.substitute(m))
    
    fragments.append('''}''')
    
    nextflow_config = '\n'.join(fragments)
    return nextflow_config
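
The emitted configuration is a plain Nextflow config; the text produced by generate_k8s_config should look roughly like this (image and module names are placeholders, and the exact whitespace follows the templates above):

# Rough shape of the generated k8s Nextflow config (placeholders only).
expected = """
process {
  executor = 'k8s'

    withName:normalize_fasta{
        container = "default/image:tag"
    }
    withName:hmmer_pfam_a{
        container = 'biocontainers/hmmer:v3.1b2dfsg-5-deb_cv1'
    }
}
"""
print(expected)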
+0 −3
from setuptools import setup
# this is only necessary when not using setuptools/distribute
from sphinx.setup_command import BuildDoc
cmdclass = {'build_sphinx': BuildDoc}

setup(
  setup_requires=['pbr'],