Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Project: ljelonek-public/podman-on-slurm

Commits on Source (2)

  • Update nextflow.config to latest findings · 5e9df706
    Lukas Jelonek authored
    On our Slurm cluster the USER variable is not set when only the proxy
    environment variables are passed, so this variable has to be passed as well.
  • Improve cleanup script · 5a128a1b
    Lukas Jelonek authored
    * Add script to list all running cluster nodes
    * Reset podman if it is in a failed state
    * Add logs so that the user can follow what the script does on each host
    * Simplify the command to run it on the whole cluster
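The observation from the first commit can be reproduced directly with srun. The sketch below assumes interactive access to a submit node of the `bcf` partition used in the config that follows; the expected counts reflect the behaviour described in the commit message (only the listed variables, plus Slurm's own, reach the job environment).

```bash
# Sketch: check which variables actually reach the Slurm job environment.
# Per the commit message, only the explicitly exported variables are available,
# so USER is missing unless it is added to the --export list.
srun -p bcf --export=http_proxy,https_proxy,ftp_proxy env | grep -c '^USER='       # expected: 0
srun -p bcf --export=http_proxy,https_proxy,ftp_proxy,USER env | grep -c '^USER='  # expected: 1
```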
nextflow.config
 process {
-  container='ubuntu:24.10'
+  container='ubuntu'
   executor='slurm'
-  clusterOptions="--export=http_proxy,https_proxy,ftp_proxy"
+  clusterOptions="--export=http_proxy,https_proxy,ftp_proxy,USER"
   queue='bcf'
 }
 podman {
   enabled=true
   engineOptions="--cgroup-manager=cgroupfs"
 }
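Nextflow picks up a nextflow.config in the launch directory automatically, so the configuration above can be smoke-tested without extra flags. The pipeline name below is only a stand-in (the stock nextflow-io `hello` example); any pipeline will do.

```bash
# Minimal smoke test (sketch): run from the directory containing the
# nextflow.config shown above; each task should be submitted to the 'bcf'
# Slurm partition and executed inside a podman container.
nextflow run hello
# afterwards, inspect how the jobs were submitted:
sacct --format=JobID,JobName,Partition,State | tail
```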
list-slurm-nodes (new file)
+#!/bin/bash
+sinfo --Node | awk '$3 ~ /bcf/ && $4 != "down" {print $1}' | sort | uniq
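To make the awk filter concrete: in the default `sinfo --Node` listing, column 3 is the partition and column 4 the node state, so the script prints each bcf node that is not down exactly once. The node names below are invented for illustration.

```bash
$ sinfo --Node                  # hypothetical output
NODELIST   NODES PARTITION STATE
node001        1 bcf       idle
node002        1 bcf       mix
node003        1 bcf       down
node004        1 other     idle
$ sinfo --Node | awk '$3 ~ /bcf/ && $4 != "down" {print $1}' | sort | uniq
node001
node002
```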
podman-cleanup
 #!/bin/bash
 #
 # This script removes unused resources for podman on cluster nodes in the bcf slurm cluster.
 # It assumes that the podman run directory is located at /tmp/podman-run-${UID}. It will delete
 # unused images, stopped containers, unused volumes and the podman run directory.
 #
 set -eo pipefail
-PODMAN_DIR=/tmp/podman-run-${UID}
+USER_ID=$(id -u)
+PODMAN_DIR=/tmp/podman-run-${USER_ID}
+PODMAN="podman --cgroup-manager=cgroupfs"
 if [[ -e $PODMAN_DIR ]]
 then
   echo "Podman found on $(hostname)"
-  RUNNING_COUNT=$(podman ps --noheading | wc -l)
-  podman system prune -a -f
-  if [[ $RUNNING_COUNT -gt 0 ]]
+  set +e
+  RUNNING_COUNT=$($PODMAN ps --noheading | wc -l)
+  RET=$?
+  set -e
+  # if podman ps fails, assume that podman is in a corrupted state and reset it
+  if [[ ! ($RET == 0) ]]
   then
-    echo "You still have ${RUNNING_COUNT} jobs running on $(hostname)"
+    echo "Podman is in a corrupt state on $(hostname). Resetting it."
+    $PODMAN system migrate
+    $PODMAN system reset -f
+    echo "Podman reset on $(hostname)"
+  fi
+  if [[ $RUNNING_COUNT -gt 0 || ! ($RET == 0) ]]
+  then
+    echo "You still have ${RUNNING_COUNT} jobs running on $(hostname). Podman directory won't be removed."
   else
     rm -rf $PODMAN_DIR
+    echo "Podman run-directory removed on $(hostname)"
   fi
 else
   echo "Podman not found on $(hostname)"
 fi
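Before cleaning up the whole partition, the script can be tried on a single node first. The sketch below mirrors the srun options from the README command further down; the node name is hypothetical.

```bash
# Trial run on one node (hypothetical hostname 'node001'); run from the scripts directory
srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist=node001 ./podman-cleanup
```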
@@ -21,5 +21,6 @@ sinfo --Node | awk '{print $1}' | tail -n+2
 ## Cleanup podman-data on all hosts in slurm partition
 ```bash
-parallel -i srun -c 1 --mem=128M --export=None --nodelist="{}" /bin/bash <path-to>/cleanupPodman -- $(sinfo --Node | awk '$3 ~ /<partition>/ && $4 != "down" {print $1}' )
+# in scripts directory
+parallel -j1 -i srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist="{}" ./podman-cleanup -- $(./list-slurm-nodes)
 ```
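If the `parallel` command used above is not available on the submit node, the same cleanup can be run sequentially with a plain shell loop. This is a sketch assuming it is started from the scripts directory, like the README command.

```bash
# Sequential fallback without parallel: clean up each reachable bcf node in turn
for node in $(./list-slurm-nodes); do
    srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist="$node" ./podman-cleanup
done
```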