Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Project: ljelonek-public/podman-on-slurm

Commits on Source (2)

  • Update nextflow.config to latest findings · 5e9df706
    Lukas Jelonek authored
    On our Slurm cluster the USER variable is not set when only the proxy
    environment variables are passed, so this variable has to be passed as well.
  • Improve cleanup script · 5a128a1b
    Lukas Jelonek authored
    * Add script to list all running cluster nodes
    * Reset podman if it is in a failed state
    * Add logs so that the user can follow what the script does on each host
    * Simplify the command to run it on the whole cluster
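The observation from the first commit can be reproduced directly with srun. The sketch below assumes interactive access to a submit node of the `bcf` partition used in the config that follows; the expected counts reflect the behaviour described in the commit message (only the listed variables, plus Slurm's own, reach the job environment).

```bash
# Sketch: check which variables actually reach the Slurm job environment.
# Per the commit message, only the explicitly exported variables are available,
# so USER is missing unless it is added to the --export list.
srun -p bcf --export=http_proxy,https_proxy,ftp_proxy env | grep -c '^USER='       # expected: 0
srun -p bcf --export=http_proxy,https_proxy,ftp_proxy,USER env | grep -c '^USER='  # expected: 1
```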
nextflow.config
 process {
-  container='ubuntu:24.10'
+  container='ubuntu'
   executor='slurm'
-  clusterOptions="--export=http_proxy,https_proxy,ftp_proxy"
+  clusterOptions="--export=http_proxy,https_proxy,ftp_proxy,USER"
   queue='bcf'
 }
 podman {
   enabled=true
   engineOptions="--cgroup-manager=cgroupfs"
 }
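Nextflow picks up a nextflow.config in the launch directory automatically, so the configuration above can be smoke-tested without extra flags. The pipeline name below is only a stand-in (the stock nextflow-io `hello` example); any pipeline will do.

```bash
# Minimal smoke test (sketch): run from the directory containing the
# nextflow.config shown above; each task should be submitted to the 'bcf'
# Slurm partition and executed inside a podman container.
nextflow run hello
# afterwards, inspect how the jobs were submitted:
sacct --format=JobID,JobName,Partition,State | tail
```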
list-slurm-nodes (new file)
+#!/bin/bash
+sinfo --Node | awk '$3 ~ /bcf/ && $4 != "down" {print $1}' | sort | uniq
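To make the awk filter concrete: in the default `sinfo --Node` listing, column 3 is the partition and column 4 the node state, so the script prints each bcf node that is not down exactly once. The node names below are invented for illustration.

```bash
$ sinfo --Node                  # hypothetical output
NODELIST   NODES PARTITION STATE
node001        1 bcf       idle
node002        1 bcf       mix
node003        1 bcf       down
node004        1 other     idle
$ sinfo --Node | awk '$3 ~ /bcf/ && $4 != "down" {print $1}' | sort | uniq
node001
node002
```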
podman-cleanup
 #!/bin/bash
 #
 # This script removes unused resources for podman on cluster nodes in the bcf slurm cluster.
 # It assumes that the podman run directory is located at /tmp/podman-run-${UID}. It will delete
 # unused images, stopped containers, unused volumes and the podman run directory.
 #
 set -eo pipefail
-PODMAN_DIR=/tmp/podman-run-${UID}
+USER_ID=$(id -u)
+PODMAN_DIR=/tmp/podman-run-${USER_ID}
+PODMAN="podman --cgroup-manager=cgroupfs"
 if [[ -e $PODMAN_DIR ]]
 then
   echo "Podman found on $(hostname)"
-  RUNNING_COUNT=$(podman ps --noheading | wc -l)
-  podman system prune -a -f
-  if [[ $RUNNING_COUNT -gt 0 ]]
+  set +e
+  RUNNING_COUNT=$($PODMAN ps --noheading | wc -l)
+  RET=$?
+  set -e
+  # if podman ps fails, assume that podman is in a corrupted state and reset it
+  if [[ ! ($RET == 0) ]]
   then
-    echo "You still have ${RUNNING_COUNT} jobs running on $(hostname)"
+    echo "Podman is in a corrupt state on $(hostname). Resetting it."
+    $PODMAN system migrate
+    $PODMAN system reset -f
+    echo "Podman reset on $(hostname)"
+  fi
+  if [[ $RUNNING_COUNT -gt 0 || ! ($RET == 0) ]]
+  then
+    echo "You still have ${RUNNING_COUNT} jobs running on $(hostname). Podman directory won't be removed."
   else
     rm -rf $PODMAN_DIR
+    echo "Podman run-directory removed on $(hostname)"
   fi
 else
   echo "Podman not found on $(hostname)"
 fi
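Before cleaning up the whole partition, the script can be tried on a single node first. The sketch below mirrors the srun options from the README command further down; the node name is hypothetical.

```bash
# Trial run on one node (hypothetical hostname 'node001'); run from the scripts directory
srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist=node001 ./podman-cleanup
```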
@@ -21,5 +21,6 @@ sinfo --Node | awk '{print $1}' | tail -n+2
 ## Cleanup podman-data on all hosts in slurm partition
 ```bash
-parallel -i srun -c 1 --mem=128M --export=None --nodelist="{}" /bin/bash <path-to>/cleanupPodman -- $(sinfo --Node | awk '$3 ~ /<partition>/ && $4 != "down" {print $1}' )
+# in scripts directory
+parallel -j1 -i srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist="{}" ./podman-cleanup -- $(./list-slurm-nodes)
 ```
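If the `parallel` command used above is not available on the submit node, the same cleanup can be run sequentially with a plain shell loop. This is a sketch assuming it is started from the scripts directory, like the README command.

```bash
# Sequential fallback without parallel: clean up each reachable bcf node in turn
for node in $(./list-slurm-nodes); do
    srun -c 1 --mem=64M --export=USER,PATH,PWD --nodelist="$node" ./podman-cleanup
done
```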