Differences between revisions 30 and 86 (spanning 56 versions)
Revision 30 as of 2019-05-14 14:12:24
Size: 10123
Editor: stroth
Comment:
Revision 86 as of 2023-07-11 19:50:53
Size: 24644
Editor: stroth
Comment: Beautify script, use integers for space calculation
Deletions are marked like this. Additions are marked like this.
Line 1: Line 1:
#rev 2020-09-08 stroth
Line 10: Line 12:
After familiarizing yourself with `conda`, read [[Programming/Languages/GPUCPU|further information]] about available platforms on which to use your infrastructure and particularities of the software packages involved. After familiarizing yourself with `conda`, read this [[Programming/Languages/GPUCPU|collection of hints and explanations]] about available platforms on which to use your infrastructure and particularities of the software packages involved.
Line 20: Line 22:
# Locations to store environments
# net_scratch is used as default, local scratch needs to be chosen explicitly
LOCAL_SCRATCH="/scratch/${USER}"
NET_SCRATCH="/itet-stor/${USER}/net_scratch"
declare -i SPACE_AVAILABLE SPACE_MINIMUM_REQUIRED
SPACE_MINIMUM_REQUIRED=5 # [G]

if [[ -z "${1}" ]]; then
    # Default install location
    OPTION='netscratch'
else
    OPTION="${1}"
fi

line=$(printf '%*s\n' "${COLUMNS:-$(tput cols)}" '' | tr ' ' '-')

# Display underlined title to improve readability of script output
function title() {
    echo
    echo "$@"
    echo "${line}"
}

case "${OPTION}" in
h | help | '-h' | '--help')
    title 'Possible installation options are:'
    echo 'Install conda to your local scratch disk:'
    echo "${BASH_SOURCE[0]} localscratch"
    echo
    echo 'Install conda to your directory on net_scratch:'
    echo "${BASH_SOURCE[0]}"
    echo 'or'
    echo "${BASH_SOURCE[0]} netscratch"
    echo
    echo 'Provide a custom location for installation'
    echo "${BASH_SOURCE[0]} /path/to/custom/location"
    echo
    echo "The recommended minimum space requirement for installation is ${SPACE_MINIMUM_REQUIRED} G."
    exit 0
    ;;
l | local | localscratch | '-l' | '-local' | '-localscratch')
    # If local scratch is made available through scratch_net, use its path in
    # order to be able to access it on other hosts through scratch_net
    if grep -q scratch_net /etc/auto.master; then
        CONDA_BASE_DIR="/scratch_net/$(hostname -s)/${USER}"
    else
        CONDA_BASE_DIR="/scratch/${USER}"
    fi
    ;;
n | net | netscratch | '-n' | '-net' | '-netscratch')
    CONDA_BASE_DIR="/itet-stor/${USER}/net_scratch"
    ;;
*)
    CONDA_BASE_DIR="${1}"
    ;;
esac

# Check if this script is started on an Euler login node, if it is, suggest a custom install location and exit
if [[ -z ${HOSTNAME} ]]; then
    host_name=$(hostname -s)
else
    host_name=${HOSTNAME}
fi
if [[ -n ${host_name} ]]; then
    if [[ ${host_name%-*} == 'eu-login' ]]; then
        echo "It seems you're using this script on the Euler cluster."
        echo 'Provide a custom location for installation, for example in your Euler home:'
        echo "${BASH_SOURCE[0]} ${HOME}/conda"
        exit 1
    fi
fi

# Create install location if it doesn't exist
if [[ ! -d "${CONDA_BASE_DIR}" ]]; then
    mkdir -p "${CONDA_BASE_DIR}"
fi

# Check available space on selected install location
SPACE_AVAILABLE=$(($(stat -f --format="%a*%S" "${CONDA_BASE_DIR}") / 1024 / 1024 / 1024))
if ((SPACE_AVAILABLE <= SPACE_MINIMUM_REQUIRED)); then
    title 'Warning!'
    echo "Available space on '${CONDA_BASE_DIR}' is ${SPACE_AVAILABLE} G."
    echo "This is less than the minimum recommendation of ${SPACE_MINIMUM_REQUIRED} G."
    read -p "Press 'y' if you want to continue installing anwyway: " -n 1 -r
    echo
    if [[ ! ${REPLY} =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

# Locations for conda installation, packet cache and virtual environments
CONDA_INSTALL_DIR="${CONDA_BASE_DIR}/conda"
CONDA_PACKET_CACHE_DIR="${CONDA_BASE_DIR}/conda_pkgs"
CONDA_ENV_DIR="${CONDA_BASE_DIR}/conda_envs"

# Abort if pre-existing installation is found
if [[ -d "${CONDA_INSTALL_DIR}" ]]; then
    if [[ -z "$(find "${CONDA_INSTALL_DIR}" -maxdepth 0 -type d -empty 2>/dev/null)" ]]; then
        title 'Checking installation path'
        echo "The installation path '${CONDA_INSTALL_DIR}' is not empty."
        echo 'Aborting installation.'
        exit 1
    fi
fi
Line 29: Line 127:
[[ -z ${PYTHONPATH} ]] || unset PYTHONPATH if [[ -n ${PYTHONPATH} ]]; then
   
unset PYTHONPATH
fi
Line 32: Line 132:
wget -O miniconda.sh "${CONDA_INSTALLER_URL}" \
    && chmod +x miniconda.sh \
    && ./miniconda.sh -b -p "${NET_SCRATCH}/conda" \
    && rm ./miniconda.sh
title 'Downloading and installing conda'
wget -O miniconda.sh "${CONDA_INSTALLER_URL}" &&
    chmod +x miniconda.sh &&
    ./miniconda.sh -b -p "${CONDA_INSTALL_DIR}" &&
    rm ./miniconda.sh
Line 38: Line 139:
eval "$(${NET_SCRATCH}/conda/bin/conda shell.bash hook)"
conda config --add pkgs_dirs "${NET_SCRATCH}/conda_pkgs" --system
conda config --add envs_dirs "${LOCAL_SCRATCH}/conda_envs" --system
conda config --add envs_dirs "${NET_SCRATCH}/conda_envs" --system
title 'Configuring conda'
eval "$(${CONDA_INSTALL_DIR}/bin/conda shell.bash hook)"
conda config --add pkgs_dirs "${CONDA_PACKET_CACHE_DIR}" --system
conda config --add envs_dirs "${CONDA_ENV_DIR}" --system
Line 43: Line 144:
#conda config --set default_threads $(nproc)
conda config --set pip_interop_enabled True
conda config --set channel_priority strict
Line 45: Line 149:
# Prevent conda base environment from using user site-packages
mkdir -p "${CONDA_INSTALL_DIR}/etc/conda/activate.d"
echo '#!/bin/bash
if [[ -n ${PYTHONUSERBASE} ]]; then
    declare -g "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}=${PYTHONUSERBASE}"
    export "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}"
    unset PYTHONUSERBASE
fi' >"${CONDA_INSTALL_DIR}/etc/conda/activate.d/disable-PYTHONUSERBASE.sh"
chmod +x "${CONDA_INSTALL_DIR}/etc/conda/activate.d/disable-PYTHONUSERBASE.sh"

mkdir -p "${CONDA_INSTALL_DIR}/etc/conda/deactivate.d"
echo '#!/bin/bash
COMBOVAR=PYTHONUSERBASE_${CONDA_DEFAULT_ENV}
COMBOVAR_CONTENT=${!COMBOVAR}
if [[ -n ${COMBOVAR_CONTENT} ]]; then
    declare -g "PYTHONUSERBASE=${COMBOVAR_CONTENT}"
    export PYTHONUSERBASE
    unset "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}"
fi' >"${CONDA_INSTALL_DIR}/etc/conda/deactivate.d/reenable-PYTHONUSERBASE.sh"
chmod +x "${CONDA_INSTALL_DIR}/etc/conda/deactivate.d/reenable-PYTHONUSERBASE.sh"
Line 46: Line 171:
title 'Updating conda and conda base environment'
Line 49: Line 175:
# Clean installation
title 'Removing unused packages and caches'
conda clean --all --yes

# Display information about this conda installation
title 'Information about this conda installation'
conda info
Line 50: Line 184:
echo
echo
'Initialize conda immediately:'
echo "eval \"\$(${NET_SCRATCH}/conda/bin/conda shell.bash hook)\""
echo
echo
'Automatically initialize conda for furure shell sessions:'
echo "echo 'eval \"\$(${NET_SCRATCH}/conda/bin/conda shell.bash hook)\"' >> ${HOME}/.bashrc"
title 'Initialize conda immediately'
echo "eval \"\$(${CONDA_INSTALL_DIR}/bin/conda shell.bash hook)\""
title 'Automatically initialize conda for future shell sessions'
echo "echo '[[ -f ${CONDA_INSTALL_DIR}/bin/conda ]] && eval \"\$(${CONDA_INSTALL_DIR}/bin/conda shell.bash hook)\"' >> ${HOME}/.bashrc"
Line 58: Line 190:
echo
echo
'Completely remove conda:'
echo "rm -r ${NET_SCRATCH}/conda ${NET_SCRATCH}/conda_pkgs ${NET_SCRATCH}/conda_envs ${LOCAL_SCRATCH}/conda_envs ${HOME}/.conda"
title 'Completely remove conda'
echo "rm -r ${CONDA_INSTALL_DIR} ${CONDA_INSTALL_DIR}_pkgs ${CONDA_INSTALL_DIR}_envs ${HOME}/.conda"
Line 64: Line 195:
chmod +x install_conda.sh
}}}
and execute the script by issuing
{{{
./install_conda.sh
}}}
When the script ends it prints out commands to initialize `conda` immediately or every time you log in and a command to completely remove your `conda` installation.

Choose your preferred method of initializing `conda` as recommended by the script and note down the deletion command.
chmod +x ./install_conda.sh
}}}
and run the script to show options for choosing [[#conda-storage-locations|storage locations]] by issuing
{{{
./install_conda.sh help
}}}
Then run the script again with the option of your choosing to start the installation.
 *
When the script ends it prints out information about the installation, commands to initialize `conda` immediately or every time you log in and a command to completely remove your `conda` installation.
 * Choose your preferred method of initializing `conda` as recommended by the script and note down the deletion command.
Line 75: Line 206:
The directories listed in the command for complete `conda` removal contain the following data:
||`/itet-stor/$USER/net_scratch/conda`||The miniconda installation||
||`/itet-stor/$USER/net_scratch/conda_pkgs`||Downloaded packages||
||`/itet-stor/$USER/net_scratch/conda_envs`||Virtual environments on NAS where startup time is not important||
||`/scratch/$USER/conda_envs`||[[#Creates_the_environment_.22my_env.22_in_the_specified_location|Virtual environments on local disk]] which need to start fast||
||`/home/$USER/.conda`||Personal conda configuration||

The purpose of this configuration is to store data according to its importance and prevent using up your quota. If you intend to deviate from the default configuration, consult the [[Services/StorageOverview|storage overview]] to choose your storage locations adequately and follow these recommendations:
=== Pre-set install locations ===
The purpose of the install script's options is to store data according to its importance and prevent using up your quota. The difference between the two pre-set installation locations is:
 * '''netscratch''': fail-safe because it resides on a RAID but slower startup times as it is a network share
 * '''localscratch''': single point of failure because it is just one disk but faster startup times as it is a local disk
Neither of the pre-set locations has an automatic backup. Use the recommended [[#Backup|backup practice]] instead.

=== Custom install location ===
If you intend to use a custom install location, consult the [[Services/StorageOverview|storage overview]] to choose it adequately and follow these recommendations:
Line 89: Line 219:
=== conda directories ===
The installation creates the following two directories in the install location:
 * '''conda''': Contains the miniconda installation
 * '''conda_pkgs''': Contains the cache for downloaded and decompressed packages
Creating the [[#Create_an_environment_called_.22my_env.22_with_packages_.22package1.22_and_.22package2.22_installed|first environment]] creates an additional directory in the install location:
 * '''conda_envs''': Contains the created environment(s)
Line 90: Line 227:
`conda` allows to separate installed software packages from each other by creating so-called ''environments''. Using environments is best practice to generate deterministic and reproducible tools. `conda` allows to separate installed software packages from each other by creating so-called ''[[#Environments|environments]]''. Using environments is best practice to generate deterministic and reproducible tools.
Line 95: Line 232:
It is best practice to separate packages in different environments if they don't need to interact. It is best practice to separate packages in different [[#Environments|environments]] if they don't need to interact.
Line 101: Line 238:
=== Installation examples ===
For `conda`, `python` itself is just a software package as any other. After analyzing all packages to be installed it decides which `python` version works for the whole environment. This means different environments may contain differing versions of `python`.

==== Creating an environment with a specific python version ====
 * Time to install: ~1 minute
 * Space required: ~140M
{{{
conda create --name py38 python=3.8.5
}}}

==== Creating pytorch/tensorflow environments ====
The following examples show how to '''create environments on a managed client''', to '''run on''':
 1. '''A managed client''', which typically has a low memory GPU. Typical use case is testing for later computations.
 1. '''A GPU node''', which has several high memory GPUs. The typical use case is running computations.

Further information for all examples:
 * The version of `cudatoolkit` has to [[Programming/Languages/GPUCPU#Matching_toolkit_versions_to_installed_driver|match the NVIDIA driver currently installed on a managed client]]
 * Be sure to read [[Programming/Languages/GPUCPU#Installing_a_specific_toolkit_version_with_conda|Installing a specific toolkit version]] if you intend to use this example for more than just first steps with conda.

===== pytorch and CUDA toolkit 10 for a managed client =====
 * Time to install: ~5 minutes
 * Space required: ~2.5G
{{{
conda create --name pytcu10 pytorch torchvision cudatoolkit=10.1 --channel pytorch
}}}

===== pytorch and CUDA toolkit 11 to run on GPU nodes =====
 * Time to install: ~5 minutes
 * Space required: ~2.5G
{{{
CONDA_OVERRIDE_CUDA=11.7 conda create --name pytcu11 pytorch torchvision pytorch-cuda --channel pytorch --channel nvidia
}}}

===== tensorflow and CUDA toolkit 10 for a managed client =====
 * Time to install: ~5 minutes
 * Space required: ~2G
{{{
conda create --name tencu10 tensorflow-gpu cudatoolkit=10.1 --channel conda-forge
}}}

===== tensorflow and CUDA toolkit 11 to run on GPU nodes =====
 * Time to install: ~5 minutes
 * Space required: ~2G
{{{
CONDA_OVERRIDE_CUDA=11.4 conda create --name tencu11 tensorflow-gpu cudatoolkit=11.4 --channel conda-forge
}}}
Line 102: Line 286:
`conda` automatically installs a default environment called ''base'' with a `python` interpreter, [[https://pypi.org/project/pip/|pip]] and other tools to start coding in python. Whether you want to use and extend this environment or create your own is up to you. At the time of writing this information it is not possible to remove the base environment. `conda` automatically installs a default environment called ''base'' with a `python` interpreter, [[https://pypi.org/project/pip/|pip]] and other tools to start coding in python.
 * ⚠ It is strongly discouraged to use the ''base'' environment for projects. Its purpose is to provide the tools to maintain other environments, nothing else.
 * ⚠ Set up a new environment for each project. This ensures reproducibility and facilitates environment related problem solving.
 * ⚠ Only auto activate an environment in your shell initialisation script if you understand exactly what this entails.
 * It's good practice to make sure [[#Check_for_conda_initialization_and_active_environment|conda is initialized and the wanted environment is active]] before trying to use it
Line 127: Line 316:
==== Export the active environment definition to the file "my_env.yml" ====
This command is also the basis for [[#Backup|backing up]] an environment.
{{{
conda env export > my_env.yml
}}}
==== Export the environment "my_env" to the definition file "my_env.yml" for an identical platform ====
The definition file will include all dependencies automatically installed. These can be different on different platforms.
{{{
conda env export --json --name my_env > my_env.yml
}}}
This command is also the basis for [[#Backup|backing up]] an environment.<<BR>>

==== Export the environment "my_env" to the definition file "my_env.yml" for a different platform ====
To make an environment work on a different platform its definition file should only contain packages you explicitly installed. This is achieved by adding the option `--from-history`:
{{{
conda env export --json --name my_env --from-history > my_env.yml
}}}
Line 136: Line 333:
==== Creates the environment "my_env" in the specified location ==== Recreate an exported environment under the new name, `new_env_name`:
{{{
conda env create --file my_env.yml --name new_env_name
}}}

==== Create the environment "my_env" in the specified location ====
Line 146: Line 348:
==== Pack, move and unpack environment ====
A use case is to pack a large environment tested to work in the cluster which suffers from slow startup times due to its location on a mounted network share. Such an environment can be packed into an archive so it's ready to be transferred at the start of a cluster job to the cluster node's local scratch, unpacked and started from there.

The tool used to do this is [[https://conda.github.io/conda-pack/|conda-pack]].

 1. Install the tool in your base environment: {{{#!highlight bash numbers=disable
conda install --name base --yes conda-pack
}}}
 Display its options with `conda-pack --help` to understand the next step.
 1. Pack the environment into an archive on the host you're working on: {{{#!highlight bash numbers=disable
mkdir -p /scratch/$USER/ # Create a directory to store the archive in
conda pack --name my_env --format tar.gz --output /scratch/$USER/my_env.tar.gz --dest-prefix /scratch/$USER/my_env
hostname # Display the hostname where you stored the archive for the transfer in your job script
}}}
 1. At the start of your job script, transfer the archive to the cluster node's local scratch: {{{#!highlight bash numbers=disable
mkdir -p /scratch/$USER/my_env # Create the directory with the destination prefix defined in the previous step
rsync -a --inplace <hostname>:/scratch/$USER/my_env.tar.gz /scratch/$USER/ # Replace <hostname> to what was displayed in the previous step and sync the archive to the local scratch
tar -xf /scratch/$USER/my_env.tar.gz -C /scratch/$USER/my_env # Unpack the archive into the destination prefix directory instead of the current working directory
}}}
 1. Activate the unpacked environment: {{{#!highlight bash numbers=disable
source /scratch/$USER/my_env/bin/activate
}}}

==== Listing, adding and removing environment variables in an environment ====
Environment variables can be added to an environment. Variables defined like this will be listed in an exported definition file.<<BR>>
List all environment variables defined for an active environment:
{{{
conda env config vars list
}}}
Set the environment variable `my_var` to value `value` in an active environment:
{{{
conda env config vars set my_var=value
}}}
Unset the environment variable `my_var` in an active environment:
{{{
conda env config vars unset my_var
}}}
Line 152: Line 391:
==== Search for packages with "pack" in their name ====
{{{
conda search '*pack*' # Quote the pattern so the shell hands it to conda instead of expanding it against file names
}}}
Line 156: Line 399:
==== Install packages with version requirements ====
{{{
conda install package1=1.2.3 'package2>=2.3.4' 'package3>=1.1,<=2.0' "package4 [version='3.1|3.5']"
}}}
Where the package versions installed should be:
 * `package1`: exactly version `1.2.3`
 * `package2`: at least version `2.3.4`
 * `package3`: anything between `1.1` and `2.0`
 * `package4`: exactly version `3.1` or `3.5`
The correct placement of single and double quotes is important to prevent the shell from interpreting angle brackets or pipe symbols.
Line 160: Line 414:
==== Add software channels ====
The list of available software can be extended by adding channels of selected repositories.
The priority of the channels is set in order of configuration. In the following example, [[https://conda-forge.org/|Conda-Forge]] has the highest priority over [[https://bioconda.github.io/|Bioconda]], with the default channel at the lowest priority.
{{{
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
}}}
==== Show software channels ====
The following command shows the available channels in order of priority (highest first):
{{{
conda config --show channels
}}}
==== Search for public (unofficial) packages ====
Packages maintained by the public and their respective channels can be searched on [[https://anaconda.org/|Anaconda Cloud]].

==== Installing packages with pip ====
Using `pip` to install packages in a `conda` environment is not recommended. The reasons are explained extensively in the article [[https://www.anaconda.com/blog/using-pip-in-a-conda-environment|Using Pip in a Conda Environment]].<<BR>>
In case a package is only available through `pip`, follow the best practices checklist outlined in this article. The following is a short summary of the checklist:
 * Install as many dependencies as possible with `conda` before resorting to `pip`
 * Set the experimental option `conda config --set pip_interop_enabled True`
 . For details see [[https://docs.conda.io/projects/conda/en/latest/user-guide/configuration/pip-interoperability.html|Improving interoperability with pip]]
 * Don't run `pip` with a non-default option `--upgrade-strategy`, keep the default of `--upgrade-strategy only-if-needed`

=== Miscellaneous ===
==== Display information about the current conda installation ====
{{{
conda info
}}}
==== Check for conda initialization and active environment ====
To make sure conda is initialized and an environment is active the following script can be started:
{{{#!highlight bash numbers=disable
#!/bin/bash
# Check that conda is initialized and that an environment is active.
# On success the output of `conda info` is shown.
# Exit codes:
#   1 - the environment variable CONDA_EXE is not set
#   2 - the file CONDA_EXE points to is not executable
#   3 - the environment variable CONDA_DEFAULT_ENV is not set

# First check: CONDA_EXE has to be set
if [[ -z ${CONDA_EXE} ]]; then
    echo 'Environment variable CONDA_EXE is not set.'
    echo 'Please make sure conda is properly initialized.'
    exit 1
else
    # Second check: CONDA_EXE has to point to an executable file
    if [[ ! -x ${CONDA_EXE} ]]; then
        echo "${CONDA_EXE} is not executable."
        echo "Please make sure it exists and is executable."
        exit 2
    else
        # Third check: CONDA_DEFAULT_ENV has to be set
        if [[ -z ${CONDA_DEFAULT_ENV} ]]; then
            echo 'Environment variable CONDA_DEFAULT_ENV is not set.'
            echo 'Please make sure to activate a conda environment.'
            exit 3
        else
            # All checks passed
            conda info
        fi
    fi
fi
}}}
Save this script as `conda_env_check.sh`, make it executable with
{{{
chmod +x ./conda_env_check.sh
}}}
A use case is to start this script within a [[Services/SLURM#sbatch_-.3E_Submitting_a_job|cluster job]] before using the first command installed in an environment. You can either run the script from your cluster job or place your job commands into the innermost branch after the command `conda info`.

==== Change TMPDIR ====
If the message ''"Not enough space on partition mounted at /tmp."'' is shown during a package installation, create a directory in a location with enough available space and point the TMPDIR variable to this location:
{{{
TMPDIR="/scratch/$USER/tmp" && mkdir -p "${TMPDIR}" && export TMPDIR
}}}

==== Speed up conda ====
[[https://github.com/mamba-org/mamba|Mamba]] is a faster reimplementation of the conda package manager in C++. Install it in your base environment with:
{{{
conda install mamba -n base -c conda-forge
}}}
Then replace `conda` by `mamba` wherever you previously used the `conda` command.
Line 170: Line 496:
}}}
==== Convenient alias ====
Above commands can be conveniently pulled together and defined as an alias for easier manual updates of the `base` environment and subsequent cleanup:
{{{
alias conda_update='conda update conda --yes && conda update -n base --update-all --yes && conda clean --all --yes'
Line 192: Line 523:

=== Installation examples ===
For `conda`, `python` itself is just a software package as any other. After analyzing all packages to be installed it decides which `python` version works for the whole environment. This means different environments may contain differing versions of `python`.

==== Creating an environment with a specific python version ====
 * Time to install: ~1 minute
 * Space required: ~140M
{{{
conda create --name py37 python=3.7.3
}}}
==== Creating an environment with the GPU version of pytorch and CUDA toolkit 10 ====
 * Time to install: ~5 minutes
 * Space required: ~2.5G
{{{
conda create --name pytcu10 pytorch torchvision cudatoolkit=10.0 --channel pytorch
}}}
==== Creating an environment with the GPU version of tensorflow and CUDA toolkit 10 ====
 * Time to install: ~5 minutes
 * Space required: ~2G
{{{
conda create --name tencu10 tensorflow-gpu cudatoolkit=10.0
}}}

Contents

  1. Setting up a personal python development infrastructure
    1. Installing conda
    2. conda storage locations
      1. Pre-set install locations
      2. Custom install location
      3. conda directories
    3. Using conda
      1. Installation examples
        1. Creating an environment with a specific python version
        2. Creating pytorch/tensorflow environments
          1. pytorch and CUDA toolkit 10 for a managed client
          2. pytorch and CUDA toolkit 11 to run on GPU nodes
          3. tensorflow and CUDA toolkit 10 for a managed client
          4. tensorflow and CUDA toolkit 11 to run on GPU nodes
      2. Environments
        1. Create an environment called "my_env" with packages "package1" and "package2" installed
        2. Activate the environment called "my_env"
        3. Deactivate the current environment
        4. List available environments
        5. Remove the environment called "my_env"
        6. Create a cloned environment named "cloned_env" from "original_env"
        7. Export the environment "my_env" to the definition file "my_env.yml" for an identical platform
        8. Export the environment "my_env" to the definition file "my_env.yml" for a different platform
        9. Recreate a previously exported environment
        10. Create the environment "my_env" in the specified location
        11. Update an active environment
        12. Pack, move and unpack environment
        13. Listing, adding and removing environment variables in an environment
      3. Packages
        1. Search for a package named "package1"
        2. Search for packages with "pack" in their name
        3. Install the package named "package1" in the active environment
        4. Install packages with version requirements
        5. List packages installed in the active environment
        6. Add software channels
        7. Show software channels
        8. Search for public (unofficial) packages
        9. Installing packages with pip
      4. Miscellaneous
        1. Display information about the current conda installation
        2. Check for conda initialization and active environment
        3. Change TMPDIR
        4. Speed up conda
      5. Maintenance
        1. Remove index cache, lock files, unused cache packages, and tarballs
        2. Update conda without any active environment
        3. Convenient alias
      6. Backup

Setting up a personal python development infrastructure

This page shows how to set up a personal python development infrastructure, how to use it, how to maintain it and make backups of your project environments.

Some examples for software installation in the field of data sciences are provided.

The infrastructure is driven by the conda package manager which accesses the Anaconda repositories to install software.

After familiarizing yourself with conda, read this collection of hints and explanations about available platforms on which to use your infrastructure and particularities of the software packages involved.

Installing conda

  • Time to install: ~1.5 minutes
  • Space required: ~370M

To provide conda, the minimal anaconda distribution miniconda can be installed and configured for the D-ITET infrastructure with the following bash script:

#!/bin/bash

# Space values are handled as integers; the minimum is given in gigabytes
declare -i SPACE_AVAILABLE SPACE_MINIMUM_REQUIRED
SPACE_MINIMUM_REQUIRED=5 # [G]

# The first argument selects the install location; a missing or empty
# argument falls back to the default install location
OPTION="${1:-netscratch}"

# Row of dashes, COLUMNS wide or as wide as tput reports the terminal to be
line=$(printf '%*s\n' "${COLUMNS:-$(tput cols)}" '' | tr ' ' '-')

# Display underlined title to improve readability of script output
# Globals:   line (read) - the underline printed below the title
# Arguments: the words of the title, printed separated by single spaces
# Outputs:   an empty line, the title and the underline on stdout
title() {
    # Join the arguments with a space, independent of the caller's IFS
    local IFS=' '
    # printf prints the title verbatim; echo would treat a title such as
    # '-n' or '-e' as an option instead of text
    printf '\n%s\n%s\n' "$*" "${line}"
}

# Translate the chosen option into the base directory of the installation,
# or show the available options and exit
case "${OPTION}" in
h | help | '-h' | '--help')
    title 'Possible installation options are:'
    printf '%s\n' \
        'Install conda to your local scratch disk:' \
        "${BASH_SOURCE[0]} localscratch" \
        '' \
        'Install conda to your directory on net_scratch:' \
        "${BASH_SOURCE[0]}" \
        'or' \
        "${BASH_SOURCE[0]} netscratch" \
        '' \
        'Provide a custom location for installation' \
        "${BASH_SOURCE[0]} /path/to/custom/location" \
        '' \
        "The recommended minimum space requirement for installation is ${SPACE_MINIMUM_REQUIRED} G."
    exit 0
    ;;
l | local | localscratch | '-l' | '-local' | '-localscratch')
    # Start from the plain local scratch path. If local scratch is made
    # available through scratch_net, use that path instead in order to be
    # able to access it on other hosts through scratch_net
    CONDA_BASE_DIR="/scratch/${USER}"
    if grep -q scratch_net /etc/auto.master; then
        CONDA_BASE_DIR="/scratch_net/$(hostname -s)/${USER}"
    fi
    ;;
n | net | netscratch | '-n' | '-net' | '-netscratch')
    CONDA_BASE_DIR="/itet-stor/${USER}/net_scratch"
    ;;
*)
    # Anything else is taken as a custom install location
    CONDA_BASE_DIR="${1}"
    ;;
esac

# Refuse to run on an Euler login node: suggest a custom install location and exit.
# The host name is taken from HOSTNAME, or asked from hostname if HOSTNAME is empty
host_name="${HOSTNAME:-$(hostname -s)}"
# Strip the part from the last dash on and compare the rest to 'eu-login'
if [[ -n ${host_name} && ${host_name%-*} == 'eu-login' ]]; then
    echo "It seems you're using this script on the Euler cluster."
    echo 'Provide a custom location for installation, for example in your Euler home:'
    echo "${BASH_SOURCE[0]} ${HOME}/conda"
    exit 1
fi

# Create install location if it doesn't exist. Abort if that fails, every
# following step needs this directory
if [[ ! -d "${CONDA_BASE_DIR}" ]]; then
    if ! mkdir -p -- "${CONDA_BASE_DIR}"; then
        echo "Unable to create the install location '${CONDA_BASE_DIR}'." >&2
        echo 'Aborting installation.' >&2
        exit 1
    fi
fi

# Check available space on selected install location
# stat prints the number of free blocks available to non-superusers (%a) and
# the fundamental block size (%S), their product is the available space in bytes
if ! fs_stats=$(stat -f --format='%a %S' "${CONDA_BASE_DIR}"); then
    echo "Unable to determine the available space on '${CONDA_BASE_DIR}'." >&2
    echo 'Aborting installation.' >&2
    exit 1
fi
read -r free_blocks block_size <<<"${fs_stats}"
SPACE_AVAILABLE=$((free_blocks * block_size / 1024 / 1024 / 1024))
# The integer division truncates to whole gigabytes, so only a value strictly
# below the minimum means there is less space than recommended
if ((SPACE_AVAILABLE < SPACE_MINIMUM_REQUIRED)); then
    title 'Warning!'
    echo "Available space on '${CONDA_BASE_DIR}' is ${SPACE_AVAILABLE} G."
    echo "This is less than the minimum recommendation of ${SPACE_MINIMUM_REQUIRED} G."
    # Read a single key press, anything but 'y' or 'Y' aborts the installation
    read -p "Press 'y' if you want to continue installing anyway: " -n 1 -r
    echo
    if [[ ! ${REPLY} =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

# Directory layout below the base directory: the conda installation,
# the packet cache and the virtual environments
CONDA_INSTALL_DIR="${CONDA_BASE_DIR}/conda"
CONDA_PACKET_CACHE_DIR="${CONDA_BASE_DIR}/conda_pkgs"
CONDA_ENV_DIR="${CONDA_BASE_DIR}/conda_envs"

# Abort if pre-existing installation is found: the installation directory
# exists and find does not report it as an empty directory
if [[ -d "${CONDA_INSTALL_DIR}" ]] &&
    [[ -z "$(find "${CONDA_INSTALL_DIR}" -maxdepth 0 -type d -empty 2>/dev/null)" ]]; then
    title 'Checking installation path'
    echo "The installation path '${CONDA_INSTALL_DIR}' is not empty."
    echo 'Aborting installation.'
    exit 1
fi

# Installer of choice for conda
CONDA_INSTALLER_URL='https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh'

# Unset pre-existing python paths
if [[ -n ${PYTHONPATH} ]]; then
    unset PYTHONPATH
fi

# Download latest version of miniconda and install it
# The installer is stored as miniconda.sh in the current working directory
# and removed again, whether the installation succeeds or not
title 'Downloading and installing conda'
if ! { wget -O miniconda.sh "${CONDA_INSTALLER_URL}" &&
    chmod +x miniconda.sh &&
    ./miniconda.sh -b -p "${CONDA_INSTALL_DIR}"; }; then
    # Stop here, the following steps need a working conda installation
    rm -f ./miniconda.sh
    echo 'Downloading or installing conda failed.' >&2
    echo 'Aborting installation.' >&2
    exit 1
fi
rm ./miniconda.sh

# Configure conda
title 'Configuring conda'
# Make the conda command available in this shell. The path to the conda
# executable is quoted to keep custom install locations containing
# whitespace intact
eval "$("${CONDA_INSTALL_DIR}/bin/conda" shell.bash hook)"
# Register the packet cache and the environment directory
conda config --add pkgs_dirs "${CONDA_PACKET_CACHE_DIR}" --system
conda config --add envs_dirs "${CONDA_ENV_DIR}" --system
conda config --set auto_activate_base false
#conda config --set default_threads $(nproc)
conda config --set pip_interop_enabled True
conda config --set channel_priority strict
conda deactivate

# Prevent conda base environment from using user site-packages.
# Two hook scripts are written into the installation:
#   - on activation, a set PYTHONUSERBASE is stashed in a variable named
#     PYTHONUSERBASE_<environment name> and then unset
#   - on deactivation, the stashed value is restored and the stash removed
# The heredoc delimiters are quoted so nothing is expanded while writing.
activate_hook="${CONDA_INSTALL_DIR}/etc/conda/activate.d/disable-PYTHONUSERBASE.sh"
mkdir -p "${CONDA_INSTALL_DIR}/etc/conda/activate.d"
cat >"${activate_hook}" <<'EOF'
#!/bin/bash
if [[ -n ${PYTHONUSERBASE} ]]; then
    declare -g "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}=${PYTHONUSERBASE}"
    export "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}"
    unset PYTHONUSERBASE
fi
EOF
chmod +x "${activate_hook}"

deactivate_hook="${CONDA_INSTALL_DIR}/etc/conda/deactivate.d/reenable-PYTHONUSERBASE.sh"
mkdir -p "${CONDA_INSTALL_DIR}/etc/conda/deactivate.d"
cat >"${deactivate_hook}" <<'EOF'
#!/bin/bash
COMBOVAR=PYTHONUSERBASE_${CONDA_DEFAULT_ENV}
COMBOVAR_CONTENT=${!COMBOVAR}
if [[ -n ${COMBOVAR_CONTENT} ]]; then
    declare -g "PYTHONUSERBASE=${COMBOVAR_CONTENT}"
    export PYTHONUSERBASE
    unset "PYTHONUSERBASE_${CONDA_DEFAULT_ENV}"
fi
EOF
chmod +x "${deactivate_hook}"

# Update conda and conda base environment
# First the conda package itself, then every package in the environment
# named 'base'; --yes answers confirmation prompts automatically.
title 'Updating conda and conda base environment'
conda update conda --yes
conda update -n 'base' --update-all --yes

# Clean installation
# Frees space taken by caches and packages that are no longer used.
title 'Removing unused packages and caches'
conda clean --all --yes

# Display information about this conda installation
# Lets the user verify the configured locations at the end of the run.
title 'Information about this conda installation'
conda info

# Print the commands the user needs after the installation. Nothing is
# executed here, the commands are only shown for copy and paste.
conda_bin="${CONDA_INSTALL_DIR}/bin/conda"
hook_cmd="eval \"\$(${conda_bin} shell.bash hook)\""

# Command to initialize conda in the current shell
title 'Initialize conda immediately'
printf '%s\n' "${hook_cmd}"

# Command appending the guarded initialization to the user's .bashrc
title 'Automatically initialize conda for future shell sessions'
printf '%s\n' "echo '[[ -f ${conda_bin} ]] && ${hook_cmd}' >> ${HOME}/.bashrc"

# Command removing the installation, packet cache, environments and ~/.conda
title 'Completely remove conda'
printf '%s\n' "rm -r ${CONDA_INSTALL_DIR} ${CONDA_INSTALL_DIR}_pkgs ${CONDA_INSTALL_DIR}_envs ${HOME}/.conda"

Save this script as install_conda.sh, make it executable with

chmod +x ./install_conda.sh

and run the script to show options for choosing storage locations by issuing

./install_conda.sh help

Then run the script again with the option of your choosing to start the installation.

  • When the script ends it prints out information about the installation, commands to initialize conda immediately or every time you log in and a command to completely remove your conda installation.

  • Choose your preferred method of initializing conda as recommended by the script and note down the deletion command.

conda storage locations

Pre-set install locations

The purpose of the install script's options is to store data according to its importance and to prevent using up your quota. The difference between the two pre-set installation locations is:

  • netscratch: fail-safe because it resides on a RAID but slower startup times as it is a network share

  • localscratch: single point of failure because it is just one disk but faster startup times as it is a local disk

Neither of the pre-set locations has an automatic backup. Use the recommended backup practice instead.

Custom install location

If you intend to use a custom install location, consult the storage overview to choose it adequately and follow these recommendations:

  • Reproducible, space consuming data like environments and package cache belongs into storage class SCRATCH

  • Code written by yourself should be backed up regularly. It consumes a small amount of space, therefore its ideal location is in storage class HOME and additionally checked into your git repository.

  • Data generated over a long time period which would be time consuming to recreate from scratch and is in use regularly should be stored in the storage class PROJECT.

  • Data generated as a final result which is not needed for ongoing work but needs to be available for later generations should be stored in the storage class ARCHIVE.

conda directories

The installation creates the following two directories in the install location:

  • conda: Contains the miniconda installation

  • conda_pkgs: Contains the cache for downloaded and decompressed packages

Creating the first environment creates an additional directory in the install location:

  • conda_envs: Contains the created environment(s)

Using conda

conda allows you to separate installed software packages from each other by creating so-called environments. Using environments is best practice to generate deterministic and reproducible tools.

conda takes care of dependencies common to the packages it is asked to install. If two packages have a common dependency but define a differing range of version requirements of said dependency, conda chooses the highest common version number. This means the dependency installed in an environment with both packages together might have a lower version number than in environments separating both packages.

It is best practice to separate packages into different environments if they don't need to interact.

For a complete guide to conda see the official documentation.

The official cheat sheet contains a compact summary of common commands. An abbreviated list to get you started is shown below.

Installation examples

For conda, python itself is just a software package as any other. After analyzing all packages to be installed it decides which python version works for the whole environment. This means different environments may contain differing versions of python.

Creating an environment with a specific python version

  • Time to install: ~1 minute
  • Space required: ~140M

conda create --name py38 python=3.8.5

Creating pytorch/tensorflow environments

The following examples show how to create environments on a managed client, to run on:

  1. A managed client, which typically has a low memory GPU. Typical use case is testing for later computations.

  2. A GPU node, which has several high memory GPUs. The typical use case is running computations.

Further information for all examples:

pytorch and CUDA toolkit 10 for a managed client
  • Time to install: ~5 minutes
  • Space required: ~2.5G

conda create --name pytcu10 pytorch torchvision cudatoolkit=10.1 --channel pytorch

pytorch and CUDA toolkit 11 to run on GPU nodes
  • Time to install: ~5 minutes
  • Space required: ~2.5G

CONDA_OVERRIDE_CUDA=11.7 conda create --name pytcu11 pytorch torchvision pytorch-cuda --channel pytorch --channel nvidia

tensorflow and CUDA toolkit 10 for a managed client
  • Time to install: ~5 minutes
  • Space required: ~2G

conda create --name tencu10 tensorflow-gpu cudatoolkit=10.1 --channel conda-forge

tensorflow and CUDA toolkit 11 to run on GPU nodes
  • Time to install: ~5 minutes
  • Space required: ~2G

CONDA_OVERRIDE_CUDA=11.4 conda create --name tencu11 tensorflow-gpu cudatoolkit=11.4 --channel conda-forge

Environments

conda automatically installs a default environment called base with a python interpreter, pip and other tools to start coding in python.

  • ⚠ It is strongly discouraged to use the base environment for projects. Its purpose is to provide the tools to maintain other environments, nothing else.

  • ⚠ Set up a new environment for each project. This ensures reproducibility and facilitates environment related problem solving.
  • ⚠ Only auto activate an environment in your shell initialisation script if you understand exactly what this entails.
  • It's good practice to make sure conda is initialized and the wanted environment is active before trying to use it

Create an environment called "my_env" with packages "package1" and "package2" installed

conda create --name my_env package1 package2

Activate the environment called "my_env"

conda activate my_env

Deactivate the current environment

conda deactivate

List available environments

conda env list

Remove the environment called "my_env"

conda remove --name my_env --all

Create a cloned environment named "cloned_env" from "original_env"

conda create --name cloned_env --clone original_env

Export the environment "my_env" to the definition file "my_env.yml" for an identical platform

The definition file will include all dependencies automatically installed. These can be different on different platforms.

conda env export --json --name my_env > my_env.yml

This command is also the basis for backing up an environment.

Export the environment "my_env" to the definition file "my_env.yml" for a different platform

To make an environment work on a different platform its definition file should only contain packages you explicitly installed. This is achieved by adding the option --from-history:

conda env export --json --name my_env --from-history > my_env.yml

Recreate a previously exported environment

conda env create --file my_env.yml

Recreate an exported environment under the new name, new_env_name:

conda env create --file my_env.yml --name new_env_name

Create the environment "my_env" in the specified location

This example is for creating the environment on local scratch for faster disk access

conda create --prefix /scratch/$USER/conda_envs/my_env

Update an active environment

Make sure to create a backup by exporting the active environment before updating.

conda update --update-all

Pack, move and unpack environment

A use case is to pack a large environment tested to work in the cluster which suffers from slow startup times due to its location on a mounted network share. Such an environment can be packed into an archive so it's ready to be transferred at the start of a cluster job to the cluster node's local scratch, unpacked and started from there.

The tool used to do this is conda-pack.

  1. Install the tool in your base environment:

    conda install --name base --yes conda-pack
    

    Display its options with conda-pack --help to understand the next step.

  2. Pack the environment into an archive on the host you're working on:

    mkdir -p /scratch/$USER/ # Create a directory to store the archive in
    conda pack --name my_env --format tar.gz --output /scratch/$USER/my_env.tar.gz --dest-prefix /scratch/$USER/my_env 
    hostname # Display the hostname where you stored the archive for the transfer in your job script
    
    
  3. At the start of your job script, transfer the archive to the cluster node's local scratch:

    mkdir -p /scratch/$USER/my_env # Create the directory with the destination prefix defined in the previous step
    rsync -a --inplace <hostname>:/scratch/$USER/my_env.tar.gz /scratch/$USER/ # Replace <hostname> to what was displayed in the previous step and sync the archive to the local scratch
    tar -xf /scratch/$USER/my_env.tar.gz -C /scratch/$USER/my_env # Unpack the archive into the destination prefix (without -C it would be unpacked into the current working directory)
    
    
  4. Activate the unpacked environment:

    source /scratch/$USER/my_env/bin/activate 
    

Listing, adding and removing environment variables in an environment

Environment variables can be added to an environment. Variables defined like this will be listed in an exported definition file.
List all environment variables defined for an active environment:

conda env config vars list

Set the environment variable my_var to value value in an active environment:

conda env config vars set my_var=value

Unset the environment variable my_var in an active environment:

conda env config vars unset my_var

Packages

Search for a package named "package1"

conda search package1

Search for packages with "pack" in their name

conda search '*pack*'

Install the package named "package1" in the active environment

conda install package1

Install packages with version requirements

conda install package1=1.2.3 'package2>=2.3.4' 'package3>=1.1,<=2.0' "package4 [version='3.1|3.5']"

Where the package versions installed should be:

  • package1: exactly version 1.2.3

  • package2: at least version 2.3.4

  • package3: anything between 1.1 and 2.0

  • package4: exactly version 3.1 or 3.5

The correct placement of single and double quotes is important to prevent parsing of bra/ket or pipe symbols.

List packages installed in the active environment

conda list

Add software channels

The list of available software can be extended by adding channels of selected repositories. The priority of the channels is set in order of configuration. In the following example, Conda-Forge has the highest priority over Bioconda, with the default channel at the lowest priority.

conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge

Show software channels

The following command shows the available channels in order of priority (highest first):

conda config --show channels

Search for public (unofficial) packages

Packages maintained by the public and their respective channels can be searched on Anaconda Cloud.

Installing packages with pip

Using pip to install packages in a conda environment is not recommended. The reasons are explained extensively in the article Using Pip in a Conda Environment.
In case a package is only available through pip, follow the best practices checklist outlined in this article. The following is a short summary of the checklist:

  • Install as many dependencies as possible with conda before resorting to pip

  • Set the experimental option conda config --set pip_interop_enabled True

  • For details see Improving interoperability with pip

  • Don't run pip with a non-default option --upgrade-strategy, keep the default of --upgrade-strategy only-if-needed

Miscellaneous

Display information about the current conda installation

conda info

Check for conda initialization and active environment

To make sure conda is initialized and an environment is active the following script can be started:

#!/bin/bash

# Verify that conda is initialized and an environment is active.
# Exit codes: 1 = CONDA_EXE is not set, 2 = CONDA_EXE is not executable,
#             3 = no environment is active (CONDA_DEFAULT_ENV is not set)
if [[ -n ${CONDA_EXE} ]]; then
    if [[ -x ${CONDA_EXE} ]]; then
        if [[ -n ${CONDA_DEFAULT_ENV} ]]; then
            # All checks passed: conda is usable and an environment is active
            conda info
        else
            echo 'Environment variable CONDA_DEFAULT_ENV is not set.'
            echo 'Please make sure to activate a conda environment.'
            exit 3
        fi
    else
        echo "${CONDA_EXE} is not executable."
        echo "Please make sure it exists and is executable."
        exit 2
    fi
else
    echo 'Environment variable CONDA_EXE is not set.'
    echo 'Please make sure conda is properly initialized.'
    exit 1
fi

Save this script as conda_env_check.sh, make it executable with

chmod +x ./conda_env_check.sh

A use case is to start this script within a cluster job before using the first command installed in an environment. You can either run the script from your cluster job or place your job commands into the innermost branch after the command conda info.

Change TMPDIR

If the message "Not enough space on partition mounted at /tmp." is shown during a package installation, create a directory in a location with enough available space and point the TMPDIR variable to this location:

TMPDIR="/scratch/$USER/tmp" && mkdir -p "${TMPDIR}" && export TMPDIR

Speed up conda

Mamba is a faster reimplementation of the conda package manager in C++. Install it in your base environment with:

conda install mamba -n base -c conda-forge

Then replace conda by mamba wherever you previously used the conda command.

Maintenance

The cache of installed packages will consume a lot of space over time. The default location set for the package cache resides on NetScratch; the terms of use for this storage area require you to clean your cache regularly.

Remove index cache, lock files, unused cache packages, and tarballs

conda clean --all

Update conda without any active environment

conda update conda

Convenient alias

Above commands can be conveniently pulled together and defined as an alias for easier manual updates of the base environment and subsequent cleanup:

alias conda_update='conda update conda --yes && conda update -n base --update-all --yes && conda clean --all --yes'

Backup

Regular backups are recommended to be able to reproduce an environment used at a certain point in time. Before installing or updating an environment, a backup should always be created in order to be able to revert the changes.

It is not necessary to back up the environments themselves; it is sufficient to back up the files of environment exports to recreate them exactly.

For a simple backup of all environments the following script can be used:

#!/bin/bash

# Export every environment listed by name in 'conda env list' to a
# time-stamped definition file in BACKUP_DIR.
# Exit code: 0 if all exports succeeded, 1 if the backup directory could not
# be created or at least one export failed.

BACKUP_DIR="${HOME}/conda_env_backup"
MY_TIME_FORMAT='%Y-%m-%d_%H-%M-%S'

NOW=$(date "+${MY_TIME_FORMAT}")
# -p: succeed if the directory already exists; abort if it cannot be created
mkdir -p "${BACKUP_DIR}" || exit 1

FAILED=0
# Take the first column of every line starting with a word character and
# read it line by line, so names are not subject to word splitting or globbing
while IFS= read -r env; do
    BACKUP_FILE="${BACKUP_DIR}/${env}_${NOW}.yml"
    echo "Exporting ${env} to ${BACKUP_FILE}"
    # stdin is redirected so conda cannot consume the list of environment names
    if ! conda env export --name "${env}" >"${BACKUP_FILE}" </dev/null; then
        # Do not keep an empty or partial definition file as a "backup"
        echo "Exporting ${env} failed, removing incomplete ${BACKUP_FILE}" >&2
        rm -f "${BACKUP_FILE}"
        FAILED=1
    fi
done < <(conda env list | grep '^\w' | cut -d' ' -f1)

exit "${FAILED}"

Programming/Languages/Conda (last edited 2024-04-18 08:22:58 by stroth)