Integration with an Experiment Engine
For now we can deploy a reproducible distributed environment, connect to the nodes, and run commands. What we would like to do now is to automate the execution of the commands in an experiment script that we can easily rerun.
Fortunately, NixOS-Compose provides an integration with Execo, an experiment engine for Grid'5000. Execo is a Python library that abstracts the usual operations on Grid'5000 (submitting jobs, deploying, executing commands, etc.).
Let's see how to use Execo and NixOS-Compose to run reproducible experiments.
Starting Point
The snippet below represents a good starting point for an Execo script with NixOS-Compose.
# script.py
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc
from nixos_compose.g5k import key_sleep_script
import os
from execo import Remote
from execo_engine import Engine, logger, ParamSweeper, sweep
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start
class NXCEngine(Engine):
    """Minimal Execo engine showing the NixOS-Compose integration.

    Reserves nodes on Grid'5000 through OAR, deploys the requested
    flavour with NixOS-Compose, then runs a couple of demo commands
    on the node holding the `server` role.
    """

    def __init__(self):
        super(NXCEngine, self).__init__()
        argp = self.args_parser
        argp.add_argument('--nxc_build_file', help='Path to the NXC build file')
        argp.add_argument('--flavour', help='Flavour to deploy')
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.nb_nodes = 2
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        """Reserve the nodes and deploy the composition on them."""
        # --- Reservation ----
        duration = 15 * 60  # seconds
        resources = f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}"
        if self.args.flavour == "g5k-image":
            # A full-image deployment needs an OAR job of type `deploy`.
            submission = OarSubmission(resources, duration, job_type=["deploy"], project="lab-2025-compas-nxc")
        else:
            submission = OarSubmission(resources, duration, job_type=[], project="lab-2025-compas-nxc", command=f"{key_sleep_script} {duration}")
        self.oar_job_id, site = oarsub([(submission, self.site)])[0]
        # Wait for the job to start, otherwise we might get a timeout
        # in the `get_oar_job_nodes_nxc` call below.
        wait_oar_job_start(self.oar_job_id, site)
        # --- How many nodes per role ---
        roles_quantities = {"server": ["server"], "node": ["node"]}
        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes, self.roles = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=self.args.flavour,
            compose_info_file=os.environ['HOME'] + "/.local/share/nix/root" + os.readlink(self.args.nxc_build_file),
            roles_quantities=roles_quantities)

    def run(self):
        """Write a greeting file on the server node and print it back."""
        my_command = "echo \"Hello from $(whoami) at $(hostname) ($(ip -4 addr | grep \"/20\" | awk '{print $2;}'))\" > /tmp/hello"
        Remote(my_command, self.roles["server"], connection_params={'user': 'root'}).run()
        my_command2 = "cat /tmp/hello"
        cat_remote = Remote(my_command2, self.roles["server"], connection_params={'user': 'root'})
        cat_remote.run()
        for proc in cat_remote.processes:
            print(proc.stdout)
        # --- Giving back the resources ---
        oardel([(self.oar_job_id, self.site)])
if __name__ == "__main__":
    # Execo's Engine.start() parses the CLI arguments, then calls
    # init() and run() in order.
    engine = NXCEngine()
    engine.start()
Make sure you are in an environment with NixOS-Compose available.
You can then run python3 script.py --help
.
The script takes two arguments:
-
nxc_build_file
which is the path to the result of `nxc build`. Most probably it will be under `build/composition::FLAVOUR.json`
-
and the
flavour
. On Grid'5000 it can be `g5k-nfs-store`, `g5k-ramdisk`, or `g5k-image`
Let's try to run the script for the `g5k-nfs-store` flavour:
python3 script.py --nxc_build_file $(pwd)/build/composition::g5k-nfs-store --flavour g5k-nfs-store
You should see the logs from Execo telling you that it is doing the reservation to OAR, and starting deploying.
When the deployment is finished, you can see that the commands that we ran in the run
function of script.py
are being executed.
Run a real experiment
The code above is just to show the basics of Execo.
In this section, we will run a more realistic experiment calling the start_ior
command that we packaged in a previous section.
# script.py
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc
from nixos_compose.g5k import key_sleep_script
import os
import time
from execo import Remote
from execo_engine import Engine, logger, ParamSweeper, sweep
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start
class NXCEngine(Engine):
    """Execo engine running an IOR benchmark on a NixOS-Compose deployment.

    Reserves `--nb_nodes` nodes on Grid'5000, deploys the requested
    flavour (one `server` role, the remaining nodes as `node1..nodeN`),
    runs the packaged `start_ior` command, and copies the JSON results
    to `--result_file`.
    """

    def __init__(self):
        super(NXCEngine, self).__init__()
        parser = self.args_parser
        parser.add_argument('--nxc_build_file', help='Path to the NXC build file')
        parser.add_argument('--flavour', help='Flavour to deploy')
        parser.add_argument('--nb_nodes', help='Number of nodes')
        parser.add_argument('--result_file', help='path to store the results')
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        """Reserve the nodes and deploy the composition on them.

        Raises:
            ValueError: if `--nb_nodes` is missing, not an integer, or
                smaller than 2 (one server plus at least one `node`).
        """
        # Validate explicitly instead of `assert`: asserts are stripped
        # under `python -O`, and int(None) would raise an opaque TypeError.
        if self.args.nb_nodes is None:
            raise ValueError("--nb_nodes is required")
        try:
            self.nb_nodes = int(self.args.nb_nodes)
        except ValueError:
            raise ValueError(f"--nb_nodes must be an integer, got {self.args.nb_nodes!r}") from None
        if self.nb_nodes < 2:
            raise ValueError("I need at least two nodes")
        # --- Reservation ----
        duration = 15 * 60  # seconds
        if self.args.flavour == "g5k-image":
            # A full-image deployment needs an OAR job of type `deploy`.
            self.oar_job_id, site = oarsub([(OarSubmission(f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}", duration, job_type=["deploy"], project="lab-2025-compas-nxc"), self.site)])[0]
        else:
            self.oar_job_id, site = oarsub([(OarSubmission(f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}", duration, job_type=[], project="lab-2025-compas-nxc", command=f"{key_sleep_script} {duration}"), self.site)])[0]
        # Wait for the job to start, otherwise we might get a timeout
        # in the `get_oar_job_nodes_nxc` call below.
        wait_oar_job_start(self.oar_job_id, site)
        # --- How many nodes per role ---
        # We want one server and all the other nodes are `node`
        roles_quantities = {"server": ["server"], "node": [f"node{i}" for i in range(1, self.nb_nodes)]}
        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes, self.roles = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=self.args.flavour,
            compose_info_file=os.environ['HOME'] + "/.local/share/nix/root" + os.readlink(self.args.nxc_build_file),
            roles_quantities=roles_quantities)

    def run(self):
        """Run IOR on the first `node`, fetch the results, free the job."""
        result_file = self.args.result_file
        # Give the deployed services a moment to settle before using them.
        time.sleep(10)
        remount_volume_remote = Remote("remount_glusterfs", self.roles["node"][0], connection_params={'user': 'root'})
        remount_volume_remote.run()
        run_ior_remote = Remote("start_ior", self.roles["node"][0], connection_params={'user': 'root'})
        run_ior_remote.run()
        # NOTE(review): `cp` executes on the remote node, so `result_file`
        # must be a path reachable from it (e.g. the NFS-mounted home) —
        # confirm this holds for the chosen flavour.
        get_file_remote = Remote(f"cp /tmp/results_ior.json {result_file}", self.roles["node"][0], connection_params={'user': 'root'})
        get_file_remote.run()
        # --- Giving back the resources ---
        oardel([(self.oar_job_id, self.site)])
if __name__ == "__main__":
    # Execo's Engine.start() parses the CLI arguments, then calls
    # init() and run() in order.
    engine = NXCEngine()
    engine.start()
The previous script can be run with:
python3 script.py --nxc_build_file $(pwd)/build/composition::g5k-nfs-store --flavour g5k-nfs-store --nb_nodes 2 --result_file $(pwd)/ior_results.json