Integration with an Experiment Engine
For now we can deploy a reproducible distributed environment, connect to the nodes, and run commands. What we would like to do now is to automate the execution of the commands in an experiment script that we can easily rerun.
Fortunately, NixOS-Compose provides an integration with Execo, an experiment engine for Grid'5000. Execo is a Python library that abstracts the usual operations on Grid'5000 (submitting jobs, deploying, executing commands, etc.).
Let's see how to use Execo and NixOS-Compose to run reproducible experiments.
Starting Point
The snippet below represents a good starting point for an Execo script with NixOS-Compose.
# script.py
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc
from execo import Remote
from execo_engine import Engine
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start


class NXCEngine(Engine):
    """Minimal Execo engine: reserve Grid'5000 nodes, deploy a
    NixOS-Compose environment, and run a couple of commands on it."""

    def __init__(self):
        super().__init__()
        parser = self.args_parser
        parser.add_argument('--nxc_build_file', help='Path to the NXC build file')
        parser.add_argument('--flavour', help='Flavour to deploy')
        # Mapping role name -> list of hosts, filled in by init().
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.nb_nodes = 2
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        # --- Reservation ----
        # The "g5k-image" flavour requires a deploy job; every other
        # flavour runs inside the standard environment over SSH.
        job_type = ["deploy"] if self.args.flavour == "g5k-image" else ["allow_classic_ssh"]
        self.oar_job_id, site = oarsub([(
            OarSubmission(f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}",
                          15 * 60,
                          job_type=job_type),
            self.site)])[0]
        # Wait for the job to start, otherwise we might get a timeout
        # in `get_oar_job_nodes_nxc`.
        wait_oar_job_start(self.oar_job_id, site)
        # --- How many nodes per role ---
        roles_quantities = {"server": ["server"], "node": ["node"]}
        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=self.args.flavour,
            compose_info_file=self.args.nxc_build_file,
            roles_quantities=roles_quantities)

    def run(self):
        # Write a greeting on the server node, then read it back and
        # print the output of every remote process.
        my_command = "echo \"Hello from $(whoami) at $(hostname) ($(ip -4 addr | grep \"/20\" | awk '{print $2;}'))\" > /tmp/hello"
        hello_remote = Remote(my_command, self.nodes["server"], connection_params={'user': 'root'})
        hello_remote.run()
        my_command2 = "cat /tmp/hello"
        cat_remote = Remote(my_command2, self.nodes["server"], connection_params={'user': 'root'})
        cat_remote.run()
        for process in cat_remote.processes:
            print(process.stdout)
        # --- Giving back the resources ---
        oardel([(self.oar_job_id, self.site)])


if __name__ == "__main__":
    NXCEngine().start()
Make sure you are in an environment with NixOS-Compose available.
You can then run `python3 script.py --help`.
The script takes two arguments:

- `nxc_build_file`, which is the path to the result of `nxc build`. Most probably it will be under `build/composition::FLAVOUR.json`
- and the `flavour`. On Grid'5000 it can be `g5k-nfs-store`, `g5k-ramdisk`, or `g5k-image`
Let's try to run the script for the `g5k-image` flavour (make sure to have run `nxc build -f g5k-image` before):
python3 script.py --nxc_build_file $(pwd)/build/composition::g5k-image --flavour g5k-image
You should see the logs from Execo telling you that it is doing the reservation to OAR, and starting deploying.
When the deployment is finished, you can see that the commands that we ran in the run
function of script.py
are being executed.
Run a real experiment
The code above is just to show the basics of Execo.
In this section, we will run a more realistic experiment calling the start_ior
command that we packaged in a previous section.
# script.py
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc
from execo import Remote
from execo_engine import Engine
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start


class NXCEngine(Engine):
    """Execo engine running the `start_ior` experiment: reserve nodes,
    deploy with NixOS-Compose, run IOR, and collect the result file."""

    def __init__(self):
        super().__init__()
        parser = self.args_parser
        parser.add_argument('--nxc_build_file', help='Path to the NXC build file')
        parser.add_argument('--flavour', help='Flavour to deploy')
        parser.add_argument('--nb_nodes', help='Number of nodes')
        parser.add_argument('--result_dir', help='path to store the results')
        # Mapping role name -> list of hosts, filled in by init().
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.nb_nodes = 2
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        # We might have more than two nodes
        self.nb_nodes = int(self.args.nb_nodes)
        # One node acts as the server, so we need at least two in total.
        assert self.nb_nodes >= 2, "I need at least two nodes"
        # --- Reservation ----
        # The "g5k-image" flavour requires a deploy job; every other
        # flavour runs inside the standard environment over SSH.
        job_type = ["deploy"] if self.args.flavour == "g5k-image" else ["allow_classic_ssh"]
        self.oar_job_id, site = oarsub([(
            OarSubmission(f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}",
                          15 * 60,
                          job_type=job_type),
            self.site)])[0]
        # Wait for the job to start, otherwise we might get a timeout
        # in `get_oar_job_nodes_nxc`.
        wait_oar_job_start(self.oar_job_id, site)
        # --- How many nodes per role ---
        # We want one server and all the other nodes are `node`
        roles_quantities = {"server": ["server"],
                            "node": [f"node{i}" for i in range(1, self.nb_nodes)]}
        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=self.args.flavour,
            compose_info_file=self.args.nxc_build_file,
            roles_quantities=roles_quantities)

    def run(self):
        # Build a result file name that identifies this exact run.
        result_dir = self.args.result_dir
        result_file = f"{result_dir}/results_ior_{self.nb_nodes}_nodes_{self.args.flavour}_flavour_{self.oar_job_id}"
        # Launch the packaged IOR benchmark from one of the `node` hosts.
        run_ior_remote = Remote("start_ior", self.nodes["node"][0], connection_params={'user': 'root'})
        run_ior_remote.run()
        # The benchmark writes its results on the shared NFS store;
        # copy them to the requested result directory from the server.
        get_file_remote = Remote(f"cp /srv/shared/results_ior.json {result_file}", self.nodes["server"], connection_params={'user': 'root'})
        get_file_remote.run()
        # --- Giving back the resources ---
        oardel([(self.oar_job_id, self.site)])


if __name__ == "__main__":
    NXCEngine().start()