Integration with an Experiment Engine

So far, we can deploy a reproducible distributed environment, connect to the nodes, and run commands. What we would like to do now is to automate the execution of these commands in an experiment script that we can easily rerun.

Fortunately, NixOS-Compose provides an integration with Execo, an experiment engine for Grid'5000. Execo is a Python library that abstracts the usual operations on Grid'5000 (submitting jobs, deploying, executing commands, etc.).

Let's see how to use Execo and NixOS-Compose to run reproducible experiments.

Starting Point

The snippet below represents a good starting point for an Execo script with NixOS-Compose.

# script.py
import execo_engine
from execo import Remote
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc

class NXCEngine(execo_engine.Engine):
    """Execo engine that reserves nodes with OAR, deploys a NixOS-Compose
    build on them, and runs two demo commands on the ``server`` role.

    Command-line arguments (added to the engine's argument parser):
        --nxc_build_file: path to the NXC build file.
        --flavour: flavour to deploy.
    """

    def __init__(self):
        super().__init__()
        parser = self.args_parser
        parser.add_argument('--nxc_build_file', help='Path to the NXC build file')
        parser.add_argument('--flavour', help='Flavour to deploy')
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.nb_nodes = 2
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        """Reserve the nodes, wait for the job to start, and deploy.

        Raises:
            RuntimeError: if the OAR submission did not return a job id.
        """
        flavour = self.args.flavour

        # --- Reservation ----
        # The "g5k-image" flavour needs a "deploy" job; the other flavours
        # use an "allow_classic_ssh" job.
        job_type = ["deploy"] if flavour == "g5k-image" else ["allow_classic_ssh"]
        submission = OarSubmission(
            f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}",
            15*60,  # presumably the walltime in seconds (15 minutes)
            job_type=job_type,
        )
        self.oar_job_id, site = oarsub([(submission, self.site)])[0]
        if self.oar_job_id is None:
            # oarsub reports a failed submission with a job id of None.
            raise RuntimeError(f"OAR submission failed on site {self.site!r}")
        # wait for the job to start, otherwise we might get a timeout in the `get_oar_job_nodes_nxc`
        wait_oar_job_start(self.oar_job_id, site)

        # --- How many nodes per role ---
        roles_quantities = {"server": ["server"], "node": ["node"]}

        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=flavour,
            compose_info_file=self.args.nxc_build_file,
            roles_quantities=roles_quantities)

    def run(self):
        """Write a greeting file on the server, print it back, and release the job."""
        root = {'user': 'root'}
        try:
            my_command = "echo \"Hello from $(whoami) at $(hostname) ($(ip -4 addr | grep \"/20\" | awk '{print $2;}'))\" > /tmp/hello"
            hello_remote = Remote(my_command, self.nodes["server"], connection_params=root)
            hello_remote.run()

            my_command2 = "cat /tmp/hello"
            cat_remote = Remote(my_command2, self.nodes["server"], connection_params=root)
            cat_remote.run()
            for process in cat_remote.processes:
                print(process.stdout)
        finally:
            # --- Giving back the resources ---
            # Done in `finally` so the reservation is released even when a
            # command above raises.
            oardel([(self.oar_job_id, self.site)])


if __name__ == "__main__":
    # Build the engine, then call its start() entry point.
    engine = NXCEngine()
    engine.start()

Make sure you are in an environment with NixOS-Compose available.

You can then run python3 script.py --help.

The script takes two arguments:

  • nxc_build_file which is the path to the result of nxc build. Most probably it will be under build/composition::FLAVOUR.json

  • and the flavour. On Grid'5000 it can be g5k-nfs-store, g5k-ramdisk, or g5k-image

Let's try to run the script for the g5k-image flavour (make sure to have run nxc build -f g5k-image before):

python3 script.py --nxc_build_file $(pwd)/build/composition::g5k-image --flavour g5k-image

You should see the logs from Execo telling you that it is making the reservation with OAR and starting the deployment. When the deployment is finished, you can see the commands from the run function of script.py being executed.

Run a real experiment

The code above is just to show the basics of Execo. In this section, we will run a more realistic experiment calling the start_ior command that we packaged in a previous section.

# script.py
from nixos_compose.nxc_execo import get_oar_job_nodes_nxc

from execo import Remote
from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start

from execo_engine import Engine


class NXCEngine(Engine):
    """Execo engine that reserves nodes with OAR, deploys a NixOS-Compose
    build on them, runs ``start_ior``, and copies the result file.

    Command-line arguments (added to the engine's argument parser):
        --nxc_build_file: path to the NXC build file.
        --flavour: flavour to deploy.
        --nb_nodes: total number of nodes to reserve (default 2, minimum 2).
        --result_dir: directory in which the result file is stored.
    """

    def __init__(self):
        super().__init__()
        parser = self.args_parser
        parser.add_argument('--nxc_build_file', help='Path to the NXC build file')
        parser.add_argument('--flavour', help='Flavour to deploy')
        parser.add_argument('--nb_nodes', type=int, default=2, help='Number of nodes')
        # Required: without it the result path would start with "None/".
        parser.add_argument('--result_dir', required=True, help='path to store the results')
        self.nodes = {}
        self.oar_job_id = -1
        # --- Where and how many nodes ----
        self.nb_nodes = 2
        self.site = "grenoble"
        self.cluster = "dahu"

    def init(self):
        """Validate the node count, reserve the nodes, and deploy.

        Raises:
            ValueError: if fewer than two nodes were requested.
            RuntimeError: if the OAR submission did not return a job id.
        """
        # We might have more than two nodes
        self.nb_nodes = int(self.args.nb_nodes)
        if self.nb_nodes < 2:
            # One node is the server, so at least one more is needed.
            raise ValueError("I need at least two nodes")

        flavour = self.args.flavour

        # --- Reservation ----
        # The "g5k-image" flavour needs a "deploy" job; the other flavours
        # use an "allow_classic_ssh" job.
        job_type = ["deploy"] if flavour == "g5k-image" else ["allow_classic_ssh"]
        submission = OarSubmission(
            f"{{cluster='{self.cluster}'}}/nodes={self.nb_nodes}",
            15*60,  # presumably the walltime in seconds (15 minutes)
            job_type=job_type,
        )
        self.oar_job_id, site = oarsub([(submission, self.site)])[0]
        if self.oar_job_id is None:
            # oarsub reports a failed submission with a job id of None.
            raise RuntimeError(f"OAR submission failed on site {self.site!r}")
        # wait for the job to start, otherwise we might get a timeout in the `get_oar_job_nodes_nxc`
        wait_oar_job_start(self.oar_job_id, site)

        # --- How many nodes per role ---
        # We want one server and all the other nodes are `node`
        roles_quantities = {
            "server": ["server"],
            "node": [f"node{i}" for i in range(1, self.nb_nodes)],
        }

        # --- Deploy and populate the dict `self.nodes` accordingly ---
        self.nodes = get_oar_job_nodes_nxc(
            self.oar_job_id,
            site,
            flavour_name=flavour,
            compose_info_file=self.args.nxc_build_file,
            roles_quantities=roles_quantities)

    def run(self):
        """Run ``start_ior`` on the first node, copy the results, and release the job."""
        root = {'user': 'root'}
        try:
            result_dir = self.args.result_dir
            flavour = self.args.flavour
            result_file = f"{result_dir}/results_ior_{self.nb_nodes}_nodes_{flavour}_flavour_{self.oar_job_id}"

            run_ior_remote = Remote("start_ior", self.nodes["node"][0], connection_params=root)
            run_ior_remote.run()

            # Presumably start_ior leaves its output in /srv/shared/results_ior.json;
            # the copy is executed on the server role.
            get_file_remote = Remote(
                f"cp /srv/shared/results_ior.json {result_file}",
                self.nodes["server"],
                connection_params=root,
            )
            get_file_remote.run()
        finally:
            # Release the reservation even when a command above raises.
            oardel([(self.oar_job_id, self.site)])

if __name__ == "__main__":
    # Build the engine, then call its start() entry point.
    engine = NXCEngine()
    engine.start()