From 3dee3844b6da1d70b883391f5b2d23e331d6a5ad Mon Sep 17 00:00:00 2001 Message-Id: <3dee3844b6da1d70b883391f5b2d23e331d6a5ad.1637534792.git.julien@lepiller.eu> From: Julien Lepiller Date: Sun, 21 Nov 2021 22:35:09 +0100 Subject: [PATCH 1/2] daemon: Count build and download jobs separately. This allows us to run downloads (that take bandwith) and builds (that take CPU time) independently from one another. * nix/nix-daemon/guix-daemon.cc: Add a max-download-jobs option. * nix/nix-daemon/nix-daemon.cc: Add a max-download-jobs option. * nix/libstore/globals.hh (Settings): Add a maxDownloadJobs setting. * nix/libstore/globals.cc (Settings::Settings): Add a default value to it. (Settings::update): Add it. * nix/libstore/build.cc (Slot): New enum type. (Child): Use `Slot` enum to record the job slot of the task, if any. (Goal): Separate build and download slot members and methods. (DerivationGoal::tryToBuild): Check number of available slots in build or download tasks. (DerivationGoal::tryBuildHook, SubstitutionGoal::tryToRun): Check number of download slots instead of build slots. (Worker::Worker, Worker::childStarted, Worker::childTerminated): Manage build and download slots separately. * doc/guix.texi (Invoking guix-daemon): Document `--max-downloads`. --- doc/guix.texi | 6 ++ nix/libstore/build.cc | 105 ++++++++++++++++++++++++++-------- nix/libstore/globals.cc | 2 + nix/libstore/globals.hh | 3 + nix/nix-daemon/guix-daemon.cc | 5 ++ nix/nix-daemon/nix-daemon.cc | 2 +- 6 files changed, 97 insertions(+), 26 deletions(-) diff --git a/doc/guix.texi b/doc/guix.texi index 59ceb4477a..b8de53c53b 100644 --- a/doc/guix.texi +++ b/doc/guix.texi @@ -1619,6 +1619,12 @@ Invoking guix-daemon locally; instead, the daemon will offload builds (@pxref{Daemon Offload Setup}), or simply fail. +@item --max-downloads=@var{n} +@itemx -D @var{n} +Allow at most @var{n} download jobs in parallel. The default value is +@code{1}. Since some downloads always need to happen, setting it to +@code{0} does not make any sense and will be ignored. + @item --max-silent-time=@var{seconds} When the build or substitution process remains silent for more than @var{seconds}, terminate it and report a build failure. diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc index 5697ae5a43..2fbbd81af2 100644 --- a/nix/libstore/build.cc +++ b/nix/libstore/build.cc @@ -196,6 +196,9 @@ bool CompareGoalPtrs::operator() (const GoalPtr & a, const GoalPtr & b) { } +/* A list of possible slots */ +typedef enum { slotNone, slotBuild, slotDownload } Slot; + /* A mapping used to remember for each child process to what goal it belongs, and file descriptors for receiving log data and output path creation commands. */ @@ -204,7 +207,7 @@ struct Child WeakGoalPtr goal; set fds; bool respectTimeouts; - bool inBuildSlot; + Slot inSlot; time_t lastOutput; /* time we last got output on stdout/stderr */ time_t timeStarted; }; @@ -232,10 +235,15 @@ private: /* Child processes currently running. */ Children children; - /* Number of build slots occupied. This includes local builds and - substitutions but not remote builds via the build hook. */ + /* Number of build slots occupied. This includes local builds but not + fixed-output derivations, substitution, built-ins and remote builds + via the build hook. */ unsigned int nrLocalBuilds; + /* Number of download slots occupied. This includes fixed-output + derivations, substitution and built-ins. */ + unsigned int nrDownloads; + /* Maps used to prevent multiple instantiations of a goal for the same derivation / path. */ WeakGoalMap derivationGoals; @@ -277,15 +285,18 @@ public: /* Wake up a goal (i.e., there is something for it to do). */ void wakeUp(GoalPtr goal); - /* Return the number of local build and substitution processes - currently running (but not remote builds via the build - hook). */ + /* Return the number of local build processes currently running (but + not remote builds via the build hook). */ unsigned int getNrLocalBuilds(); - /* Registers a running child process. `inBuildSlot' means that - the process counts towards the jobs limit. */ + /* Return the number of downloads currently running. */ + unsigned int getNrDownloads(); + + /* Registers a running child process. `inSlot' indicates whether + the process counts towards the jobs limit, and which one. */ void childStarted(GoalPtr goal, pid_t pid, - const set & fds, bool inBuildSlot, bool respectTimeouts); + const set & fds, Slot inSlot, + bool respectTimeouts); /* Unregisters a running child process. `wakeSleepers' should be false if there is no sense in waking up goals that are sleeping @@ -297,6 +308,10 @@ public: might be right away). */ void waitForBuildSlot(GoalPtr goal); + /* Put `goal' to sleep until a download slot becomes available (which + might be right away). */ + void waitForDownloadSlot(GoalPtr goal); + /* Wait for any goal to finish. Pretty indiscriminate way to wait for some resource that some other goal is holding. */ void waitForAnyGoal(GoalPtr goal); @@ -1249,16 +1264,28 @@ void DerivationGoal::tryToBuild() } } - /* Make sure that we are allowed to start a build. If this - derivation prefers to be done locally, do it even if - maxBuildJobs is 0. */ + /* Make sure that, if this is a build (not a download), we are allowed to + start a build. If this derivation prefers to be done locally, do it + even if maxBuildJobs is 0. */ unsigned int curBuilds = worker.getNrLocalBuilds(); - if (curBuilds >= settings.maxBuildJobs && !(buildLocally && curBuilds == 0)) { + if (curBuilds >= settings.maxBuildJobs && !(buildLocally && curBuilds == 0) && + !fixedOutput) { worker.waitForBuildSlot(shared_from_this()); outputLocks.unlock(); return; } + /* Make sure that, if this is a download (not a build), we are allowed to + start a download. Even if maxDownloadJobs is 0, we always allow + downloads, so it is set to 1 */ + unsigned int curDownloads = worker.getNrDownloads(); + unsigned int maxDownloads = settings.maxDownloadJobs == 0 ? 1 : settings.maxDownloadJobs; + if (curDownloads >= maxDownloads && fixedOutput) { + worker.waitForDownloadSlot(shared_from_this()); + outputLocks.unlock(); + return; + } + try { /* Okay, we have to build. */ @@ -1546,7 +1573,7 @@ HookReply DerivationGoal::tryBuildHook() set fds; fds.insert(hook->fromAgent.readSide); fds.insert(hook->builderOut.readSide); - worker.childStarted(shared_from_this(), hook->pid, fds, false, true); + worker.childStarted(shared_from_this(), hook->pid, fds, slotNone, true); if (settings.printBuildTrace) printMsg(lvlError, format("@ build-started %1% - %2% %3% %4%") @@ -1940,7 +1967,8 @@ void DerivationGoal::startBuilder() pid.setSeparatePG(true); builderOut.writeSide.close(); worker.childStarted(shared_from_this(), pid, - singleton >(builderOut.readSide), true, true); + singleton >(builderOut.readSide), + fixedOutput? slotDownload: slotBuild, true); /* Check if setting up the build environment failed. */ string msg = readLine(builderOut.readSide); @@ -2979,11 +3007,11 @@ void SubstitutionGoal::tryToRun() trace("trying to run"); /* Make sure that we are allowed to start a build. Note that even - is maxBuildJobs == 0 (no local builds allowed), we still allow - a substituter to run. This is because substitutions cannot be - distributed to another machine via the build hook. */ - if (worker.getNrLocalBuilds() >= (settings.maxBuildJobs == 0 ? 1 : settings.maxBuildJobs)) { - worker.waitForBuildSlot(shared_from_this()); + is maxDownloadJobs == 0 (no downloads allowed), we still allow + a substituter to run. This is because we always need to download, so + not allowing is meaningless. */ + if (worker.getNrDownloads() >= (settings.maxDownloadJobs == 0 ? 1 : settings.maxDownloadJobs)) { + worker.waitForDownloadSlot(shared_from_this()); return; } @@ -3042,7 +3070,8 @@ void SubstitutionGoal::tryToRun() set fds; fds.insert(substituter->fromAgent.readSide); fds.insert(substituter->builderOut.readSide); - worker.childStarted(shared_from_this(), substituter->pid, fds, true, true); + worker.childStarted(shared_from_this(), substituter->pid, fds, + slotDownload, true); state = &SubstitutionGoal::finished; @@ -3206,6 +3235,7 @@ Worker::Worker(LocalStore & store) if (working) abort(); working = true; nrLocalBuilds = 0; + nrDownloads = 0; lastWokenUp = 0; permanentFailure = false; timedOut = false; @@ -3299,18 +3329,25 @@ unsigned Worker::getNrLocalBuilds() } +unsigned Worker::getNrDownloads() +{ + return nrDownloads; +} + + void Worker::childStarted(GoalPtr goal, - pid_t pid, const set & fds, bool inBuildSlot, + pid_t pid, const set & fds, Slot inSlot, bool respectTimeouts) { Child child; child.goal = goal; child.fds = fds; child.timeStarted = child.lastOutput = time(0); - child.inBuildSlot = inBuildSlot; + child.inSlot = inSlot; child.respectTimeouts = respectTimeouts; children[pid] = child; - if (inBuildSlot) nrLocalBuilds++; + if (inSlot == slotBuild) nrLocalBuilds++; + if (inSlot == slotDownload) nrDownloads++; } @@ -3321,11 +3358,16 @@ void Worker::childTerminated(pid_t pid, bool wakeSleepers) Children::iterator i = children.find(pid); assert(i != children.end()); - if (i->second.inBuildSlot) { + if (i->second.inSlot == slotBuild) { assert(nrLocalBuilds > 0); nrLocalBuilds--; } + if (i->second.inSlot == slotDownload) { + assert(nrDownloads > 0); + nrDownloads--; + } + children.erase(pid); if (wakeSleepers) { @@ -3351,6 +3393,19 @@ void Worker::waitForBuildSlot(GoalPtr goal) } +void Worker::waitForDownloadSlot(GoalPtr goal) +{ + debug("wait for download slot"); + /* Disabling all downloads does not make any sense, so we make sure we + can download with at least one job. */ + unsigned int maxDownloads = settings.maxDownloadJobs == 0 ? 1 : settings.maxDownloadJobs; + if (getNrDownloads() < maxDownloads) + wakeUp(goal); /* we can do it right away */ + else + addToWeakGoals(wantingToBuild, goal); +} + + void Worker::waitForAnyGoal(GoalPtr goal) { debug("wait for any goal"); diff --git a/nix/libstore/globals.cc b/nix/libstore/globals.cc index 0cc001fbe4..416033718d 100644 --- a/nix/libstore/globals.cc +++ b/nix/libstore/globals.cc @@ -29,6 +29,7 @@ Settings::Settings() tryFallback = false; buildVerbosity = lvlError; maxBuildJobs = 1; + maxDownloadJobs = 1; buildCores = 1; readOnlyMode = false; thisSystem = SYSTEM; @@ -118,6 +119,7 @@ void Settings::update() { _get(tryFallback, "build-fallback"); _get(maxBuildJobs, "build-max-jobs"); + _get(maxDownloadJobs, "download-max-jobs"); _get(buildCores, "build-cores"); _get(thisSystem, "system"); _get(multiplexedBuildOutput, "multiplexed-build-output"); diff --git a/nix/libstore/globals.hh b/nix/libstore/globals.hh index 27616a2283..c033f8ed56 100644 --- a/nix/libstore/globals.hh +++ b/nix/libstore/globals.hh @@ -90,6 +90,9 @@ struct Settings { /* Maximum number of parallel build jobs. 0 means unlimited. */ unsigned int maxBuildJobs; + /* Maximum number of parallel download jobs. 0 means 1. */ + unsigned int maxDownloadJobs; + /* Number of CPU cores to utilize in parallel within a build, i.e. by passing this number to Make via '-j'. 0 means that the number of actual CPU cores on the local host ought to be diff --git a/nix/nix-daemon/guix-daemon.cc b/nix/nix-daemon/guix-daemon.cc index 36a06a3fae..f1b60b0050 100644 --- a/nix/nix-daemon/guix-daemon.cc +++ b/nix/nix-daemon/guix-daemon.cc @@ -100,6 +100,8 @@ static const struct argp_option options[] = }, { "max-jobs", 'M', n_("N"), 0, n_("allow at most N build jobs") }, + { "max-downloads", 'D', n_("N"), 0, + n_("allow at most N download jobs") }, { "timeout", GUIX_OPT_TIMEOUT, n_("SECONDS"), 0, n_("mark builds as failed after SECONDS of activity") }, { "max-silent-time", GUIX_OPT_MAX_SILENT_TIME, n_("SECONDS"), 0, @@ -285,6 +287,9 @@ parse_opt (int key, char *arg, struct argp_state *state) case 'M': settings.set ("build-max-jobs", arg); break; + case 'D': + settings.set ("download-max-jobs", arg); + break; case GUIX_OPT_TIMEOUT: settings.set ("build-timeout", arg); break; diff --git a/nix/nix-daemon/nix-daemon.cc b/nix/nix-daemon/nix-daemon.cc index 497de11a04..4b2f8bcf22 100644 --- a/nix/nix-daemon/nix-daemon.cc +++ b/nix/nix-daemon/nix-daemon.cc @@ -610,7 +610,7 @@ static void performOp(bool trusted, unsigned int clientVersion, string value = readString(from); if (name == "build-timeout" || name == "build-max-silent-time" || name == "build-max-jobs" || name == "build-cores" - || name == "build-repeat" + || name == "build-repeat" || name == "download-max-jobs" || name == "multiplexed-build-output") settings.set(name, value); else if (name == "user-name" -- 2.33.1