From 9c059d81ba4f4016f8c400b403f8c5edbdb160c2 Mon Sep 17 00:00:00 2001 From: Julien Lepiller Date: Fri, 21 Feb 2020 23:41:33 +0100 Subject: [PATCH] nix: Count build and download jobs separately. This allows to run downloads (that take bandwith) and builds (that take CPU time) independently from one another. * nix/nix-daemon/guix-daemon.cc: Add a max-download-jobs option. * nix/libstore/globals.hh: Add a maxDownloadJobs setting. * nix/libstore/globals.cc: Add a default value to it. * nix/libstore/build.cc: Manage build and download jobs separately. --- nix/libstore/build.cc | 75 +++++++++++++++++++++++++++-------- nix/libstore/globals.cc | 2 + nix/libstore/globals.hh | 3 ++ nix/nix-daemon/guix-daemon.cc | 5 +++ 4 files changed, 69 insertions(+), 16 deletions(-) diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc index 17e92c68a7..d08904c4ee 100644 --- a/nix/libstore/build.cc +++ b/nix/libstore/build.cc @@ -204,6 +204,7 @@ struct Child set fds; bool respectTimeouts; bool inBuildSlot; + bool inDownloadSlot; time_t lastOutput; /* time we last got output on stdout/stderr */ time_t timeStarted; }; @@ -231,10 +232,14 @@ private: /* Child processes currently running. */ Children children; - /* Number of build slots occupied. This includes local builds and - substitutions but not remote builds via the build hook. */ + /* Number of build slots occupied. This includes local builds but not + substitution, built-ins and remote builds via the build hook. */ unsigned int nrLocalBuilds; + /* Number of download slots occupied. This includes substitution and + built-ins. */ + unsigned int nrDownloads; + /* Maps used to prevent multiple instantiations of a goal for the same derivation / path. */ WeakGoalMap derivationGoals; @@ -275,15 +280,18 @@ public: /* Wake up a goal (i.e., there is something for it to do). */ void wakeUp(GoalPtr goal); - /* Return the number of local build and substitution processes - currently running (but not remote builds via the build - hook). */ + /* Return the number of local build processes currently running (but + not remote builds via the build hook). */ unsigned int getNrLocalBuilds(); + /* Return the number of downloads currently running. */ + unsigned int getNrDownloads(); + /* Registers a running child process. `inBuildSlot' means that the process counts towards the jobs limit. */ void childStarted(GoalPtr goal, pid_t pid, - const set & fds, bool inBuildSlot, bool respectTimeouts); + const set & fds, bool inBuildSlot, bool inDownloadSlot, + bool respectTimeouts); /* Unregisters a running child process. `wakeSleepers' should be false if there is no sense in waking up goals that are sleeping @@ -295,6 +303,10 @@ public: might be right away). */ void waitForBuildSlot(GoalPtr goal); + /* Put `goal' to sleep until a download slot becomes available (which + might be right away). */ + void waitForDownloadSlot(GoalPtr goal); + /* Wait for any goal to finish. Pretty indiscriminate way to wait for some resource that some other goal is holding. */ void waitForAnyGoal(GoalPtr goal); @@ -1359,12 +1371,21 @@ void DerivationGoal::tryToBuild() derivation prefers to be done locally, do it even if maxBuildJobs is 0. */ unsigned int curBuilds = worker.getNrLocalBuilds(); - if (curBuilds >= settings.maxBuildJobs && !(buildLocally && curBuilds == 0)) { + if (curBuilds >= settings.maxBuildJobs && !(buildLocally && curBuilds == 0) && + !fixedOutput) { worker.waitForBuildSlot(shared_from_this()); outputLocks.unlock(); return; } + unsigned int curDownloads = worker.getNrDownloads(); + if (curDownloads >= (settings.maxDownloadJobs==0?1:settings.maxDownloadJobs) && + fixedOutput) { + worker.waitForDownloadSlot(shared_from_this()); + outputLocks.unlock(); + return; + } + try { /* Okay, we have to build. */ @@ -1648,7 +1669,7 @@ HookReply DerivationGoal::tryBuildHook() set fds; fds.insert(hook->fromHook.readSide); fds.insert(hook->builderOut.readSide); - worker.childStarted(shared_from_this(), hook->pid, fds, false, true); + worker.childStarted(shared_from_this(), hook->pid, fds, false, false, true); if (settings.printBuildTrace) printMsg(lvlError, format("@ build-started %1% - %2% %3% %4%") @@ -2030,7 +2051,7 @@ void DerivationGoal::startBuilder() pid.setSeparatePG(true); builderOut.writeSide.close(); worker.childStarted(shared_from_this(), pid, - singleton >(builderOut.readSide), true, true); + singleton >(builderOut.readSide), !fixedOutput, fixedOutput, true); /* Check if setting up the build environment failed. */ string msg = readLine(builderOut.readSide); @@ -3034,11 +3055,11 @@ void SubstitutionGoal::tryToRun() trace("trying to run"); /* Make sure that we are allowed to start a build. Note that even - is maxBuildJobs == 0 (no local builds allowed), we still allow - a substituter to run. This is because substitutions cannot be - distributed to another machine via the build hook. */ - if (worker.getNrLocalBuilds() >= (settings.maxBuildJobs == 0 ? 1 : settings.maxBuildJobs)) { - worker.waitForBuildSlot(shared_from_this()); + is maxDownloadJobs == 0 (no downloads allowed), we still allow + a substituter to run. This is because we always need to download, so + not allowing is meaningless. */ + if (worker.getNrDownloads() >= (settings.maxDownloadJobs == 0 ? 1 : settings.maxDownloadJobs)) { + worker.waitForDownloadSlot(shared_from_this()); return; } @@ -3107,7 +3128,7 @@ void SubstitutionGoal::tryToRun() outPipe.writeSide.close(); logPipe.writeSide.close(); worker.childStarted(shared_from_this(), - pid, singleton >(logPipe.readSide), true, true); + pid, singleton >(logPipe.readSide), false, true, true); state = &SubstitutionGoal::finished; @@ -3242,6 +3263,7 @@ Worker::Worker(LocalStore & store) if (working) abort(); working = true; nrLocalBuilds = 0; + nrDownloads = 0; lastWokenUp = 0; permanentFailure = false; timedOut = false; @@ -3334,9 +3356,14 @@ unsigned Worker::getNrLocalBuilds() return nrLocalBuilds; } +unsigned Worker::getNrDownloads() +{ + return nrDownloads; +} + void Worker::childStarted(GoalPtr goal, - pid_t pid, const set & fds, bool inBuildSlot, + pid_t pid, const set & fds, bool inBuildSlot, bool inDownloadSlot, bool respectTimeouts) { Child child; @@ -3344,9 +3371,11 @@ void Worker::childStarted(GoalPtr goal, child.fds = fds; child.timeStarted = child.lastOutput = time(0); child.inBuildSlot = inBuildSlot; + child.inDownloadSlot = inDownloadSlot; child.respectTimeouts = respectTimeouts; children[pid] = child; if (inBuildSlot) nrLocalBuilds++; + if (inDownloadSlot) nrDownloads++; } @@ -3362,6 +3391,11 @@ void Worker::childTerminated(pid_t pid, bool wakeSleepers) nrLocalBuilds--; } + if (i->second.inDownloadSlot) { + assert(nrDownloads > 0); + nrDownloads--; + } + children.erase(pid); if (wakeSleepers) { @@ -3386,6 +3420,15 @@ void Worker::waitForBuildSlot(GoalPtr goal) addToWeakGoals(wantingToBuild, goal); } +void Worker::waitForDownloadSlot(GoalPtr goal) +{ + debug("wait for download slot"); + if (getNrDownloads() < (settings.maxDownloadJobs==0?1:settings.maxDownloadJobs)) + wakeUp(goal); /* we can do it right away */ + else + addToWeakGoals(wantingToBuild, goal); +} + void Worker::waitForAnyGoal(GoalPtr goal) { diff --git a/nix/libstore/globals.cc b/nix/libstore/globals.cc index 0cc001fbe4..416033718d 100644 --- a/nix/libstore/globals.cc +++ b/nix/libstore/globals.cc @@ -29,6 +29,7 @@ Settings::Settings() tryFallback = false; buildVerbosity = lvlError; maxBuildJobs = 1; + maxDownloadJobs = 1; buildCores = 1; readOnlyMode = false; thisSystem = SYSTEM; @@ -118,6 +119,7 @@ void Settings::update() { _get(tryFallback, "build-fallback"); _get(maxBuildJobs, "build-max-jobs"); + _get(maxDownloadJobs, "download-max-jobs"); _get(buildCores, "build-cores"); _get(thisSystem, "system"); _get(multiplexedBuildOutput, "multiplexed-build-output"); diff --git a/nix/libstore/globals.hh b/nix/libstore/globals.hh index 27616a2283..c033f8ed56 100644 --- a/nix/libstore/globals.hh +++ b/nix/libstore/globals.hh @@ -90,6 +90,9 @@ struct Settings { /* Maximum number of parallel build jobs. 0 means unlimited. */ unsigned int maxBuildJobs; + /* Maximum number of parallel download jobs. 0 means 1. */ + unsigned int maxDownloadJobs; + /* Number of CPU cores to utilize in parallel within a build, i.e. by passing this number to Make via '-j'. 0 means that the number of actual CPU cores on the local host ought to be diff --git a/nix/nix-daemon/guix-daemon.cc b/nix/nix-daemon/guix-daemon.cc index cd949aca67..6997c07f5a 100644 --- a/nix/nix-daemon/guix-daemon.cc +++ b/nix/nix-daemon/guix-daemon.cc @@ -99,6 +99,8 @@ static const struct argp_option options[] = }, { "max-jobs", 'M', n_("N"), 0, n_("allow at most N build jobs") }, + { "max-downloads", 'D', n_("N"), 0, + n_("allow at most N download jobs") }, { "timeout", GUIX_OPT_TIMEOUT, n_("SECONDS"), 0, n_("mark builds as failed after SECONDS of activity") }, { "max-silent-time", GUIX_OPT_MAX_SILENT_TIME, n_("SECONDS"), 0, @@ -276,6 +278,9 @@ parse_opt (int key, char *arg, struct argp_state *state) case 'M': settings.set ("build-max-jobs", arg); break; + case 'D': + settings.set ("download-max-jobs", arg); + break; case GUIX_OPT_TIMEOUT: settings.set ("build-timeout", arg); break; -- 2.24.0