From 91e7b79ef8e470aa944855818443dbe8378bc017 Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Tue, 11 Dec 2018 11:20:47 -0500 Subject: [PATCH 01/13] Add HUIT cost allocation tags Tag manager and workers with 'platform', 'product', and 'environment' tags. These are required by HUIT, as specified in https://confluence.huit.harvard.edu/display/CLA/Cloud+Resource+Tagging#CloudResourceTagging-4.5%22platform%22Tag, for cost allocation. --- jupyterhub_files/spawner.py | 3 +++ launch_cluster/instance_config.json | 4 +++- launch_cluster/launch.py | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index 0b561ca..7897c83 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -36,6 +36,9 @@ def get_local_ip_address(): {"Key": "Owner", "Value": SERVER_PARAMS["WORKER_SERVER_OWNER"]}, {"Key": "Creator", "Value": SERVER_PARAMS["WORKER_SERVER_OWNER"]}, {"Key": "Jupyter Cluster", "Value": SERVER_PARAMS["JUPYTER_CLUSTER"]}, + {"Key": "environment", "Value": SERVER_PARAMS["ENVIRONMENT"]}, + {"Key": "platform", "Value": SERVER_PARAMS["PLATFORM"]}, + {"Key": "product", "Value": SERVER_PARAMS["JUPYTER_CLUSTER"]} ] #User data script to be executed on every worker created by the spawner diff --git a/launch_cluster/instance_config.json b/launch_cluster/instance_config.json index a0116c9..bd88484 100644 --- a/launch_cluster/instance_config.json +++ b/launch_cluster/instance_config.json @@ -9,5 +9,7 @@ "REGION": "us-east-1", "WORKER_USERNAME": "ubuntu", "SERVER_OWNER": "", -"IGNORE_PERMISSIONS": "false" +"IGNORE_PERMISSIONS": "false", +"ENVIRONMENT": "production", +"PLATFORM": "linux" } diff --git a/launch_cluster/launch.py b/launch_cluster/launch.py index 125408b..1d08512 100755 --- a/launch_cluster/launch.py +++ b/launch_cluster/launch.py @@ -71,6 +71,9 @@ def launch_manager(config): {"Key": "Owner", "Value": config.server_owner}, {"Key": "Creator", "Value": config.server_owner}, {"Key": 
"Jupyter Cluster", "Value": config.cluster_name}, + {"Key": "platform", "Value": config.platform}, + {"Key": "environment", "Value": config.environment}, + {"Key": "product", "Value": config.cluster_name} ] instance.wait_until_exists() instance.wait_until_running() @@ -106,6 +109,8 @@ def launch_manager(config): "JUPYTER_MANAGER_IP": instance.public_ip_address, "USER_HOME_EBS_SIZE": config.user_home_ebs_size, "MANAGER_IP_ADDRESS": str(instance.private_ip_address), + "ENVIRONMENT": config.environment, + "PLATFORM": config.platform } # Setup the common files and settings between manager and worker. From e806933f203724fd805feed0da987dea97e8431a Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Wed, 24 Apr 2019 10:28:39 -0400 Subject: [PATCH 02/13] Hotfix for bug causing worker launches to fail. Problem Description: Initially, when the manager received a request to start a worker from a stopped state, it would first check to verify that the notebook server is not running on the worker before starting the notebook server. After looking at the logs, it was found that, in some fraction of the time, the verification call - a call to the is_noteboook_running function - would go to the wrong worker, a worker that has the notebook server already running. In this case, the manager would erroneously think that the notebook server was already up and running on the worker to be started. Consequently, the manager would fail to start the notebook server on the worker resuling in an unsuccessful launch. On the browser, the user would fail to see the expected notebook interface. They would just see a red progress bar with a message indicating that the spawn failed. To move forward, the user would then have to click on the Home button then the Start My Server button to retry the launch. Solution: This commit introduces a few changes to reduce the rate of unsuccessful launches, which at the moment is roughly 50% for a cluster with more than about 30 worker instances already running. 
Namely: - In the start_worker_server function, removing the verification call that checks whether the notebook server is running, if the worker is being started from a stopped state. The verification call is unnecessary given that the notebook server would not be running in the first place. Thus, the change here is to have the manager run the notebook server start command without doing the verification call. I note that this is a workaround to the problem described. Namely, instead of taking the risk to make the verification call to the wrong worker, the manager just doesn't make the call in the first place. - Adding a while loop to the remote_notebook_start function that retries the notebook server start command until either the notebook server starts on the worker or the maximum number of retries is reached. Initially, the notebook server start command would be called only once. By adding retries, the probability that the launch will be successful increases. - Adding more conditions to the logic in the is_notebook_running function to ensure that the function returns True only if the notebook server is running on the right worker instance and the worker instance belongs to the user in question. The logs showed that, in some fraction of the time, the function returns True after verifying that the notebook server is running on the wrong instance. The logic, thus, needed tightening. - Adding more debugging statements for the sake of ease of debugging when grepping through the manager logs. I have tested the above changes in a cluster of about 60 worker instances wherein I launched each worker instance sequentially and had them all running simultaneously. My observation was that the average time to launch reduced by 40% - from approx 1 min 20 sec to about 50 sec. In addition, no worker launched unsuccessfully, except for 1 that had a different issue going on. Refactor changes introduced to reduce unsuccessful-launch-rate Removed unnecessary try/except blocks. 
Reduced number of attempts in each call to the is_notebook_running function from 30 to 10. Increased gen.sleep time in is_notebook_running function from 1 to 3 seconds. The reasoning here is that 3 seconds should be enough for the notebook server to start. --- jupyterhub_files/spawner.py | 55 ++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index 7897c83..24779db 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -26,6 +26,7 @@ def get_local_ip_address(): SERVER_PARAMS = json.load(f) # load local server parameters LONG_RETRY_COUNT = 120 +REMOTE_NOTEBOOK_START_RETRY_MAX = 5 HUB_MANAGER_IP_ADDRESS = get_local_ip_address() NOTEBOOK_SERVER_PORT = 4444 WORKER_USERNAME = SERVER_PARAMS["WORKER_USERNAME"] @@ -153,7 +154,7 @@ def start(self): # blocking calls should be wrapped in a Future yield retry(instance.wait_until_running) #this call can occasionally fail, so we wrap it in a retry. yield self.start_worker_server(instance, new_server=False) - self.log.debug("%s , %s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) + self.log.debug("instance private ip address is %s, NOTEBOOK_SERVER_PORT is %s, user is %s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT, self.user.name)) # a longer sleep duration reduces the chance of a 503 or infinite redirect error (which a user can # resolve with a page refresh). 10s seems to be a good inflection point of behavior yield gen.sleep(10) @@ -262,27 +263,30 @@ def is_notebook_running(self, ip_address_string, attempts=1): comes first. """ with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): for i in range(attempts): - self.log.debug("function check_notebook_running for user %s, attempt %s..." % (self.user.name, i+1)) + self.log.debug("function is_notebook_running for user %s, private ip %s, attempt %s..." 
% (self.user.name, ip_address_string, i+1)) output = yield run("ps -ef | grep jupyterhub-singleuser") + self.log.debug("output of command 'ps -ef | grep jupyterhub-singleuser' for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, output)) for line in output.splitlines(): # + self.log.debug("line looped over in for-loop in function is_notebook_running for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, line)) #if "jupyterhub-singleuser" and NOTEBOOK_SERVER_PORT in line: - if "jupyterhub-singleuser" and str(NOTEBOOK_SERVER_PORT) in line: - self.log.debug("the following notebook is definitely running:") - self.log.debug(line) + if "jupyterhub-singleuser" and str(NOTEBOOK_SERVER_PORT) and str(self.user.name) and ip_address_string in line: + self.log.debug("notebook for user %s, private ip %s, is definitely running:" % (self.user.name, ip_address_string)) + self.log.debug("line confirming that notebook is running for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, line)) return True - self.log.debug("Notebook for user %s not running..." % self.user.name) - yield gen.sleep(1) - self.log.error("Notebook for user %s is not running." % self.user.name) + self.log.debug("Notebook for user %s, private ip %s, not running..." % (self.user.name, ip_address_string)) + yield gen.sleep(3) + self.log.error("Notebook for user %s, private ip %s, is not running." % (self.user.name, ip_address_string)) return False - ### Retun SSH_CONNECTION_FAILED if ssh connection failed @gen.coroutine def wait_until_SSHable(self, ip_address_string, max_retries=1): """ Run a meaningless bash command (a comment) inside a retry statement. """ self.log.debug("function wait_until_SSHable for user %s" % self.user.name) with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): + self.log.debug("about to run SSHability command for user %s. 
max_retries for the command is set to %s" % (self.user.name, max_retries)) ret = yield run("# waiting for ssh to be connectable for user %s..." % self.user.name, max_retries=max_retries) + self.log.debug("return value for SSHability command in function wait_until_SSHable, for user %s, is %s" % (self.user.name, ret)) if ret == "RETRY_FAILED": ret = "SSH_CONNECTION_FAILED" return (ret) @@ -323,12 +327,14 @@ def start_worker_server(self, instance, new_server=False): # self.user.server.port = NOTEBOOK_SERVER_PORT try: # Wait for server to finish booting... + self.log.debug("\n\n\n\nabout to call function wait_until_SSHable for user %s\n\n\n\n" % self.user.name) wait_result = yield self.wait_until_SSHable(instance.private_ip_address,max_retries=LONG_RETRY_COUNT) + self.log.debug("value for variable wait_result in function start_worker_server, for user %s, is %s" % (self.user.name, wait_result)) + if wait_result == "SSH_CONNECTION_FAILED": + raise Exception("Server start failed. Please retry by clicking on 'Home' then 'Start My Server'.") #start notebook - self.log.error("\n\n\n\nabout to check if notebook is running before launching\n\n\n\n") - notebook_running = yield self.is_notebook_running(instance.private_ip_address) - if not notebook_running: - yield self.remote_notebook_start(instance) + self.log.debug("\n\n\n\nabout to call function remote_notebook_start for user %s private ip %s\n\n\n\n" % (self.user.name, instance.private_ip_address)) + yield self.remote_notebook_start(instance) except RemoteCmdExecutionError as e: # terminate instance and create a new one self.log.exception(e) @@ -359,20 +365,25 @@ def remote_notebook_start(self, instance): for key in env: lenv = lenv + key + "=" + env[key] + " " # End setup environment - self.log.debug("function remote_server_start %s" % self.user.name) + self.log.debug("function remote_notebook_start %s" % self.user.name) worker_ip_address_string = instance.private_ip_address start_notebook_cmd = self.cmd + self.get_args() 
start_notebook_cmd = " ".join(start_notebook_cmd) - self.log.info("Starting user %s jupyterhub" % self.user.name) + self.log.info("Starting user %s private ip %s jupyterhub" % (self.user.name, worker_ip_address_string)) with settings(user = self.user.name, key_filename = FABRIC_DEFAULTS["key_filename"], host_string=worker_ip_address_string): - yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) - self.log.debug("just started the notebook for user %s, waiting." % self.user.name) - try: - self.user.settings[self.user.name] = instance.public_ip_address - except: - self.user.settings[self.user.name] = "" + yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) + self.log.debug("just started the notebook for user %s, private ip %s, waiting." % (self.user.name, worker_ip_address_string)) + notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) + self.log.debug("value for variable notebook_running in function remote_notebook_start for user %s private ip %s is %s" % (self.user.name, worker_ip_address_string, notebook_running)) + num_remote_notebook_start_retries = 0 + while not notebook_running and num_remote_notebook_start_retries < REMOTE_NOTEBOOK_START_RETRY_MAX: + yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) + self.log.debug("just retried to start the notebook for user %s, private ip %s. Attempt %s. Waiting..." 
% (self.user.name, worker_ip_address_string, num_remote_notebook_start_retries + 1)) + yield gen.sleep(3) # Wait for 3 seconds before checking whether the notebook server started + notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) + self.log.debug("value for variable notebook_running in function remote_notebook_start for user %s private ip %s is %s" % (self.user.name, worker_ip_address_string, notebook_running)) + num_remote_notebook_start_retries += 1 # self.notebook_should_be_running = True - yield self.is_notebook_running(worker_ip_address_string, attempts=30) @gen.coroutine def create_new_instance(self): From 04fc6ecb98f75a42c78fc7e9ca87120ca6428106 Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Fri, 16 Aug 2019 17:06:55 +0300 Subject: [PATCH 03/13] Specify owner tag --- jupyterhub_files/spawner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index 721e7a3..1f54054 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -34,7 +34,6 @@ def get_local_ip_address(): WORKER_TAGS = [ #These tags are set on every server created by the spawner {"Key": "Name", "Value": SERVER_PARAMS["WORKER_SERVER_NAME"]}, - {"Key": "Owner", "Value": SERVER_PARAMS["WORKER_SERVER_OWNER"]}, {"Key": "Creator", "Value": SERVER_PARAMS["WORKER_SERVER_OWNER"]}, {"Key": "Jupyter Cluster", "Value": SERVER_PARAMS["JUPYTER_CLUSTER"]}, {"Key": "environment", "Value": SERVER_PARAMS["ENVIRONMENT"]}, @@ -438,7 +437,7 @@ def create_new_instance(self): yield retry(instance.wait_until_exists) # add server tags; tags cannot be added until server exists yield retry(instance.create_tags, Tags=WORKER_TAGS) - yield retry(instance.create_tags, Tags=[{"Key": "User", "Value": self.user.name}]) + yield retry(instance.create_tags, Tags=[{"Key": "owner", "Value": self.user.name}]) # start server # blocking calls should be wrapped in a Future yield 
retry(instance.wait_until_running) From bd8d8da6755cb1cf725e5a31602cb5bd5c2f0e85 Mon Sep 17 00:00:00 2001 From: Arthur Barrett Date: Wed, 18 Mar 2020 13:56:28 -0400 Subject: [PATCH 04/13] Updated spawner logging to improve debugging and traceability. --- jupyterhub_files/spawner.py | 100 ++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index 1f54054..d2115c4 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -121,18 +121,27 @@ class InstanceSpawner(Spawner): flush. """ + def log_user(self, message='', level=logging.INFO): + user = self.user.name if self.user else None + log_message = "[user:%s] %s" % (user, message) + self.log.log(level, log_message) + @gen.coroutine def start(self): """ When user logs in, start their instance. Must return a tuple of the ip and port for the server and Jupyterhub instance. """ - self.log.debug("function start for user %s" % self.user.name) + self.log_user("start()") + last_activity = self.user.last_activity self.user.last_activity = datetime.utcnow() + self.log_user("start: user last activity updated from %s to %s" % (last_activity, self.user.last_activity)) try: instance = yield self.get_instance() #cannot be a thread pool... + self.log_user("start: instance_id: %s state: %s" % (instance.instance_id, instance.state["Name"])) #comprehensive list of states: pending, running, shutting-down, terminated, stopping, stopped. 
if instance.state["Name"] == "running": ec2_run_status = yield self.check_for_hanged_ec2(instance) if ec2_run_status == "SSH_CONNECTION_FAILED": + self.log_user("start: cannot start because hanged") #yield self.poll() #yield self.kill_instance(instance) #yield retry(instance.start, max_retries=(LONG_RETRY_COUNT*2)) @@ -141,19 +150,19 @@ def start(self): return None #start_worker_server will handle starting notebook yield self.start_worker_server(instance, new_server=False) - self.log.debug("start ip and port: %s , %s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) + self.log_user("start: started %s:%s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) self.ip = self.user.server.ip = instance.private_ip_address self.port = self.user.server.port = NOTEBOOK_SERVER_PORT return instance.private_ip_address, NOTEBOOK_SERVER_PORT elif instance.state["Name"] in ["stopped", "stopping", "pending", "shutting-down"]: #Server needs to be booted, do so. - self.log.info("Starting user %s instance " % self.user.name) + self.log_user("starting EC2 instance") yield retry(instance.start, max_retries=LONG_RETRY_COUNT) #yield retry(instance.start) # blocking calls should be wrapped in a Future yield retry(instance.wait_until_running) #this call can occasionally fail, so we wrap it in a retry. yield self.start_worker_server(instance, new_server=False) - self.log.debug("instance private ip address is %s, NOTEBOOK_SERVER_PORT is %s, user is %s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT, self.user.name)) + self.log_user("start: started %s:%s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) # a longer sleep duration reduces the chance of a 503 or infinite redirect error (which a user can # resolve with a page refresh). 10s seems to be a good inflection point of behavior yield gen.sleep(10) @@ -163,16 +172,17 @@ def start(self): elif instance.state["Name"] == "terminated": # We do not care about this state. 
The solution to this problem is to create a new server, # that cannot happen until the extant terminated server is actually deleted. (501 == not implemented) + self.log_user("start: instance is terminated, wait until it disappears") raise web.HTTPError(501,"Instance for user %s has been terminated, wait until it disappears." % self.user.name) else: # if instance is in pending, shutting-down, or rebooting state raise web.HTTPError(503, "Unknown server state for %s. Please try again in a few minutes" % self.user.name) except Server.DoesNotExist: - self.log.info("\nserver DNE for user %s\n" % self.user.name) + self.log_user("server DNE, attempting to create new instance and start worker") instance = yield self.create_new_instance() yield self.start_worker_server(instance, new_server=True) # self.notebook_should_be_running = False - self.log.debug("%s , %s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) + self.log_user("server DNE, started with %s:%s" % (instance.private_ip_address, NOTEBOOK_SERVER_PORT)) # to reduce chance of 503 or infinite redirect yield gen.sleep(10) self.ip = self.user.server.ip = instance.private_ip_address @@ -186,20 +196,20 @@ def clear_state(self): @gen.coroutine def stop(self, now=False): """ When user session stops, stop user instance """ - self.log.debug("function stop") - self.log.info("Stopping user %s instance " % self.user.name) + self.log_user("stop()") try: instance = yield self.get_instance() retry(instance.stop) + self.log_user("stop: stopped") # self.notebook_should_be_running = False except Server.DoesNotExist: - self.log.error("Couldn't stop server for user '%s' as it does not exist" % self.user.name) + self.log_user("stop: DNE - could not stop because server does not exist", level=logging.ERROR) # self.notebook_should_be_running = False self.clear_state() @gen.coroutine def kill_instance(self,instance): - self.log.debug(" Kill hanged user %s instance: %s " % (self.user.name,instance.id)) + self.log_user("kill_instance(): 
%s" % instance.id) yield self.stop(now=True) @@ -221,34 +231,34 @@ def check_for_hanged_ec2(self, instance): def poll(self): """ Polls for whether process is running. If running, return None. If not running, return exit code """ - self.log.debug("function poll for user %s" % self.user.name) + self.log_user("poll()") try: instance = yield self.get_instance() - self.log.debug(instance.state) + self.log_user("poll: instance state is %s" % instance.state) if instance.state['Name'] == 'running': - self.log.debug("poll: server is running for user %s" % self.user.name) + self.log_user("poll: instance is running, checking...") # We cannot have this be a long timeout because Jupyterhub uses poll to determine whether a user can log in. # If this has a long timeout, logging in without notebook running takes a long time. # attempts = 30 if self.notebook_should_be_running else 1 # check if the machine is hanged ec2_run_status = yield self.check_for_hanged_ec2(instance) if ec2_run_status == "SSH_CONNECTION_FAILED": - #self.log.debug(ec2_run_status) + self.log_user("poll: instance is hanging: %s" % ec2_run_status) yield self.kill_instance(instance) return "Instance Hang" else: notebook_running = yield self.is_notebook_running(instance.private_ip_address, attempts=1) if notebook_running: - self.log.debug("poll: notebook is running for user %s" % self.user.name) + self.log_user("poll: notebook is running") return None #its up! 
else: - self.log.debug("Poll, notebook is not running for user %s" % self.user.name) + self.log_user("poll: notebook is NOT running") return "server up, no instance running for user %s" % self.user.name else: - self.log.debug("instance waiting for user %s" % self.user.name) + self.log_user("poll: instance is NOT running") return "instance stopping, stopped, or pending for user %s" % self.user.name except Server.DoesNotExist: - self.log.error("Couldn't poll server for user '%s' as it does not exist" % self.user.name) + self.log_user("poll: DNE - could not poll because server does not exist") # self.notebook_should_be_running = False return "Instance not found/tracked" @@ -262,30 +272,29 @@ def is_notebook_running(self, ip_address_string, attempts=1): comes first. """ with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): for i in range(attempts): - self.log.debug("function is_notebook_running for user %s, private ip %s, attempt %s..." % (self.user.name, ip_address_string, i+1)) - output = yield run("ps -ef | grep jupyterhub-singleuser") - self.log.debug("output of command 'ps -ef | grep jupyterhub-singleuser' for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, output)) + log_msg = "is_notebook_running(%s) attempt: %s/%s" % (ip_address_string, i+1, attempts) + self.log_user(log_msg, level=logging.DEBUG) + output = yield run("nice -5 pgrep -a -f jupyterhub-singleuser") # replaces: ps -ef | grep jupyterhub-singleuser + self.log_user("%s output: %s" % (log_msg, output), level=logging.DEBUG) for line in output.splitlines(): # - self.log.debug("line looped over in for-loop in function is_notebook_running for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, line)) #if "jupyterhub-singleuser" and NOTEBOOK_SERVER_PORT in line: if "jupyterhub-singleuser" and str(NOTEBOOK_SERVER_PORT) and str(self.user.name) and ip_address_string in line: - self.log.debug("notebook for user %s, private ip %s, is definitely running:" % 
(self.user.name, ip_address_string)) - self.log.debug("line confirming that notebook is running for user %s, private ip %s, is %s" % (self.user.name, ip_address_string, line)) + self.log_user("%s check completed, is running" % log_msg, level=logging.DEBUG) return True - self.log.debug("Notebook for user %s, private ip %s, not running..." % (self.user.name, ip_address_string)) + self.log_user("%s check in progress, not running" % log_msg, level=logging.DEBUG) yield gen.sleep(3) - self.log.error("Notebook for user %s, private ip %s, is not running." % (self.user.name, ip_address_string)) + self.log_user("%s check completed, not running" % log_msg, level=logging.DEBUG) return False ### Retun SSH_CONNECTION_FAILED if ssh connection failed @gen.coroutine def wait_until_SSHable(self, ip_address_string, max_retries=1): """ Run a meaningless bash command (a comment) inside a retry statement. """ - self.log.debug("function wait_until_SSHable for user %s" % self.user.name) + self.log_user("wait_until_SSHable()") with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): - self.log.debug("about to run SSHability command for user %s. max_retries for the command is set to %s" % (self.user.name, max_retries)) + self.log_user("wait_until_SSHable max_retries:%s" % max_retries, level=logging.DEBUG) ret = yield run("# waiting for ssh to be connectable for user %s..." % self.user.name, max_retries=max_retries) - self.log.debug("return value for SSHability command in function wait_until_SSHable, for user %s, is %s" % (self.user.name, ret)) + self.log_user("wait_until_SSHable completed return: %s" % ret, level=logging.DEBUG) if ret == "RETRY_FAILED": ret = "SSH_CONNECTION_FAILED" return (ret) @@ -299,23 +308,24 @@ def get_instance(self): it raises Server.DoesNotExist error. 
If the instance in the database but boto can't find the instance, it raise 500 http error """ - self.log.debug("function get_instance for user %s" % self.user.name) + self.log_user("get_instance()") server = Server.get_server(self.user.name) resource = yield retry(boto3.resource, "ec2", region_name=SERVER_PARAMS["REGION"]) try: ret = yield retry(resource.Instance, server.server_id) - self.log.debug("return for get_instance for user %s: %s" % (self.user.name, ret)) + self.log_user("get_instance: returned: %s" % ret) # boto3.Instance is lazily loaded. Force with .load() yield retry(ret.load) if ret.meta.data is None: + self.log_user("get_instance: could not access instance", level=logging.ERROR) raise web.HTTPError(500, "Couldn't access instance for user '%s'. Please try again in a few minutes" % self.user.name) #Server.remove_server(server.server_id) #raise Server.DoesNotExist() return ret except ClientError as e: - self.log.error("get_instance client error: %s" % e) + self.log_user("get_instance client error: %s" % e) if "InvalidInstanceID.NotFound" not in str(e): - self.log.error("Couldn't find instance for user '%s'" % self.user.name) + self.log_user("get_instance: could not find instance for user", level=logging.ERROR) raise web.HTTPError(500, "Couldn't access instance for user '%s'. Please try again in a few minutes" % self.user.name) #Server.remove_server(server.server_id) #raise Server.DoesNotExist() @@ -325,20 +335,19 @@ def get_instance(self): def start_worker_server(self, instance, new_server=False): """ Runs remote commands on worker server to mount user EBS and connect to Jupyterhub. 
If new_server=True, also create filesystem on newly created user EBS""" - self.log.debug("function start_worker_server for user %s" % self.user.name) + self.log_user("start_worker_server()") # redundant variable set for get_args() self.ip = self.user.server.ip = instance.private_ip_address self.port = self.user.server.port = NOTEBOOK_SERVER_PORT # self.user.server.port = NOTEBOOK_SERVER_PORT try: # Wait for server to finish booting... - self.log.debug("\n\n\n\nabout to call function wait_until_SSHable for user %s\n\n\n\n" % self.user.name) wait_result = yield self.wait_until_SSHable(instance.private_ip_address,max_retries=LONG_RETRY_COUNT) - self.log.debug("value for variable wait_result in function start_worker_server, for user %s, is %s" % (self.user.name, wait_result)) + self.log_user("start_worker_server wait_result: %s" % wait_result) if wait_result == "SSH_CONNECTION_FAILED": raise Exception("Server start failed. Please retry by clicking on 'Home' then 'Start My Server'.") #start notebook - self.log.debug("\n\n\n\nabout to call function remote_notebook_start for user %s private ip %s\n\n\n\n" % (self.user.name, instance.private_ip_address)) + self.log_user("start_worker_server starting remote notebook: %s" % instance.private_ip_address) yield self.remote_notebook_start(instance) except RemoteCmdExecutionError as e: # terminate instance and create a new one @@ -364,36 +373,39 @@ def get_env(self): @gen.coroutine def remote_notebook_start(self, instance): """ Do notebook start command on the remote server.""" + self.log_user("remote_notebook_start()") + # Setup environments env = self.get_env() lenv='' for key in env: lenv = lenv + key + "=" + env[key] + " " # End setup environment - self.log.debug("function remote_notebook_start %s" % self.user.name) worker_ip_address_string = instance.private_ip_address start_notebook_cmd = self.cmd + self.get_args() start_notebook_cmd = " ".join(start_notebook_cmd) - self.log.info("Starting user %s private ip %s jupyterhub" 
% (self.user.name, worker_ip_address_string)) + self.log_user("remote_notebook_start private ip: %s" % worker_ip_address_string) with settings(user = self.user.name, key_filename = FABRIC_DEFAULTS["key_filename"], host_string=worker_ip_address_string): yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) - self.log.debug("just started the notebook for user %s, private ip %s, waiting." % (self.user.name, worker_ip_address_string)) + self.log_user("remote_notebook_start private ip: %s, waiting." % worker_ip_address_string) notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) - self.log.debug("value for variable notebook_running in function remote_notebook_start for user %s private ip %s is %s" % (self.user.name, worker_ip_address_string, notebook_running)) + self.log_user("remote_notebook_start private ip: %s, running: %s" % (worker_ip_address_string, notebook_running)) num_remote_notebook_start_retries = 0 while not notebook_running and num_remote_notebook_start_retries < REMOTE_NOTEBOOK_START_RETRY_MAX: yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) - self.log.debug("just retried to start the notebook for user %s, private ip %s. Attempt %s. Waiting..." % (self.user.name, worker_ip_address_string, num_remote_notebook_start_retries + 1)) + self.log_user("remote_notebook_start private ip: %s, retry attempt %s/%s. waiting..." 
% (worker_ip_address_string, num_remote_notebook_start_retries + 1, REMOTE_NOTEBOOK_START_RETRY_MAX)) yield gen.sleep(3) # Wait for 3 seconds before checking whether the notebook server started notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) - self.log.debug("value for variable notebook_running in function remote_notebook_start for user %s private ip %s is %s" % (self.user.name, worker_ip_address_string, notebook_running)) + self.log_user("remote_notebook_start private ip: %s, running: %s" % (worker_ip_address_string, notebook_running)) + if notebook_running: + break # break loop num_remote_notebook_start_retries += 1 # self.notebook_should_be_running = True @gen.coroutine def create_new_instance(self): """ Creates and boots a new server to host the worker instance.""" - self.log.debug("function create_new_instance %s" % self.user.name) + self.log_user("create_new_instance()") ec2 = boto3.client("ec2", region_name=SERVER_PARAMS["REGION"]) resource = boto3.resource("ec2", region_name=SERVER_PARAMS["REGION"]) BDM = [] From ba4a9769e1d03b062dfbbe196336c3c6560bd5fd Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Fri, 20 Mar 2020 01:11:18 +0300 Subject: [PATCH 05/13] Improve poll functionality Changes include increasing the poll interval and increasing the number of attempts the is_notebook_running() function will be called with from within the poll() function. This is meant to reduce the chances of the poll() function wrongly determining that the jupyterhub-singleuser process is not running on a worker instance when it actually is. 
--- jupyterhub_files/jupyterhub_config.py | 2 +- jupyterhub_files/spawner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jupyterhub_files/jupyterhub_config.py b/jupyterhub_files/jupyterhub_config.py index 09e22b9..c176e0d 100644 --- a/jupyterhub_files/jupyterhub_config.py +++ b/jupyterhub_files/jupyterhub_config.py @@ -48,7 +48,7 @@ api_token = f.read().strip() c.JupyterHub.api_tokens = {api_token:"__tokengeneratoradmin"} -c.Spawner.poll_interval = 10 +c.Spawner.poll_interval = 15 c.Spawner.http_timeout = 300 c.Spawner.start_timeout = 300 diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index d2115c4..a519686 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -247,7 +247,7 @@ def poll(self): yield self.kill_instance(instance) return "Instance Hang" else: - notebook_running = yield self.is_notebook_running(instance.private_ip_address, attempts=1) + notebook_running = yield self.is_notebook_running(instance.private_ip_address, attempts=3) if notebook_running: self.log_user("poll: notebook is running") return None #its up! From 0fa6a5ccbb8e63f6596734c543dc23cdb85fec39 Mon Sep 17 00:00:00 2001 From: Arthur Barrett Date: Thu, 19 Mar 2020 19:26:13 -0400 Subject: [PATCH 06/13] Upgrade from Fabric3 to Fabric. Fabric3 is an unauthorized fork of the main fabric project. It was originally forked to add python3 support, but the main fabric project now supports python3 and also has gone through a significant rewrite. The motivation for upgrading, besides the fact that Fabric3 is deprecated and no longer supported, is the fact that fabric is thread-safe, providing better support for concurrency. 
--- jupyterhub_files/spawner.py | 60 ++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index d2115c4..8ca5989 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -2,11 +2,10 @@ import logging import socket import boto3 -from fabric.api import env, sudo as _sudo, run as _run -from fabric.operations import put as _put -from fabric.context_managers import settings -from fabric.exceptions import NetworkError -from paramiko.ssh_exception import SSHException, ChannelException + +from fabric2 import Connection +from invoke.exceptions import UnexpectedExit, CommandTimedOut +from paramiko.ssh_exception import SSHException, ChannelException, NoValidConnectionsError from botocore.exceptions import ClientError, WaiterError from datetime import datetime from tornado import gen, web @@ -31,7 +30,6 @@ def get_local_ip_address(): NOTEBOOK_SERVER_PORT = 4444 WORKER_USERNAME = SERVER_PARAMS["WORKER_USERNAME"] - WORKER_TAGS = [ #These tags are set on every server created by the spawner {"Key": "Name", "Value": SERVER_PARAMS["WORKER_SERVER_NAME"]}, {"Key": "Creator", "Value": SERVER_PARAMS["WORKER_SERVER_OWNER"]}, @@ -54,29 +52,20 @@ def get_local_ip_address(): #Global Fabric config +FABRIC_KEY_FILENAME = "/home/%s/.ssh/%s" % (SERVER_PARAMS["SERVER_USERNAME"], SERVER_PARAMS["KEY_NAME"]) +FABRIC_CONNECT_KWARGS = { + "key_filename": FABRIC_KEY_FILENAME, +} class RemoteCmdExecutionError(Exception): pass -env.abort_exception = RemoteCmdExecutionError -env.abort_on_prompts = True -FABRIC_DEFAULTS = {"user":SERVER_PARAMS["WORKER_USERNAME"], - "key_filename":"/home/%s/.ssh/%s" % (SERVER_PARAMS["SERVER_USERNAME"], SERVER_PARAMS["KEY_NAME"])} - -FABRIC_QUIET = True -#FABRIC_QUIET = False -# Make Fabric only print output of commands when logging level is greater than warning. 
- -@gen.coroutine -def sudo(*args, **kwargs): - ret = yield retry(_sudo, *args, **kwargs, quiet=FABRIC_QUIET) - return ret @gen.coroutine -def run(*args, **kwargs): - ret = yield retry(_run, *args, **kwargs, quiet=FABRIC_QUIET) +def sudo(connection, *args, **kwargs): + ret = yield retry(connection.sudo, *args, **kwargs, hide=True) return ret @gen.coroutine -def put(*args, **kwargs): - ret = yield retry(_put, *args, **kwargs) +def run(connection, *args, **kwargs): + ret = yield retry(connection.run, *args, **kwargs, hide=True) return ret @gen.coroutine @@ -91,11 +80,13 @@ def retry(function, *args, **kwargs): try: ret = yield thread_pool.submit(function, *args, **kwargs) return ret - except (ClientError, WaiterError, NetworkError, RemoteCmdExecutionError, EOFError, SSHException, ChannelException) as e: - #EOFError can occur in fabric + except (ClientError, WaiterError, CommandTimedOut, SSHException, ChannelException, NoValidConnectionsError) as e: logger.error("Failure in %s with args %s and kwargs %s" % (function.__name__, args, kwargs)) logger.info("retrying %s, (~%s seconds elapsed)" % (function.__name__, attempt * 3)) yield gen.sleep(timeout) + except UnexpectedExit as e: + logger.exception(e) + raise RemoteCmdExecutionError(str(e)) else: logger.error("Failure in %s with args %s and kwargs %s" % (function.__name__, args, kwargs)) yield gen.sleep(0.1) #this line exists to allow the logger time to print @@ -270,20 +261,21 @@ def is_notebook_running(self, ip_address_string, attempts=1): """ Checks if jupyterhub/notebook is running on the target machine, returns True if Yes, False if not. If an attempts count N is provided the check will be run N times or until the notebook is running, whichever comes first. 
""" - with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): + with Connection(user=WORKER_USERNAME, host=ip_address_string, connect_kwargs=FABRIC_CONNECT_KWARGS) as c: for i in range(attempts): log_msg = "is_notebook_running(%s) attempt: %s/%s" % (ip_address_string, i+1, attempts) self.log_user(log_msg, level=logging.DEBUG) - output = yield run("nice -5 pgrep -a -f jupyterhub-singleuser") # replaces: ps -ef | grep jupyterhub-singleuser + result = yield run(c, "nice -5 pgrep -a -f jupyterhub-singleuser", timeout=2) # replaces: ps -ef | grep jupyterhub-singleuser + output = result.stdout self.log_user("%s output: %s" % (log_msg, output), level=logging.DEBUG) - for line in output.splitlines(): # + for line in output.splitlines(): #if "jupyterhub-singleuser" and NOTEBOOK_SERVER_PORT in line: if "jupyterhub-singleuser" and str(NOTEBOOK_SERVER_PORT) and str(self.user.name) and ip_address_string in line: self.log_user("%s check completed, is running" % log_msg, level=logging.DEBUG) return True self.log_user("%s check in progress, not running" % log_msg, level=logging.DEBUG) yield gen.sleep(3) - self.log_user("%s check completed, not running" % log_msg, level=logging.DEBUG) + self.log_user("%s check completed, not running" % log_msg, level=logging.INFO) return False ### Retun SSH_CONNECTION_FAILED if ssh connection failed @@ -291,9 +283,9 @@ def is_notebook_running(self, ip_address_string, attempts=1): def wait_until_SSHable(self, ip_address_string, max_retries=1): """ Run a meaningless bash command (a comment) inside a retry statement. """ self.log_user("wait_until_SSHable()") - with settings(**FABRIC_DEFAULTS, host_string=ip_address_string): + with Connection(user=WORKER_USERNAME, host=ip_address_string, connect_kwargs=FABRIC_CONNECT_KWARGS) as c: self.log_user("wait_until_SSHable max_retries:%s" % max_retries, level=logging.DEBUG) - ret = yield run("# waiting for ssh to be connectable for user %s..." 
% self.user.name, max_retries=max_retries) + ret = yield run(c, "# waiting for ssh to be connectable for user %s..." % self.user.name, max_retries=max_retries) self.log_user("wait_until_SSHable completed return: %s" % ret, level=logging.DEBUG) if ret == "RETRY_FAILED": ret = "SSH_CONNECTION_FAILED" @@ -385,14 +377,14 @@ def remote_notebook_start(self, instance): start_notebook_cmd = self.cmd + self.get_args() start_notebook_cmd = " ".join(start_notebook_cmd) self.log_user("remote_notebook_start private ip: %s" % worker_ip_address_string) - with settings(user = self.user.name, key_filename = FABRIC_DEFAULTS["key_filename"], host_string=worker_ip_address_string): - yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) + with Connection(user=self.user.name, host=worker_ip_address_string, connect_kwargs=FABRIC_CONNECT_KWARGS) as c: + yield sudo(c, "%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) self.log_user("remote_notebook_start private ip: %s, waiting." % worker_ip_address_string) notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) self.log_user("remote_notebook_start private ip: %s, running: %s" % (worker_ip_address_string, notebook_running)) num_remote_notebook_start_retries = 0 while not notebook_running and num_remote_notebook_start_retries < REMOTE_NOTEBOOK_START_RETRY_MAX: - yield sudo("%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) + yield sudo(c, "%s %s --user=%s --notebook-dir=/home/%s/ --allow-root > /tmp/jupyter.log 2>&1 &" % (lenv, start_notebook_cmd,self.user.name,self.user.name), pty=False) self.log_user("remote_notebook_start private ip: %s, retry attempt %s/%s. waiting..." 
% (worker_ip_address_string, num_remote_notebook_start_retries + 1, REMOTE_NOTEBOOK_START_RETRY_MAX)) yield gen.sleep(3) # Wait for 3 seconds before checking whether the notebook server started notebook_running = yield self.is_notebook_running(worker_ip_address_string, attempts=10) From 95b59bc28001bfcfe79bd110cd28defbfbe2a687 Mon Sep 17 00:00:00 2001 From: Arthur Barrett Date: Fri, 20 Mar 2020 15:18:27 -0400 Subject: [PATCH 07/13] Updated requirements files to include fabric2. --- jupyterhub_files/requirements_jupyterhub.txt | 1 + launch_cluster/requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/jupyterhub_files/requirements_jupyterhub.txt b/jupyterhub_files/requirements_jupyterhub.txt index 03983d9..e41284e 100644 --- a/jupyterhub_files/requirements_jupyterhub.txt +++ b/jupyterhub_files/requirements_jupyterhub.txt @@ -5,6 +5,7 @@ python-dateutil escapism cronutils fabric3 +fabric2==2.5.0 pytz # optional; as needed for authentication diff --git a/launch_cluster/requirements.txt b/launch_cluster/requirements.txt index b331c1f..d8661fb 100644 --- a/launch_cluster/requirements.txt +++ b/launch_cluster/requirements.txt @@ -1,4 +1,4 @@ boto3 -fabric +fabric2==2.5.0 fabric3 -paramiko==2.4.0 +paramiko \ No newline at end of file From 7a7583b09cc9f109a3b9809ffbea92063096e498 Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Thu, 16 Apr 2020 17:43:15 +0300 Subject: [PATCH 08/13] Refactor launch script with fabric upgrade changes --- launch_cluster/launch.py | 123 ++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/launch_cluster/launch.py b/launch_cluster/launch.py index 9d08a80..83ab74c 100755 --- a/launch_cluster/launch.py +++ b/launch_cluster/launch.py @@ -18,8 +18,9 @@ import sys from time import sleep from botocore.exceptions import ClientError, WaiterError -from fabric.api import env, run, put, sudo -from fabric.exceptions import NetworkError +from paramiko.ssh_exception import 
NoValidConnectionsError +from fabric2 import Connection +from patchwork.transfers import rsync from secure import (AWS_ACCESS_KEY_ID, AWS_SECRET_KEY, KEY_NAME, KEY_PATH, MANAGER_IAM_ROLE, VPC_ID) @@ -36,10 +37,6 @@ class RemoteCmdExecutionError(Exception): pass -#global fabric config -env.abort_exception = RemoteCmdExecutionError -env.abort_on_prompts = True - def launch_manager(config): """ Creates security groups, Jupyterhub manager, and worker AMI. Refer to README.md for details on what the @@ -78,15 +75,11 @@ def launch_manager(config): instance.wait_until_exists() instance.wait_until_running() instance.create_tags(Tags=tags) - - # Configure fabric - env.host_string = instance.public_ip_address - env.key_filename = KEY_PATH - env.user = config.server_username # Wait for server to finish booting (literally keep trying until you can # successfully run a command on the server via ssh) - retry(run, "# waiting for ssh to be connectable...", max_retries=100) + with Connection(host=instance.public_ip_address, user=config.server_username, connect_kwargs={"key_filename": KEY_PATH}) as connection: + retry(connection.run, "# waiting for ssh to be connectable...", max_retries=100) # These parameters will be used by the manager to launch a worker worker_server_name = "JUPYTER_HUB_%s_%s_WORKER" % (availability_zone.split("-")[-1], config.cluster_name) @@ -114,7 +107,8 @@ def launch_manager(config): } # Setup the common files and settings between manager and worker. 
- setup_manager(server_params, config, instance.private_ip_address) + with Connection(host=instance.public_ip_address, user=config.server_username, connect_kwargs={"key_filename": KEY_PATH}) as connection: + setup_manager(connection, server_params, config, instance.private_ip_address) # For security, close port 22 on manager security group to prevent SSH access to manager host # logger.info("Closing port 22 on manager") @@ -124,48 +118,46 @@ def launch_manager(config): print("Launch script done.") -def setup_manager(server_params,config, manager_ip_address): +def setup_manager(connection, server_params,config, manager_ip_address): """ Sets up the files that are common to both workers and the manager, runs before worke and jupyterhub setup. """ - put("common_files", remote_path="/var/tmp/") + rsync(connection, "common_files", "/var/tmp") # upload key to manager for usage of SSHing into worker servers - put(KEY_PATH, remote_path="/home/%s/.ssh/%s" % (server_params["SERVER_USERNAME"], KEY_NAME)) - sudo("chmod 600 /home/%s/.ssh/%s" % (server_params["SERVER_USERNAME"], KEY_NAME)) + connection.put(KEY_PATH, remote="/home/%s/.ssh/%s" % (server_params["SERVER_USERNAME"], KEY_NAME)) + connection.sudo("chmod 600 /home/%s/.ssh/%s" % (server_params["SERVER_USERNAME"], KEY_NAME)) # bash environment configuration files (for devs and admins)worker_security_group - run("cp /var/tmp/common_files/.inputrc ~/") - run("cp /var/tmp/common_files/.bash_profile ~/") + connection.run("cp /var/tmp/common_files/.inputrc ~/") + connection.run("cp /var/tmp/common_files/.bash_profile ~/") # Common installs: python 3 - sudo("apt-get -qq -y update") - - sudo("apt-get -qq -y install -q python3-pip sqlite", quiet=True) - sudo("pip3 install --upgrade pip") - sudo("apt-get -qq -y remove -q python3-pip") - sudo("hash -r") - #sudo("hash -d pip") + connection.sudo("sh -c \"apt-get -y update && sleep 15 && apt-get install -y python3-pip sqlite\"") + connection.sudo("pip3 install --upgrade pip") + 
connection.sudo("apt-get -qq remove -q python3-pip") + connection.sudo("sh -c \"hash -r\"") + #connection.run("hash -d pip") - sudo("pip3 -q install ipython nbgrader", quiet=True) + connection.sudo("pip3 -q install ipython nbgrader", hide=True) # Sets up jupyterhub components - put("jupyterhub_files", remote_path="/var/tmp/") - sudo("cp -r /var/tmp/jupyterhub_files /etc/jupyterhub") - sudo("pip3 install --quiet -r /var/tmp/jupyterhub_files/requirements_jupyterhub.txt") + rsync(connection, "jupyterhub_files", "/var/tmp") + connection.sudo("cp -r /var/tmp/jupyterhub_files /etc/jupyterhub") + connection.sudo("pip3 install --quiet -r /var/tmp/jupyterhub_files/requirements_jupyterhub.txt") # apt-get installs for jupyterhub - sudo("apt-get -qq -y install -q nodejs npm") + connection.sudo("apt-get -qq install -q nodejs npm") # npm installs for the jupyterhub proxy - sudo("npm install -q -g configurable-http-proxy") + connection.sudo("npm install -q -g configurable-http-proxy") # move init script into place so we can have jupyterhub run as a "service". 
- sudo("cp /var/tmp/jupyterhub_files/jupyterhub_service.sh /etc/init.d/jupyterhub") - sudo("chmod +x /etc/init.d/jupyterhub") - sudo("systemctl daemon-reload") - sudo("systemctl enable jupyterhub") + connection.sudo("cp /var/tmp/jupyterhub_files/jupyterhub_service.sh /etc/init.d/jupyterhub") + connection.sudo("chmod +x /etc/init.d/jupyterhub") + connection.sudo("systemctl daemon-reload") + connection.sudo("systemctl enable jupyterhub") # Put the server_params dict into the environment - sudo("echo '%s' > /etc/jupyterhub/server_config.json" % json.dumps(server_params)) + connection.run("sudo echo '%s' | sudo tee /etc/jupyterhub/server_config.json" % json.dumps(server_params)) # Generate a token value for use in making authenticated calls to the jupyterhub api # Note: this value cannot be put into the server_params because the file is imported in our spawner - sudo("/usr/local/bin/jupyterhub token -f /etc/jupyterhub/jupyterhub_config.py __tokengeneratoradmin > /etc/jupyterhub/api_token.txt") + connection.sudo("sh -c \"/usr/local/bin/jupyterhub token -f /etc/jupyterhub/jupyterhub_config.py __tokengeneratoradmin > /etc/jupyterhub/api_token.txt\"") # start jupyterhub - sudo("service jupyterhub start", pty=False) + connection.sudo("service jupyterhub start", pty=False) # move our cron script into place - sudo("cp /etc/jupyterhub/jupyterhub_cron.txt /etc/cron.d/jupyterhub_cron") + connection.sudo("cp /etc/jupyterhub/jupyterhub_cron.txt /etc/cron.d/jupyterhub_cron") if not config.custom_worker_ami: logger.info("Manager server successfully launched. Please wait 15 minutes for the worker server AMI image to become available. 
No worker servers (and thus, no user sessions) can be launched until the AMI is available.") # TODO: generate ssl files and enable jupyterhub ssl @@ -177,37 +169,33 @@ def make_worker_ami(config, ec2, security_group_list): instance.wait_until_exists() instance.wait_until_running() - # Configure fabric - env.host_string = instance.public_ip_address - env.key_filename = KEY_PATH - env.user = config.server_username + with Connection(host=instance.public_ip_address, user=config.server_username, connect_kwargs={"key_filename": KEY_PATH}) as connection: + # Wait for server to finish booting (keep trying until you can successfully run a command on the server via ssh) + retry(connection.run, "# waiting for ssh to be connectable...", max_retries=100) - # Wait for server to finish booting (keep trying until you can successfully run a command on the server via ssh) - retry(run, "# waiting for ssh to be connectable...", max_retries=100) + connection.sudo("apt-get -qq -y update") - sudo("apt-get -qq -y update") + connection.sudo("apt-get -qq -y install -q python python-dev python-pip") + connection.sudo("pip install --upgrade pip") + connection.sudo("apt-get -qq -y remove -q python-pip") + connection.sudo("hash -r") - sudo("apt-get -qq -y install -q python python-dev python-pip") - sudo("pip install --upgrade pip") - sudo("apt-get -qq -y remove -q python-pip") - sudo("hash -r") + connection.sudo("apt-get -qq -y install -q python3-pip sqlite") + connection.sudo("pip3 install --upgrade pip") + connection.sudo("apt-get -qq -y remove -q python3-pip") + connection.sudo("hash -r") - sudo("apt-get -qq -y install -q python3-pip sqlite") - sudo("pip3 install --upgrade pip") - sudo("apt-get -qq -y remove -q python3-pip") - sudo("hash -r") + connection.put("jupyterhub_files/requirements_jupyterhub.txt", remote_path="/var/tmp/") + connection.sudo("pip3 install --quiet -r /var/tmp/requirements_jupyterhub.txt") - put("jupyterhub_files/requirements_jupyterhub.txt", remote_path="/var/tmp/") - 
sudo("pip3 install --quiet -r /var/tmp/requirements_jupyterhub.txt") + connection.sudo("pip3 -q install ipython jupyter ipykernel nbgrader") + connection.sudo("pip2 -q install ipykernel --upgrade") - sudo("pip3 -q install ipython jupyter ipykernel nbgrader") - sudo("pip2 -q install ipykernel --upgrade") - - # register Python 3 and 2 kernel - sudo("python3 -m ipykernel install") - sudo("python2 -m ipykernel install") - sudo("chmod 755 /mnt") - sudo("chown ubuntu /mnt") + # register Python 3 and 2 kernel + connection.sudo("python3 -m ipykernel install") + connection.sudo("python2 -m ipykernel install") + connection.sudo("chmod 755 /mnt") + connection.sudo("chown ubuntu /mnt") # Create AMI for workers logger.info("Creating worker AMI") @@ -342,7 +330,7 @@ def validate_config(): if config.ignore_permissions == "false": permissions = oct(os.stat(KEY_PATH).st_mode % 2 ** 9) #if permissions[2:] != "600": <--- And update this - if permissions[1:] != "600": + if permissions[1:] != "600": print("Your key file permissions are %s, they need to be (0)600 " "or else the configuration script will not be able to connect " "to the server.\n" @@ -384,7 +372,7 @@ def retry(function, *args, **kwargs): print (".", sys.stdout.flush()) try: return function(*args, **kwargs) - except (ClientError, NetworkError, WaiterError) as e: + except (ClientError, NoValidConnectionsError, WaiterError) as e: logger.debug("retrying %s, (~%s seconds elapsed)" % (function, i * 3)) sleep(timeout) logger.error("hit max retries on %s" % function) @@ -406,3 +394,4 @@ def retry(function, *args, **kwargs): config = parser.parse_args() validate_config() launch_manager(config) + From 787fd636c74c12d3f2b192a6b06d2120a0d0b900 Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Thu, 30 Apr 2020 21:51:59 +0300 Subject: [PATCH 09/13] Fix indentation --- launch_cluster/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_cluster/launch.py b/launch_cluster/launch.py index 83ab74c..9f7a111 
100755 --- a/launch_cluster/launch.py +++ b/launch_cluster/launch.py @@ -330,7 +330,7 @@ def validate_config(): if config.ignore_permissions == "false": permissions = oct(os.stat(KEY_PATH).st_mode % 2 ** 9) #if permissions[2:] != "600": <--- And update this - if permissions[1:] != "600": + if permissions[1:] != "600": print("Your key file permissions are %s, they need to be (0)600 " "or else the configuration script will not be able to connect " "to the server.\n" From 04cbfcd2fbcc2df12457439497ab381ace690dae Mon Sep 17 00:00:00 2001 From: Joshua Getega Date: Wed, 17 Feb 2021 22:16:55 +0300 Subject: [PATCH 10/13] Update 'whitelist' naming to 'allowed_users' as in JHub 1.2 change --- jupyterhub_files/jupyterhub_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jupyterhub_files/jupyterhub_config.py b/jupyterhub_files/jupyterhub_config.py index c176e0d..2d7df70 100644 --- a/jupyterhub_files/jupyterhub_config.py +++ b/jupyterhub_files/jupyterhub_config.py @@ -83,9 +83,9 @@ c.LocalAuthenticator.add_user_cmd = ['adduser', '-q', '--gecos', '""', '--disabled-password', '--force-badname'] c.LocalAuthenticator.create_system_users = True -# Add users to the admin list, the whitelist, and also record their user ids +# Add users to the admin list, the allowed_users list, and also record their user ids c.Authenticator.admin_users = admin = set() -c.Authenticator.whitelist = whitelist = set() +c.Authenticator.allowed_users = allowed_users = set() if os.path.isfile('/etc/jupyterhub/userlist'): with open('/etc/jupyterhub/userlist') as f: for line in f: @@ -93,7 +93,7 @@ continue parts = line.split() name = parts[0] - whitelist.add(name) + allowed_users.add(name) if len(parts) > 1 and parts[1] == 'admin': admin.add(name) From b0cf31375f188b5466efe9ddfee5761b66120f0b Mon Sep 17 00:00:00 2001 From: dodget Date: Fri, 26 Feb 2021 15:17:01 -0500 Subject: [PATCH 11/13] Increase max_retries to 30 in create_new_instance --- jupyterhub_files/spawner.py | 1 + 1 
file changed, 1 insertion(+) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index 6c7b660..e9156a0 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -434,6 +434,7 @@ def create_new_instance(self): SecurityGroupIds=SERVER_PARAMS["WORKER_SECURITY_GROUPS"], BlockDeviceMappings=BDM, UserData=user_data_script, + max_retries=30, ) instance_id = reservation["Instances"][0]["InstanceId"] instance = yield retry(resource.Instance, instance_id) From 3400e8c0ee14920c8473e622fb790302c163870a Mon Sep 17 00:00:00 2001 From: dodget Date: Mon, 8 Mar 2021 11:07:07 -0500 Subject: [PATCH 12/13] Add logging for reservation --- jupyterhub_files/spawner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jupyterhub_files/spawner.py b/jupyterhub_files/spawner.py index e9156a0..2226126 100644 --- a/jupyterhub_files/spawner.py +++ b/jupyterhub_files/spawner.py @@ -436,6 +436,8 @@ def create_new_instance(self): UserData=user_data_script, max_retries=30, ) + self.log_user("result of retry(ec2.run_instances): %s" % reservation) + instance_id = reservation["Instances"][0]["InstanceId"] instance = yield retry(resource.Instance, instance_id) Server.new_server(instance_id, self.user.name) From c590d84aa99f2f5f5463b4d0701326a257dcbec4 Mon Sep 17 00:00:00 2001 From: Arthur Barrett Date: Thu, 25 Mar 2021 11:12:27 -0400 Subject: [PATCH 13/13] Ensure the launcher installs NodeJS v14 (current LTS release) and updates npm. 
--- launch_cluster/launch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/launch_cluster/launch.py b/launch_cluster/launch.py index 9f7a111..06ab849 100755 --- a/launch_cluster/launch.py +++ b/launch_cluster/launch.py @@ -141,8 +141,10 @@ def setup_manager(connection, server_params,config, manager_ip_address): connection.sudo("cp -r /var/tmp/jupyterhub_files /etc/jupyterhub") connection.sudo("pip3 install --quiet -r /var/tmp/jupyterhub_files/requirements_jupyterhub.txt") # apt-get installs for jupyterhub - connection.sudo("apt-get -qq install -q nodejs npm") + connection.sudo("curl -fsSL https://deb.nodesource.com/setup_14.x | sudo -E bash -") + connection.sudo("apt-get -qq install nodejs") # npm installs for the jupyterhub proxy + connection.sudo("npm install -g npm@latest") connection.sudo("npm install -q -g configurable-http-proxy") # move init script into place so we can have jupyterhub run as a "service". connection.sudo("cp /var/tmp/jupyterhub_files/jupyterhub_service.sh /etc/init.d/jupyterhub")