diff --git a/etc/taskcluster/decisionlib.py b/etc/taskcluster/decisionlib.py index 53778221b07..30d01d65c4f 100644 --- a/etc/taskcluster/decisionlib.py +++ b/etc/taskcluster/decisionlib.py @@ -35,7 +35,7 @@ class DecisionTask: "0a7d012ce444d62ffb9e7f06f0c52fedc24b68c2060711b313263367f7272d9d" def __init__(self, *, index_prefix="garbage.servo-decisionlib", task_name_template="%s", - worker_type="github-worker", docker_image_cache_expiry="1 year", + worker_type="github-worker", docker_image_cache_expiry="1 month", routes_for_all_subtasks=None, scopes_for_all_subtasks=None): self.task_name_template = task_name_template self.index_prefix = index_prefix diff --git a/etc/taskcluster/windows/.gitignore b/etc/taskcluster/windows/.gitignore new file mode 100644 index 00000000000..5570b425ee1 --- /dev/null +++ b/etc/taskcluster/windows/.gitignore @@ -0,0 +1 @@ +*.id_rsa diff --git a/etc/taskcluster/windows/README.md b/etc/taskcluster/windows/README.md new file mode 100644 index 00000000000..89271d9e043 --- /dev/null +++ b/etc/taskcluster/windows/README.md @@ -0,0 +1,88 @@ +# Windows AMIs for Servo on Taskcluster + +Unlike Linux tasks on `docker-worker` where each task is executed in a container +based on a Docker image provided with the task, +Windows tasks on Taskcluster are typically run by `generic-worker` +where tasks are executed directly in the worker’s environment. +So we may want to install some tools globally on the system, to make them available to tasks. + +With the [AWS provisioner], this means building a custom AMI. +We need to boot an instance on a base Windows AMI, +install what we need (including `generic-worker` itself), +then take an image of that instance. +The [`worker_types`] directory in `generic-worker`’s repository +has some scripts that automate this, +in order to make it more reproducible than clicking around. 
+The trick is that a PowerShell script to run on boot can be provided +when starting a Windows instance on EC2, and of course AWS has an API. + +[AWS provisioner]: https://docs.taskcluster.net/docs/reference/integrations/aws-provisioner/references/api +[`worker_types`]: https://github.com/taskcluster/generic-worker/blob/master/worker_types/ + + +## Building and deploying a new image + +* Install and configure the [AWS command-line tool]. +* Make your changes to `first-boot.ps1` and/or `base-ami.txt`. +* Run `python3 build-ami.py`. Note that it can take many minutes to complete. +* Save the administrator password together with the image ID + in Servo’s shared 1Password account, in the *Taskcluster Windows AMIs* note. +* In the [worker type definition], edit `ImageId` and `DeploymentId`. + +Note that the new worker type definition will only apply to newly-provisioned workers. + +`DeploymentId` can be any string. It can for example include the image ID. +Workers check it between tasks (if at least `checkForNewDeploymentEverySecs` seconds have passed since the last check). +If it has changed, they shut down in order to leave room for new workers with the new definition. + +The [EC2 Resources] page has a red *Terminate All Instances* button, +but that will make any running task fail. + +[AWS command-line tool]: https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html +[worker type definition]: https://tools.taskcluster.net/aws-provisioner/servo-win2016/edit +[EC2 Resources]: https://tools.taskcluster.net/aws-provisioner/servo-win2016/resources + + +## FIXME: possible improvements + +* Have a separate staging worker type to try new AMIs without affecting the production CI +* Automate cleaning up old, unused AMIs +* Use multiple AWS regions +* Use the Taskcluster API to automate updating worker type definitions? + + +## Picking a base AMI + +Amazon provides an overwhelming number of different Windows images, +so it’s hard to find what’s relevant. 
+Their console might show a paginated view like this: + +> ⇤ ← 1 to 50 of 13,914 AMIs → ⇥ + +Let’s grep through this with the API: + +```sh +aws ec2 describe-images --owners amazon --filters 'Name=platform,Values=windows' \ + --query 'Images[*].[ImageId,Name,Description]' --output table > /tmp/images +< /tmp/images less -S +``` + +It turns out that these images are all based on Windows Server, +but their number is explained by the presence of many (all?) combinations of: + +* Multiple OS versions +* Many available locales +* *Full* (a.k.a. *with Desktop Experience*), or *Core* +* *Base* with only the OS, or multiple flavors with tools like SQL Server pre-installed + +If we make some choices and filter the list: + +```sh +< /tmp/images grep 2016-English-Full-Base | less -S +``` + +… we get a much more manageable handful of images with names like +`Windows_Server-2016-English-Full-Base-2018.09.15` or other dates. + +Let’s set `base-ami.txt` to `Windows_Server-2016-English-Full-Base-*`, +and have `build-ami.py` pick the most recently-created AMI whose name matches that pattern. \ No newline at end of file diff --git a/etc/taskcluster/windows/base-ami.txt b/etc/taskcluster/windows/base-ami.txt new file mode 100644 index 00000000000..0ed695728fa --- /dev/null +++ b/etc/taskcluster/windows/base-ami.txt @@ -0,0 +1 @@ +Windows_Server-2016-English-Full-Base-* \ No newline at end of file diff --git a/etc/taskcluster/windows/build-ami.py b/etc/taskcluster/windows/build-ami.py new file mode 100755 index 00000000000..2c7da96793f --- /dev/null +++ b/etc/taskcluster/windows/build-ami.py @@ -0,0 +1,116 @@ +#!/usr/bin/python3 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
import os
import json
import datetime
import subprocess


REGION = "us-west-2"
WORKER_TYPE = "servo-win2016"
AWS_PROVISIONER_USER_ID = "692406183521"


def main():
    """Build a new Windows AMI for the ``WORKER_TYPE`` worker type.

    Steps, all performed through the ``aws`` command-line tool:

    1. Pick the most recent Amazon AMI matching the pattern in ``base-ami.txt``.
    2. (Re)create an EC2 key pair and save its private key next to this script.
    3. Boot a temporary instance that runs ``first-boot.ps1`` and then shuts
       itself down.
    4. Print the administrator password, snapshot the stopped instance into a
       new AMI and grant the AWS provisioner permission to launch it.
    5. Terminate the temporary instance.
    """
    base_ami_pattern = read_file("base-ami.txt").decode("utf-8").strip()
    base_ami = most_recent_ami(base_ami_pattern)
    print("Starting an instance with base image:", base_ami["ImageId"], base_ami["Name"])

    key_name = "%s_%s" % (WORKER_TYPE, REGION)
    key_filename = key_name + ".id_rsa"
    _recreate_key_pair(key_name, key_filename)

    instance_id = _launch_base_instance(base_ami["ImageId"], key_name)

    print("Waiting for password data to be available…")
    ec2_wait("password-data-available", "--instance-id", instance_id)
    result = ec2("get-password-data", "--instance-id", instance_id,
                 "--priv-launch-key", here(key_filename))
    print("Administrator password:", result["PasswordData"])

    print("Waiting for the instance to finish executing first-boot.ps1 and shut down…")
    ec2_wait("instance-stopped", "--instance-id", instance_id)

    _create_image(instance_id)

    print("Image available. Terminating the temporary instance…")
    ec2("terminate-instances", "--instance-ids", instance_id)


def _recreate_key_pair(key_name, key_filename):
    """Replace the EC2 key pair `key_name` and store its private key locally."""
    ec2("delete-key-pair", "--key-name", key_name)
    result = ec2("create-key-pair", "--key-name", key_name)
    # The private key decrypts the administrator password:
    # keep it readable by the current user only.
    write_file(key_filename, result["KeyMaterial"].encode("utf-8"), mode=0o600)


def _launch_base_instance(image_id, key_name):
    """Start the temporary instance that runs first-boot.ps1; return its ID."""
    script = read_file("first-boot.ps1").decode("utf-8")
    # EC2 only executes user data as a PowerShell script on Windows
    # when it is wrapped in <powershell> tags.
    user_data = "<powershell>\n%s\n</powershell>" % script
    result = ec2(
        "run-instances", "--image-id", image_id,
        "--key-name", key_name,
        "--user-data", user_data,
        "--instance-type", "c4.xlarge",
        "--block-device-mappings",
        "DeviceName=/dev/sda1,Ebs={VolumeSize=75,DeleteOnTermination=true,VolumeType=gp2}",
        # first-boot.ps1 ends with a shutdown: the instance must stop
        # (not terminate) so that an image can be taken of it afterwards.
        "--instance-initiated-shutdown-behavior", "stop"
    )
    instances = result["Instances"]
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(instances) != 1:
        raise RuntimeError("Expected exactly one instance, got %d" % len(instances))
    instance_id = instances[0]["InstanceId"]

    ec2("create-tags", "--resources", instance_id, "--tags",
        "Key=Name,Value=TC %s base instance" % WORKER_TYPE)
    return instance_id


def _create_image(instance_id):
    """Create an AMI from the stopped instance and share it with the provisioner."""
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d_%H.%M.%S")
    image_id = ec2("create-image", "--instance-id", instance_id,
                   "--name", "TC %s %s" % (WORKER_TYPE, now))["ImageId"]
    print("Started creating image with ID %s …" % image_id)

    ec2_wait("image-available", "--image-ids", image_id)
    ec2("modify-image-attribute", "--image-id", image_id,
        "--launch-permission", "Add=[{UserId=%s}]" % AWS_PROVISIONER_USER_ID)
    return image_id


def most_recent_ami(name_pattern):
    """Return the description of the newest Amazon-owned Windows AMI
    whose name matches `name_pattern`.

    `name_pattern` may be `str` or UTF-8 encoded `bytes`.
    The result is one entry of the "Images" list of `aws ec2 describe-images`.
    Raises `ValueError` if no image matches.
    """
    if isinstance(name_pattern, bytes):
        name_pattern = name_pattern.decode("utf-8")
    result = ec2(
        "describe-images", "--owners", "amazon",
        "--filters", "Name=platform,Values=windows", "Name=name,Values=" + name_pattern,
    )
    images = result["Images"]
    if not images:
        raise ValueError("No Windows AMI owned by amazon matches %r" % name_pattern)
    return max(images, key=lambda image: image["CreationDate"])


def ec2_wait(*args):
    """Run `aws ec2 wait` with `args`, retrying for as long as the waiter times out.

    Any failure other than exit code 255 is re-raised as
    `subprocess.CalledProcessError`.
    """
    # https://docs.aws.amazon.com/cli/latest/reference/ec2/wait/password-data-available.html
    # “It will poll every 15 seconds until a successful state has been reached.
    # This will exit with a return code of 255 after 40 failed checks.”
    while True:
        try:
            return ec2("wait", *args)
        except subprocess.CalledProcessError as err:
            if err.returncode != 255:
                raise


def try_ec2(*args):
    """Like `ec2`, but return `None` instead of raising when the command fails."""
    try:
        return ec2(*args)
    except subprocess.CalledProcessError:
        return None


def ec2(*args):
    """Run `aws ec2` in `REGION` with `args` and return its parsed JSON output.

    Returns `None` when the command prints nothing.
    Raises `subprocess.CalledProcessError` on a non-zero exit code.
    """
    command = ["aws", "ec2", "--region", REGION, "--output", "json"] + list(args)
    output = subprocess.check_output(command)
    if not output:
        return None
    return json.loads(output)


def read_file(filename):
    """Return the bytes of `filename`, resolved relative to this script's directory."""
    with open(here(filename), "rb") as f:
        return f.read()


def write_file(filename, contents, mode=None):
    """Write the bytes `contents` to `filename`, relative to this script's directory.

    If `mode` is given (for example `0o600`), the file is created with those
    permission bits, and an already existing file is switched to them before
    anything is written to it.
    """
    path = here(filename)
    if mode is None:
        with open(path, "wb") as f:
            f.write(contents)
        return
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
    with os.fdopen(fd, "wb") as f:
        # `os.open` only applies `mode` (minus the umask) to newly created files.
        os.chmod(path, mode)
        f.write(contents)


def here(filename, base=os.path.dirname(__file__)):
    """Join `filename` onto `base`, which defaults to this script's directory."""
    return os.path.join(base, filename)


if __name__ == "__main__":
    main()
new-object -com shell.application + +# Download the zip archive at $url to $file, then copy every item +# it contains into $destination through the Shell.Application COM object +function Expand-ZIPFile($file, $destination, $url) +{ + $client.DownloadFile($url, $file) + $zip = $shell.NameSpace($file) + foreach($item in $zip.items()) + { + $shell.Namespace($destination).copyhere($item) + } +} + +# Open up firewall for livelog (both PUT and GET interfaces) +New-NetFirewallRule -DisplayName "Allow livelog PUT requests" ` + -Direction Inbound -LocalPort 60022 -Protocol TCP -Action Allow +New-NetFirewallRule -DisplayName "Allow livelog GET requests" ` + -Direction Inbound -LocalPort 60023 -Protocol TCP -Action Allow + +# Install generic-worker and dependencies (livelog, and nssm to register the service) +# NOTE(review): nssm is downloaded over plain HTTP without any checksum verification; confirm whether that is acceptable +md C:\generic-worker +$client.DownloadFile("https://github.com/taskcluster/generic-worker/releases/download" + + "/v10.11.3/generic-worker-windows-amd64.exe", "C:\generic-worker\generic-worker.exe") +$client.DownloadFile("https://github.com/taskcluster/livelog/releases/download" + + "/v1.1.0/livelog-windows-amd64.exe", "C:\generic-worker\livelog.exe") +Expand-ZIPFile -File "C:\nssm-2.24.zip" -Destination "C:\" ` + -Url "http://www.nssm.cc/release/nssm-2.24.zip" +Start-Process C:\generic-worker\generic-worker.exe -ArgumentList ( + "install service --nssm C:\nssm-2.24\win64\nssm.exe " + + "--config C:\generic-worker\generic-worker.config" + ) -Wait -NoNewWindow -PassThru ` + -RedirectStandardOutput C:\generic-worker\install.log ` + -RedirectStandardError C:\generic-worker\install.err + +# Now shutdown, in preparation for creating an image +shutdown -s \ No newline at end of file