Move part of the Taskcluster configuration

… to https://github.com/servo/taskcluster-config
This commit is contained in:
Simon Sapin 2019-11-08 16:25:14 +01:00
parent b3faeb5483
commit a5ea470d57
17 changed files with 0 additions and 627 deletions

View file

@ -1 +0,0 @@
.salt

View file

@ -1,69 +0,0 @@
# macOS
This is the configuration for the `proj-servo/macos` worker type.
These macOS workers are configured with SaltStack in [agentless] mode.
[agentless]: https://docs.saltstack.com/en/getstarted/ssh/index.html
Either run `./salt-ssh`
to automatically install `salt-ssh` in `mach`'s existing Python virtualenv,
or install `salt-ssh` through some other means and run it from this directory.
```sh
cd etc/taskcluster/macos
./salt-ssh '*' test.ping
./salt-ssh '*' state.apply test=True
```
## Troubleshooting
SSH into `servo-tc-mac1.servo.org`.
`generic-worker` logs are in `/Users/worker/stderr.log` (view them with `less /Users/worker/stderr.log`).
If the worker seems stuck but nothing seems wrong in the log,
try running `launchctl stop net.generic.worker`.
(It is configured to restart automatically.)
This issue is tracked at
[generic-worker#133](https://github.com/taskcluster/generic-worker/issues/133).
## (Re)deploying a server
* Place an order or file a ticket with MacStadium to get new hardware or to have an OS reinstalled.
* Change the administrator password to one generated with
`</dev/urandom tr -d -c 'a-zA-Z' | head -c 8; echo`
(this is short because of VNC),
and save it in the shared 1Password account.
* Give the public IPv4 address a DNS name through Cloudflare.
* Add a corresponding entry in the `config/roster` file.
* Log in through VNC, and run `xcode-select --install`
* Still in VNC, install the jdk8 package from http://adoptopenjdk.net
* Install an ssh key into /Users/administrator/.ssh/authorized_keys and
/var/root/.ssh/authorized_keys.
## Taskcluster secrets
This SaltStack configuration has a custom module that uses Taskcluster's
[secrets service](https://tools.taskcluster.net/secrets/).
These secrets include an authentication token.
You'll need to authenticate with a Taskcluster client ID
that has scope `secrets:get:project/servo/*`.
This should be the case if you're a Servo project administrator (the `project-admin:servo` role).
To authenticate, install [taskcluster-cli](https://github.com/taskcluster/taskcluster-cli)
and run `eval \`taskcluster signin\``. This will set up the TASKCLUSTER_CLIENT_ID and
TASKCLUSTER_ACCESS_TOKEN variables to allow retrieving secrets appropriately in the current
terminal session.
## Worker's client ID
Workers are configured to authenticate with client ID
[`project/servo/worker/macos/1`](
https://tools.taskcluster.net/auth/clients/project%2Fservo%2Fworker%2Fmacos%2F1).
This client has the scopes required to run tasks for this worker type.

View file

@ -1,3 +0,0 @@
# Options for the `salt-ssh` command:
# configuration is read from ./config and per-state output is not verbose.
salt-ssh:
config_dir: ./config
state_verbose: False

View file

@ -1,14 +0,0 @@
# Salt configuration used by salt-ssh for the macOS workers.
# Working files are kept under .salt
root_dir: .salt
# State files for the `base` environment are looked up in the `states` directory.
file_roots:
base:
- states
# Custom modules are loaded from ../modules
extension_modules: ../modules
# Pillar data is provided by two external pillars, `taskcluster_secrets` and `ssh_keys`
# (presumably the custom `ext_pillar` modules found under `extension_modules` -- confirm).
ext_pillar:
- taskcluster_secrets:
- ssh_keys:
# Defaults applied to every roster entry: force the macOS user/group providers.
roster_defaults:
# https://github.com/saltstack/salt/issues/50477
minion_opts:
providers:
user: mac_user
group: mac_group

View file

@ -1,16 +0,0 @@
# Roster of macOS workers: one entry per machine, keyed by the ID that is
# also used as the Taskcluster `workerId` in the generic-worker state.
# Each ID must map to its own host name.
mac1:
  host: servo-tc-mac1.servo.org
mac2:
  host: servo-tc-mac2.servo.org
mac3:
  host: servo-tc-mac3.servo.org
mac4:
  host: servo-tc-mac4.servo.org
mac5:
  host: servo-tc-mac5.servo.org
mac6:
  host: servo-tc-mac6.servo.org
mac7:
  host: servo-tc-mac7.servo.org
mac8:
  host: servo-tc-mac8.servo.org

View file

@ -1,14 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
import urllib
def ext_pillar(_minion_id, _pillar, *_args):
    """Provide the administrators' public SSH keys as pillar data.

    Each key is downloaded from the `servo/saltfs` repository on GitHub.

    Returns:
        A dict with a single `"ssh_keys"` entry: the list of downloaded
        public keys, in the order of the names below.

    Raises:
        IOError: if a key cannot be downloaded with an HTTP 200 status.
            Without this check, the body of an error page (for example a 404)
            would be returned as if it were a key.
    """
    url = "https://raw.githubusercontent.com/servo/saltfs/master/admin/files/ssh/%s.pub"
    names = [
        "jdm",
        "manishearth",
        "simonsapin",
    ]
    keys = []
    for name in names:
        key_url = url % name
        response = urllib.urlopen(key_url)
        try:
            status = response.getcode()
            if status != 200:
                raise IOError("Fetching %s failed with HTTP status %s" % (key_url, status))
            keys.append(response.read())
        finally:
            # Always release the connection, including on error.
            response.close()
    return {"ssh_keys": keys}

View file

@ -1,13 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "packet.net"))
import tc
def ext_pillar(_minion_id, _pillar, *_args):
    """Provide the macOS worker's Taskcluster secret as pillar data.

    Runs `tc.check()` first, then returns whatever `tc.secret` yields for
    the worker's client secret. The arguments are ignored.
    """
    secret_name = "project/servo/tc-client/worker/macos/1"
    tc.check()
    pillar_data = tc.secret(secret_name)
    return pillar_data

View file

@ -1,14 +0,0 @@
#!/usr/bin/env bash

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

# Run `salt-ssh` from the Python virtualenv under ../../../python/_virtualenv,
# installing it there with pip first if it is not present yet.
# All command-line arguments are passed through to `salt-ssh` unchanged.

# `pipefail` is not a POSIX `sh` option (dash rejects it),
# hence the bash shebang rather than /bin/sh.
set -o errexit
set -o nounset
set -o pipefail

# The relative paths below are resolved from this script's own directory.
# Quote "$0" so that a path containing spaces still works.
cd "$(dirname "$0")"

VENV_BIN="../../../python/_virtualenv/bin"
[ -x "${VENV_BIN}/salt-ssh" ] || "${VENV_BIN}/pip" install salt-ssh
"${VENV_BIN}/salt-ssh" "${@}"

View file

@ -1,18 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!--
  Property list for the net.generic.worker job.
  This is a Jinja template: the home, username, bin and etc placeholders
  are filled in from the context given by the Salt state that installs it.
-->
<plist version="1.0">
<dict>
<key>Label</key> <string>net.generic.worker</string>
<!-- KeepAlive: the job is restarted automatically after it stops. -->
<key>KeepAlive</key> <true/>
<!-- Relative log paths; the working directory is the worker user's home. -->
<key>StandardOutPath</key> <string>stdout.log</string>
<key>StandardErrorPath</key> <string>stderr.log</string>
<key>WorkingDirectory</key> <string>{{ home }}</string>
<key>UserName</key> <string>{{ username }}</string>
<!-- Command line: generic-worker run, with the config file given by the config option. -->
<key>ProgramArguments</key> <array>
<string>{{ bin }}/generic-worker</string>
<string>run</string>
<string>--config</string>
<string>{{ etc }}/config.json</string>
</array>
</dict>
</plist>

View file

@ -1,108 +0,0 @@
# Salt state for a macOS generic-worker machine.
# Template variables shared by all states in this file:
# install directory for binaries, configuration directory,
# and the account that runs the worker together with its home directory.
{% set bin = "/usr/local/bin" %}
{% set etc = "/etc/generic-worker" %}
{% set user = "worker" %}
{% set home = "/Users/" + user %}
# Set the system timezone to GMT.
GMT:
timezone.system
# Authorize every public key from the `ssh_keys` pillar for the root user.
sshkeys:
ssh_auth.present:
- user: root
- names:
{% for key in pillar["ssh_keys"] %}
- {{ key | yaml_encode }}
{% endfor %}
# Install the pinned generic-worker release, verified against its SHA-256 hash.
# The destination path is the state ID itself, as for the livelog and
# taskcluster-proxy states; no explicit (empty) `name` argument is needed.
# A change to the binary is watched by the net.generic.worker service state.
{{ bin }}/generic-worker:
  file.managed:
    - source: https://github.com/taskcluster/generic-worker/releases/download/v14.1.1/generic-worker-nativeEngine-darwin-amd64
    - source_hash: sha256=817e72972a7c077f1a829d5824e5c0e831eb6f9b254672e7427246a8dd476a59
    - mode: 755
    - makedirs: True
    - watch_in:
      - service: net.generic.worker
# Install the pinned livelog release, verified against its SHA-256 hash.
{{ bin }}/livelog:
file.managed:
- source: https://github.com/taskcluster/livelog/releases/download/v1.1.0/livelog-darwin-amd64
- source_hash: sha256=be5d4b998b208afd802ac6ce6c4d4bbf0fb3816bb039a300626abbc999dfe163
- mode: 755
- makedirs: True
- watch_in:
- service: net.generic.worker
# Install the pinned taskcluster-proxy release, verified against its SHA-256 hash.
{{ bin }}/taskcluster-proxy:
file.managed:
- source: https://github.com/taskcluster/taskcluster-proxy/releases/download/v5.1.0/taskcluster-proxy-darwin-amd64
- source_hash: sha256=3faf524b9c6b9611339510797bf1013d4274e9f03e7c4bd47e9ab5ec8813d3ae
- mode: 755
- makedirs: True
- watch_in:
- service: net.generic.worker
# Dedicated group and user account for running the worker.
{{ user }} group:
group.present:
- name: {{ user }}
{{ user }}:
user.present:
- home: {{ home }}
- gid_from_name: True
# `user.present`'s `createhome` is apparently not supported on macOS,
# so the home directory is created explicitly here.
{{ home }}:
file.directory:
- user: {{ user }}
# generic-worker configuration, serialized as JSON.
# It contains the Taskcluster access token taken from pillar data;
# NOTE(review): `show_changes: False` presumably keeps that token out of the state output -- confirm.
{{ etc }}/config.json:
file.serialize:
- makedirs: True
- group: {{ user }}
- mode: 640
- show_changes: False
- formatter: json
- dataset:
provisionerId: proj-servo
workerType: macos
workerGroup: servo-macos
# The worker ID is the minion ID, i.e. the key of the roster entry.
workerId: {{ grains["id"] }}
tasksDir: {{ home }}/tasks
# First address reported by `network.ip_addrs`.
publicIP: {{ salt.network.ip_addrs()[0] }}
ed25519SigningKeyLocation: {{ home }}/keypair
clientId: {{ pillar["client_id"] }}
accessToken: {{ pillar["access_token"] }}
taskclusterProxyExecutable: {{ bin }}/taskcluster-proxy
taskclusterProxyPort: 8080
livelogExecutable: {{ bin }}/livelog
wstAudience: taskcluster-net
wstServerURL: https://websocktunnel.tasks.build
rootURL: https://taskcluster.net
- watch_in:
- service: net.generic.worker
# Generate the signing key pair as the worker user.
# `creates` names the file this command produces (the key pair file).
{{ bin }}/generic-worker new-ed25519-keypair --file {{ home }}/keypair:
cmd.run:
- creates: {{ home }}/keypair
- runas: {{ user }}
# Make sure no plist of this name exists under LaunchAgents;
# the job is installed under LaunchDaemons below.
/Library/LaunchAgents/net.generic.worker.plist:
file.absent: []
# Render the plist template with the variables defined at the top of this file,
# install it under LaunchDaemons, and keep the service enabled and running.
# The service state watches the plist file.
net.generic.worker:
file.managed:
- name: /Library/LaunchDaemons/net.generic.worker.plist
- mode: 600
- user: root
- template: jinja
- source: salt://generic-worker.plist.jinja
- context:
bin: {{ bin }}
etc: {{ etc }}
home: {{ home }}
username: {{ user }}
service.running:
- enable: True
- watch:
- file: /Library/LaunchDaemons/net.generic.worker.plist

View file

@ -1,3 +0,0 @@
# In the `base` environment, targets matching 'mac*' get the `generic-worker` state.
base:
'mac*':
- generic-worker

View file

@ -1 +0,0 @@
*.id_rsa

View file

@ -1,92 +0,0 @@
# Windows workers for Servo on Taskcluster
The `servo-win2016` worker type runs short-lived Windows 2016 workers on EC2.
## AMIs
Unlike Linux tasks on `docker-worker` where each task is executed in a container
based on a Docker image provided with the task,
Windows tasks on Taskcluster are typically run by `generic-worker`
where tasks are executed directly in the worker's environment.
So we may want to install some tools globally on the system, to make them available to tasks.
With the [AWS provisioner], this means building a custom AMI.
We need to boot an instance on a base Windows AMI,
install what we need (including `generic-worker` itself),
then take an image of that instance.
The [`worker_types`] directory in `generic-worker`'s repository
has some scripts that automate this,
in order to make it more reproducible than clicking around.
The trick is that a PowerShell script to run on boot can be provided
when starting a Windows instance on EC2, and of course AWS has an API.
[AWS provisioner]: https://docs.taskcluster.net/docs/reference/integrations/aws-provisioner/references/api
[`worker_types`]: https://github.com/taskcluster/generic-worker/blob/master/worker_types/
## Building and deploying a new image
* Install and configure the [AWS command-line tool].
* Make your changes to `first-boot.ps1` and/or `base-ami.txt`.
* Run `python3 build-ami.py`. Note that it can take many minutes to complete.
* Save the administrator password together with the image ID
in Servo's shared 1Password account, in the *Taskcluster Windows AMIs* note.
* In the [worker type definition], edit `ImageId` and `DeploymentId`.
Note that the new worker type definition will only apply to newly-provisioned workers.
`DeploymentId` can be any string. It can for example include the image ID.
Workers check it between tasks (if `checkForNewDeploymentEverySecs` seconds have passed since the last check).
If it has changed, they shut down in order to leave room for new workers with the new definition.
The [EC2 Resources] page has a red *Terminate All Instances* button,
but that will make any running task fail.
[AWS command-line tool]: https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html
[worker type definition]: https://tools.taskcluster.net/aws-provisioner/servo-win2016/edit
[EC2 Resources]: https://tools.taskcluster.net/aws-provisioner/servo-win2016/resources
## FIXME: possible improvements
* Have a separate staging worker type to try new AMIs without affecting the production CI
* Automate cleaning up old, unused AMIs and their backing EBS snapshots
* Use multiple AWS regions
* Use the Taskcluster API to automate updating worker type definitions?
## Picking a base AMI
Amazon provides an overwhelming number of different Windows images,
so it's hard to find what's relevant.
Their console might show a paginated view like this:
> ⇤ ← 1 to 50 of 13,914 AMIs → ⇥
Let's grep through this with the API:
```sh
aws ec2 describe-images --owners amazon --filters 'Name=platform,Values=windows' \
--query 'Images[*].[ImageId,Name,Description]' --output table > /tmp/images
< /tmp/images less -S
```
It turns out that these images are all based on Windows Server,
but their number is explained by the presence of many (all?) combinations of:
* Multiple OS versions
* Many available locales
* *Full* (a.k.a. *with Desktop Experience*), or *Core*
* *Base* with only the OS, or multiple flavors with tools like SQL Server pre-installed
If we make some choices and filter the list:
```sh
< /tmp/images grep 2016-English-Full-Base | less -S
```
… we get a much more manageable handful of images with names like
`Windows_Server-2016-English-Full-Base-2018.09.15` or other dates.
Let's set `base-ami.txt` to `Windows_Server-2016-English-Full-Base-*`,
and have `build-ami.py` pick the most recently-created AMI whose name matches that pattern.

View file

@ -1 +0,0 @@
Windows_Server-2016-English-Full-Base-*

View file

@ -1,54 +0,0 @@
# Use this script to get a build environment
# when booting a Windows EC2 instance outside of Taskcluster.
# Append the tool directories installed below to the machine-wide Path and Lib variables.
[Environment]::SetEnvironmentVariable("Path", $env:Path +
";C:\git\cmd;C:\python2;C:\python2\Scripts;C:\Users\Administrator\.cargo\bin",
[EnvironmentVariableTarget]::Machine)
[Environment]::SetEnvironmentVariable("Lib", $env:Lib +
";C:\gstreamer\1.0\x86_64\lib",
[EnvironmentVariableTarget]::Machine)
# use TLS 1.2 (see bug 1443595)
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
# For making http requests
$client = New-Object system.net.WebClient
# Shell COM object, used by Expand-ZIPFile to read and copy zip contents
$shell = new-object -com shell.application
# Download a zip file and extract it
#   $url:         where to download the archive from
#   $file:        local path the archive is saved to
#   $destination: existing directory each top-level item of the archive is copied into
function Expand-ZIPFile($file, $destination, $url)
{
$client.DownloadFile($url, $file)
$zip = $shell.NameSpace($file)
foreach($item in $zip.items())
{
$shell.Namespace($destination).copyhere($item)
}
}
# Optional
$client.DownloadFile(
"https://download.tuxfamily.org/dvorak/windows/bepo.exe",
"C:\bepo.exe"
)
# Git (MinGit), extracted to C:\git
md C:\git
Expand-ZIPFile -File "C:\git.zip" -Destination "C:\git" -Url `
"https://github.com/git-for-windows/git/releases/download/v2.19.0.windows.1/MinGit-2.19.0-64-bit.zip"
# rustup, installed non-interactively (-y) without a default toolchain
$client.DownloadFile(
"https://static.rust-lang.org/rustup/archive/1.13.0/i686-pc-windows-gnu/rustup-init.exe",
"C:\rustup-init.exe"
)
Start-Process C:\rustup-init.exe -Wait -NoNewWindow -ArgumentList `
"--default-toolchain none -y"
# Repacked archives published as Taskcluster task artifacts:
# the first is extracted to C:\python2, the second to C:\
md C:\python2
Expand-ZIPFile -File "C:\python2.zip" -Destination "C:\python2" -Url `
"https://queue.taskcluster.net/v1/task/RIuts6jOQtCSjMbuaOU6yw/runs/0/artifacts/public/repacked.zip"
Expand-ZIPFile -File "C:\gst.zip" -Destination "C:\" -Url `
"https://queue.taskcluster.net/v1/task/KAzPF1ZYSFmg2BQKLt0LwA/runs/0/artifacts/public/repacked.zip"

View file

@ -1,116 +0,0 @@
#!/usr/bin/python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
import os
import json
import datetime
import subprocess
# AWS region passed to every `aws ec2` invocation (see `ec2()`).
REGION = "us-west-2"
# Worker type name; used to name the key pair, the temporary instance and the image.
WORKER_TYPE = "servo-win2016"
# AWS user ID that is granted launch permission on the newly built image.
AWS_PROVISIONER_USER_ID = "692406183521"
def main():
    """Build a new Windows AMI for `WORKER_TYPE` and share it with the provisioner.

    Steps:

    1. Pick the most recent Amazon-owned Windows AMI matching `base-ami.txt`.
    2. (Re)create an EC2 key pair and save its private key next to this script.
    3. Start a temporary instance that runs `first-boot.ps1` as user data.
    4. Print the administrator password once it is available.
    5. Wait for the instance to stop, then create an image from it.
    6. Grant launch permission to `AWS_PROVISIONER_USER_ID`
       and terminate the temporary instance.
    """
    base_ami_pattern = read_file("base-ami.txt").strip()
    base_ami = most_recent_ami(base_ami_pattern)
    print("Starting an instance with base image:", base_ami["ImageId"], base_ami["Name"])

    key_name = "%s_%s" % (WORKER_TYPE, REGION)
    key_filename = key_name + ".id_rsa"
    ec2("delete-key-pair", "--key-name", key_name)
    result = ec2("create-key-pair", "--key-name", key_name)
    # The private key decrypts the administrator password:
    # create the file readable and writable by its owner only,
    # instead of relying on the process umask.
    key_path = here(key_filename)
    key_fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(key_fd, "wb") as key_file:
        key_file.write(result["KeyMaterial"].encode("utf-8"))
    # The mode given to `os.open` only applies to newly created files;
    # also tighten a file left over from a previous run.
    os.chmod(key_path, 0o600)

    user_data = b"<powershell>\n%s\n</powershell>" % read_file("first-boot.ps1")
    result = ec2(
        "run-instances", "--image-id", base_ami["ImageId"],
        "--key-name", key_name,
        "--user-data", user_data,
        "--instance-type", "c4.xlarge",
        "--block-device-mappings",
        "DeviceName=/dev/sda1,Ebs={VolumeSize=75,DeleteOnTermination=true,VolumeType=gp2}",
        "--instance-initiated-shutdown-behavior", "stop"
    )
    assert len(result["Instances"]) == 1
    instance_id = result["Instances"][0]["InstanceId"]

    ec2("create-tags", "--resources", instance_id, "--tags",
        "Key=Name,Value=TC %s base instance" % WORKER_TYPE)

    print("Waiting for password data to be available…")
    ec2_wait("password-data-available", "--instance-id", instance_id)
    result = ec2("get-password-data", "--instance-id", instance_id,
                 "--priv-launch-key", here(key_filename))
    print("Administrator password:", result["PasswordData"])

    print("Waiting for the instance to finish executing first-boot.ps1 and shut down…")
    ec2_wait("instance-stopped", "--instance-id", instance_id)

    now = datetime.datetime.utcnow().strftime("%Y-%m-%d_%H.%M.%S")
    image_id = ec2("create-image", "--instance-id", instance_id,
                   "--name", "TC %s %s" % (WORKER_TYPE, now))["ImageId"]
    print("Started creating image with ID %s" % image_id)

    ec2_wait("image-available", "--image-ids", image_id)
    ec2("modify-image-attribute", "--image-id", image_id,
        "--launch-permission", "Add=[{UserId=%s}]" % AWS_PROVISIONER_USER_ID)

    print("Image available. Terminating the temporary instance…")
    ec2("terminate-instances", "--instance-ids", instance_id)
def most_recent_ami(name_pattern):
    """Return the most recently created Amazon-owned Windows AMI matching a name.

    Args:
        name_pattern: AMI name filter value as bytes (it is concatenated to a
            bytes prefix), e.g. the stripped contents of `base-ami.txt`.

    Returns:
        The image description (a dict from the `describe-images` JSON output)
        with the greatest `CreationDate`.

    Raises:
        ValueError: if no image matches, with a message naming the pattern
            rather than the opaque error `max()` gives for an empty sequence.
    """
    result = ec2(
        "describe-images", "--owners", "amazon",
        "--filters", "Name=platform,Values=windows", b"Name=name,Values=" + name_pattern,
    )
    # `ec2()` returns None when the command prints nothing.
    images = (result or {}).get("Images") or []
    if not images:
        raise ValueError(
            "No Amazon-owned Windows AMI matches the name pattern %r" % (name_pattern,)
        )
    return max(images, key=lambda image: image["CreationDate"])
def ec2_wait(*args):
    """Run `aws ec2 wait` with `args`, retrying for as long as it times out.

    Returns whatever `ec2` returns once the wait succeeds.
    A `CalledProcessError` with any return code other than 255 is re-raised.
    """
    # https://docs.aws.amazon.com/cli/latest/reference/ec2/wait/password-data-available.html
    # “It will poll every 15 seconds until a successful state has been reached.
    # This will exit with a return code of 255 after 40 failed checks.”
    outcome = None
    finished = False
    while not finished:
        try:
            outcome = ec2("wait", *args)
            finished = True
        except subprocess.CalledProcessError as error:
            timed_out = error.returncode == 255
            if not timed_out:
                raise
    return outcome
def try_ec2(*args):
    """Like `ec2`, but return None instead of raising when the command fails."""
    outcome = None
    try:
        outcome = ec2(*args)
    except subprocess.CalledProcessError:
        pass
    return outcome
def ec2(*args):
    """Run an `aws ec2` subcommand in `REGION` and return its parsed JSON output.

    Returns None when the command prints nothing.
    `subprocess.check_output` raises `CalledProcessError` on a non-zero exit status.
    """
    command = ["aws", "ec2", "--region", REGION, "--output", "json"]
    command.extend(args)
    raw = subprocess.check_output(command)
    return json.loads(raw) if raw else None
def read_file(filename):
    """Return the contents, as bytes, of `filename` resolved with `here`."""
    path = here(filename)
    with open(path, "rb") as stream:
        contents = stream.read()
    return contents
def write_file(filename, contents):
    """Write the bytes `contents` to `filename` resolved with `here`, replacing the file."""
    path = here(filename)
    with open(path, "wb") as stream:
        stream.write(contents)
def here(filename, base=os.path.dirname(__file__)):
    """Join `filename` onto `base`, which defaults to this script's directory."""
    location = os.path.join(base, filename)
    return location
if __name__ == "__main__":
main()

View file

@ -1,90 +0,0 @@
# Script run on the first boot of the temporary instance that the worker image is made from.
# It installs generic-worker and build tools, then shuts the machine down.
# Record a transcript of this session and the initial environment to files on C:\
Start-Transcript -Path "C:\first_boot.txt"
Get-ChildItem Env: | Out-File "C:\install_env.txt"
# DisableIndexing: Disable indexing on all disk volumes (for performance)
Get-WmiObject Win32_Volume -Filter "IndexingEnabled=$true" | Set-WmiInstance -Arguments @{IndexingEnabled=$false}
# Disable Windows Defender
# https://docs.microsoft.com/en-us/windows/security/threat-protection/windows-defender-antivirus/windows-defender-antivirus-on-windows-server-2016#install-or-uninstall-windows-defender-av-on-windows-server-2016
Uninstall-WindowsFeature -Name Windows-Defender
# use TLS 1.2 (see bug 1443595)
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
# For making http requests
$client = New-Object system.net.WebClient
# Shell COM object, used by Expand-ZIPFile to read and copy zip contents
$shell = new-object -com shell.application
# Download a zip file and extract it
#   $url:         where to download the archive from
#   $file:        local path the archive is saved to
#   $destination: existing directory each top-level item of the archive is copied into
function Expand-ZIPFile($file, $destination, $url)
{
$client.DownloadFile($url, $file)
$zip = $shell.NameSpace($file)
foreach($item in $zip.items())
{
$shell.Namespace($destination).copyhere($item)
}
}
# Open up firewall for livelog (both PUT and GET interfaces)
New-NetFirewallRule -DisplayName "Allow livelog PUT requests" `
-Direction Inbound -LocalPort 60022 -Protocol TCP -Action Allow
New-NetFirewallRule -DisplayName "Allow livelog GET requests" `
-Direction Inbound -LocalPort 60023 -Protocol TCP -Action Allow
# Install generic-worker and dependencies
md C:\generic-worker
$client.DownloadFile("https://github.com/taskcluster/generic-worker/releases/download" +
"/v14.1.0/generic-worker-nativeEngine-windows-amd64.exe", "C:\generic-worker\generic-worker.exe")
$client.DownloadFile("https://github.com/taskcluster/livelog/releases/download" +
"/v1.1.0/livelog-windows-amd64.exe", "C:\generic-worker\livelog.exe")
$client.DownloadFile("https://github.com/taskcluster/taskcluster-proxy/releases/download" +
"/v5.1.0/taskcluster-proxy-windows-amd64.exe", "C:\generic-worker\taskcluster-proxy.exe")
# NSSM is passed to the generic-worker service installation below
Expand-ZIPFile -File "C:\nssm-2.24.zip" -Destination "C:\" `
-Url "https://www.nssm.cc/release/nssm-2.24.zip"
# Generate the worker's ed25519 signing key; output is logged under C:\generic-worker
Start-Process C:\generic-worker\generic-worker.exe -ArgumentList `
"new-ed25519-keypair --file C:\generic-worker\generic-worker-ed25519-signing-key.key" `
-Wait -NoNewWindow -PassThru `
-RedirectStandardOutput C:\generic-worker\generate-ed25519-signing-key.log `
-RedirectStandardError C:\generic-worker\generate-ed25519-signing-key.err
# Install generic-worker as a service through NSSM, configured for AWS;
# output is logged under C:\generic-worker
Start-Process C:\generic-worker\generic-worker.exe -ArgumentList (
"install service --nssm C:\nssm-2.24\win64\nssm.exe " +
"--configure-for-aws " +
"--config C:\generic-worker\generic-worker.config"
) -Wait -NoNewWindow -PassThru `
-RedirectStandardOutput C:\generic-worker\install.log `
-RedirectStandardError C:\generic-worker\install.err
# # For debugging, let us know the worker's IP address through:
# # ssh servo-master.servo.org tail -f /var/log/nginx/access.log | grep ping
# Start-Process C:\nssm-2.24\win64\nssm.exe -ArgumentList `
# "install", "servo-ping", "powershell", "-Command", @"
# (New-Object system.net.WebClient).DownloadData(
# 'http://servo-master.servo.org/ping/generic-worker')
# "@
# # This "service" isn't a long-running service: it runs once on boot and then terminates.
# Start-Process C:\nssm-2.24\win64\nssm.exe -ArgumentList `
# "set", "servo-ping", "AppExit", "Default", "Exit"
# Dependency Walker, extracted to C:\
Expand-ZIPFile -File "C:\depends22_x86.zip" -Destination "C:\" `
-Url "http://www.dependencywalker.com/depends22_x86.zip"
# Visual C++ Build Tools
# https://blogs.msdn.microsoft.com/vcblog/2016/11/16/introducing-the-visual-studio-build-tools/
$client.DownloadFile("https://aka.ms/vs/15/release/vs_buildtools.exe", "C:\vs_buildtools.exe")
Start-Process C:\vs_buildtools.exe -ArgumentList (`
"--passive --norestart --includeRecommended " +
"--add Microsoft.VisualStudio.Workload.VCTools " +
"--add Microsoft.VisualStudio.Workload.UniversalBuildTools " +
"--add Microsoft.VisualStudio.Component.VC.Tools.ARM64 " +
"--add Microsoft.VisualStudio.Component.VC.ATL " +
"--add Microsoft.VisualStudio.Component.VC.ATL.ARM64 " +
"--add Microsoft.VisualStudio.Component.VC.ATLMFC " +
"--add Microsoft.VisualStudio.Component.VC.MFC.ARM64"
) -Wait
# Now shutdown, in preparation for creating an image
shutdown -s