AI 說Terraform 不適用於這個場景,使用Lambda 更合適。好吧,您也可以使用一台EC2 或者在本地部署它,這樣連Lambda 的錢也不用付。
畢竟我們的目的是省錢。
添加一個Lambda function ,名字叫RotateSpotInstance,修改Timeout 為 15min,因為這個過程可能需要比較長的時間,特別是在製作AMI 的部分。
為自動生成的IAM role
RotateSpotInstance-role-mhe3v2sg 添加下面的權限,為什麼是這些?您可以看一下下面需要完成的幾步工作 –
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:DescribeSpotFleetInstances",
"ec2:DescribeInstances",
"ec2:CreateImage",
"ec2:DescribeImages",
"ec2:CreateLaunchTemplateVersion",
"ec2:ModifyLaunchTemplate",
"ec2:DescribeLaunchTemplates",
"ec2:ModifySpotFleetRequest",
"ec2:TerminateInstances",
"ec2:CreateTags",
"ec2:DescribeInstanceStatus"
],
"Resource": "*"
}
]
}
在Lambda 的Configuration → Environment variables 中按照實際情況設定下面兩個環境變數(程式碼會透過 os.environ 讀取),例如:
LAUNCH_TEMPLATE_ID = lt-0716c882cb57a921d
SPOT_FLEET_ID = sfr-c7ccc145-d71d-4268-8b67-089161e02af6
這個function 通過這樣幾步來完成這項工作 –
# 1. Get current instance – 從當前的spot fleet 中獲取運行中的instance id。
# 2. Create AMI – 從當前運行中的instance 創建一個AMI。
# 3. Wait for AMI – 檢測並且等待這個AMI 創建完成。
# 4. Update Launch Template – 將創建完成的AMI id 更新到模板。
# 5. Increase capacity – 修改spot fleet request 的Total target capacity 為2,這樣會自動起一台新的先。
# 6. Wait for new instance to be healthy – 等待新啟動的instance 進入running 狀態、確認它使用的是新的AMI,並且HTTP port 80 有回應。
# 7. Terminate old instance – 當新啟動的instance 通過健康檢查後,終止舊的instance。
# 8. Restore capacity – 立刻修改spot fleet request 的Total target capacity 為1,這樣就不會再起更多的instance。
import boto3
import os
import time
import urllib.request
import urllib.error
from datetime import datetime
def lambda_handler(event, context):
    """Rotate the running Spot Fleet instance onto a freshly created AMI.

    The fleet and launch template are taken from the ``SPOT_FLEET_ID`` and
    ``LAUNCH_TEMPLATE_ID`` environment variables. The rotation is:

    1. pick the first active instance of the fleet,
    2. create an AMI from it (without rebooting),
    3. wait for the AMI to become available,
    4. publish a new launch template version with that AMI, make it the
       default and verify the default really changed,
    5. raise the fleet target capacity to 2,
    6. wait until the new instance is healthy and uses the new AMI,
    7. terminate the old instance,
    8. set the target capacity back to 1.

    Args:
        event: Lambda invocation payload; not read.
        context: Lambda context object; not read.

    Returns:
        dict: ``statusCode`` 200 plus the AMI id, launch template version and
        old/new instance ids on success; ``statusCode`` 400 when the fleet has
        no active instances; ``statusCode`` 500 with the error text when any
        step fails.

    Raises:
        KeyError: if one of the two environment variables is not set (they are
            read before the error handler is entered).
    """
    SPOT_FLEET_ID = os.environ['SPOT_FLEET_ID']
    LAUNCH_TEMPLATE_ID = os.environ['LAUNCH_TEMPLATE_ID']
    ec2 = boto3.client('ec2')  # uses the region the Lambda runs in

    # Becomes True once this invocation has tried to raise the fleet capacity.
    # The error handler only restores the capacity in that case, so a failure
    # in steps 1-4 no longer modifies a fleet this run never touched.
    capacity_change_attempted = False

    try:
        print(f"Starting rotation for Spot Fleet: {SPOT_FLEET_ID}")

        # 1. Find the current instance.
        response = ec2.describe_spot_fleet_instances(
            SpotFleetRequestId=SPOT_FLEET_ID
        )
        if not response['ActiveInstances']:
            return {'statusCode': 400, 'error': 'No active instances'}
        instance_id = response['ActiveInstances'][0]['InstanceId']
        print(f"Current instance: {instance_id}")

        # 2. Create the AMI.
        ami_name = f"wordpress-{datetime.now().strftime('%Y%m%d-%H%M')}"
        ami_response = ec2.create_image(
            InstanceId=instance_id,
            Name=ami_name,
            NoReboot=True,
            TagSpecifications=[{
                'ResourceType': 'image',
                'Tags': [{'Key': 'auto-delete', 'Value': 'no'}]
            }]
        )
        ami_id = ami_response['ImageId']
        print(f"Creating AMI: {ami_id}")

        # 3. Wait for the AMI to become available.
        # NOTE(review): 40 attempts x 30 s allows up to 20 minutes, which is
        # longer than a 15-minute Lambda timeout; confirm the budget fits.
        print("Waiting for AMI to be available...")
        waiter = ec2.get_waiter('image_available')
        waiter.wait(
            ImageIds=[ami_id],
            WaiterConfig={'Delay': 30, 'MaxAttempts': 40}
        )
        print(f"AMI {ami_id} is available")

        # 4. Update the launch template.
        # 4a. Create a new version and capture its real version number.
        new_version_response = ec2.create_launch_template_version(
            LaunchTemplateId=LAUNCH_TEMPLATE_ID,
            SourceVersion='$Latest',
            LaunchTemplateData={'ImageId': ami_id}
        )
        new_version_number = str(
            new_version_response['LaunchTemplateVersion']['VersionNumber']
        )
        print(f"Created launch template version {new_version_number} with AMI {ami_id}")

        # 4b. Set the default to that concrete version number (not '$Latest').
        ec2.modify_launch_template(
            LaunchTemplateId=LAUNCH_TEMPLATE_ID,
            DefaultVersion=new_version_number
        )

        # 4c. Verify that the default version really changed.
        lt_desc = ec2.describe_launch_templates(
            LaunchTemplateIds=[LAUNCH_TEMPLATE_ID]
        )
        actual_default = str(lt_desc['LaunchTemplates'][0]['DefaultVersionNumber'])
        if actual_default != new_version_number:
            raise Exception(
                f"Default version mismatch: expected {new_version_number}, "
                f"got {actual_default}"
            )
        print(f"Verified launch template default version = {actual_default}")

        # 4d. Give the Spot Fleet some time to see the new default.
        print("Waiting 60s for Spot Fleet to pick up new launch template version...")
        time.sleep(60)

        # 5. Increase capacity so that a new instance starts from the new AMI.
        # The flag is set before the call: if the request fails ambiguously
        # the error handler still tries to bring the capacity back to 1.
        capacity_change_attempted = True
        ec2.modify_spot_fleet_request(
            SpotFleetRequestId=SPOT_FLEET_ID,
            TargetCapacity=2
        )
        print("Increased capacity to 2")

        # 6. Wait for the new instance to be healthy and on the new AMI.
        new_instance_id = wait_for_new_instance(
            ec2, SPOT_FLEET_ID, instance_id, expected_ami_id=ami_id
        )
        print(f"New instance {new_instance_id} is healthy and using new AMI")

        # 7. Terminate the old instance.
        ec2.terminate_instances(InstanceIds=[instance_id])
        print(f"Terminated old instance: {instance_id}")

        # 8. Restore capacity so the fleet does not start further instances.
        ec2.modify_spot_fleet_request(
            SpotFleetRequestId=SPOT_FLEET_ID,
            TargetCapacity=1
        )
        print("Restored capacity to 1")

        return {
            'statusCode': 200,
            'ami_id': ami_id,
            'launch_template_version': new_version_number,
            'old_instance': instance_id,
            'new_instance': new_instance_id
        }
    except Exception as e:
        # Top-level boundary: report the failure instead of raising.
        print(f"Error: {str(e)}")
        if capacity_change_attempted:
            try:
                ec2.modify_spot_fleet_request(
                    SpotFleetRequestId=SPOT_FLEET_ID,
                    TargetCapacity=1
                )
                print("Rolled back capacity to 1")
            except Exception as rollback_error:
                print(f"Rollback failed: {rollback_error}")
        else:
            print("Capacity was not changed by this run; skipping rollback")
        return {'statusCode': 500, 'error': str(e)}
def check_http_port(ec2, instance_id, port=80):
    """Check whether an instance answers an HTTP HEAD request on ``port``.

    The instance's public IP address is used, falling back to its private IP
    address when no public one is reported.

    Args:
        ec2: EC2 client object providing ``describe_instances``.
        instance_id: ID of the instance to probe.
        port: TCP port to send the HEAD request to (default 80).

    Returns:
        bool: True when the request completes successfully within 5 seconds.
        False when the instance has no IP address, when the request fails
        (connection error, timeout, or an HTTP error status), or when looking
        up the instance raises.
    """
    try:
        response = ec2.describe_instances(InstanceIds=[instance_id])
        instance = response['Reservations'][0]['Instances'][0]
        address = instance.get('PublicIpAddress') or instance.get('PrivateIpAddress')
        if not address:
            return False
        try:
            req = urllib.request.Request(f'http://{address}:{port}', method='HEAD')
            # Close the response explicitly: this function is polled in a
            # loop, so an unclosed response would leak a socket per attempt.
            with urllib.request.urlopen(req, timeout=5):
                pass
            print(f"HTTP port {port} is responding on {address}")
            return True
        except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError):
            # Not reachable (or answering with an error status) yet.
            return False
    except Exception as e:
        # Any other failure (API error, unexpected response shape) is logged
        # and reported as "not ready" so the caller can keep polling.
        print(f"Error checking HTTP port: {e}")
        return False
def get_instance_ami(ec2, instance_id):
    """Return the AMI ID the given instance is running.

    Args:
        ec2: EC2 client object providing ``describe_instances``.
        instance_id: ID of the instance to look up.

    Returns:
        The instance's ``ImageId`` value, or None when the field is absent or
        the lookup fails (the failure is printed, not raised).
    """
    ami_id = None
    try:
        description = ec2.describe_instances(InstanceIds=[instance_id])
        first_reservation = description['Reservations'][0]
        first_instance = first_reservation['Instances'][0]
        ami_id = first_instance.get('ImageId')
    except Exception as err:
        print(f"Error getting AMI for {instance_id}: {err}")
    return ami_id
def wait_for_new_instance(ec2, spot_fleet_id, old_instance_id,
                          expected_ami_id=None, max_wait=600):
    """Wait for a new fleet instance to appear, be running and answer HTTP.

    Polls the Spot Fleet in up to ``max_wait // 10`` rounds, sleeping 10
    seconds between rounds. When ``expected_ami_id`` is given, the new
    instance must be running that AMI; this guards against the fleet
    launching from an old template version and being mistaken for a success.

    Args:
        ec2: EC2 client object.
        spot_fleet_id: ID of the Spot Fleet request to poll.
        old_instance_id: ID of the instance being replaced; every other
            active instance counts as "new".
        expected_ami_id: AMI the new instance must use, or None to skip the
            AMI check.
        max_wait: Polling budget in seconds (counted in 10-second rounds).

    Returns:
        The ID of the new instance once it is running and HTTP port 80
        responds.

    Raises:
        Exception: if the new instance uses a different AMI than expected, or
            if no healthy new instance shows up within the polling budget.
    """
    print("Waiting for new instance to be healthy...")
    # Instance whose AMI has already been confirmed, so the lookup is not
    # repeated on every polling round.
    ami_verified_for = None
    for i in range(max_wait // 10):
        response = ec2.describe_spot_fleet_instances(
            SpotFleetRequestId=spot_fleet_id
        )
        instances = response['ActiveInstances']
        if len(instances) < 2:
            print(f"Waiting for new instance... ({i*10}s)")
            time.sleep(10)
            continue
        new_instances = [inst for inst in instances
                         if inst['InstanceId'] != old_instance_id]
        if not new_instances:
            print(f"No new instance found yet... ({i*10}s)")
            time.sleep(10)
            continue
        new_instance_id = new_instances[0]['InstanceId']

        # Verify that the new instance uses the expected AMI.
        if expected_ami_id and ami_verified_for != new_instance_id:
            actual_ami = get_instance_ami(ec2, new_instance_id)
            if actual_ami is None:
                # The lookup failed or returned no ImageId (for example the
                # instance is not visible to the API yet). That is not proof
                # of a wrong AMI, so retry instead of aborting the rotation.
                print(f"AMI of instance {new_instance_id} not available yet, retrying...")
                time.sleep(10)
                continue
            if actual_ami != expected_ami_id:
                raise Exception(
                    f"New instance {new_instance_id} is using AMI {actual_ami}, "
                    f"expected {expected_ami_id}. "
                    f"Launch template default version may not have been picked up."
                )
            print(f"Confirmed new instance {new_instance_id} uses AMI {actual_ami}")
            ami_verified_for = new_instance_id

        try:
            status_response = ec2.describe_instance_status(
                InstanceIds=[new_instance_id],
                IncludeAllInstances=True
            )
            if not status_response['InstanceStatuses']:
                print(f"Instance {new_instance_id} status not available yet...")
                time.sleep(10)
                continue
            status = status_response['InstanceStatuses'][0]
            instance_state = status['InstanceState']['Name']
            print(f"Instance: {instance_state}")
            # Once running, check HTTP port 80.
            if instance_state == 'running':
                if check_http_port(ec2, new_instance_id, port=80):
                    print(f"Instance {new_instance_id} is fully healthy!")
                    return new_instance_id
                else:
                    print("Instance running but HTTP port 80 not ready yet...")
        except Exception as e:
            # Best-effort polling: log the problem and try again next round.
            print(f"Error checking status: {e}")
        time.sleep(10)
    raise Exception(f"New instance didn't become healthy within {max_wait}s")
Deploy 到Lambda,Publish!
trigger it –
aws lambda invoke \
--function-name RotateSpotInstance \
--region ap-northeast-3 \
--invocation-type Event \
response.json
View logs in real-time –
aws logs tail /aws/lambda/RotateSpotInstance \
--follow \
--region ap-northeast-3
2026-03-05T06:33:57.808000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 INIT_START Runtime Version: python:3.14.v35 Runtime Version ARN: arn:aws:lambda:ap-northeast-3::runtime:35b4fe1ff6a2b42e1513619f35af63e09acce626823e1d0e547d6393c854bc71
2026-03-05T06:33:58.102000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 START RequestId: 7bf945e2-dcbf-4345-a7fb-8e811842cf69 Version: $LATEST
2026-03-05T06:34:00.771000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Starting rotation for Spot Fleet: sfr-c7ccc145-d71d-4268-8b67-089161e02af6
2026-03-05T06:34:01.142000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Current instance: i-0e187b0c5195efaa6
2026-03-05T06:34:01.538000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Creating AMI: ami-07590290f72731092
2026-03-05T06:34:01.538000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Waiting for AMI to be available...
2026-03-05T06:34:59.256000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b INIT_START Runtime Version: python:3.14.v35 Runtime Version ARN: arn:aws:lambda:ap-northeast-3::runtime:35b4fe1ff6a2b42e1513619f35af63e09acce626823e1d0e547d6393c854bc71
2026-03-05T06:34:59.567000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b START RequestId: 3e94eeea-257e-4203-b1af-9cd12adab8d4 Version: $LATEST
2026-03-05T06:35:02.268000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Starting rotation for Spot Fleet: sfr-c7ccc145-d71d-4268-8b67-089161e02af6
2026-03-05T06:35:02.664000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Current instance: i-0e187b0c5195efaa6
2026-03-05T06:35:03.024000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Creating AMI: ami-0c75c75b4f29b190e
2026-03-05T06:35:03.024000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Waiting for AMI to be available...
2026-03-05T06:36:01.324000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f INIT_START Runtime Version: python:3.14.v35 Runtime Version ARN: arn:aws:lambda:ap-northeast-3::runtime:35b4fe1ff6a2b42e1513619f35af63e09acce626823e1d0e547d6393c854bc71
2026-03-05T06:36:01.627000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f START RequestId: d9621da3-bd40-4aed-a028-1ac35f2ed22c Version: $LATEST
2026-03-05T06:36:01.980000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 AMI ami-07590290f72731092 is available
2026-03-05T06:36:02.354000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Updated Launch Template to use ami-07590290f72731092
2026-03-05T06:36:02.542000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Increased capacity to 2
2026-03-05T06:36:02.542000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Waiting for new instance to be healthy...
2026-03-05T06:36:02.652000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Waiting for new instance... (0s)
2026-03-05T06:36:04.407000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Starting rotation for Spot Fleet: sfr-c7ccc145-d71d-4268-8b67-089161e02af6
2026-03-05T06:36:04.783000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Current instance: i-0e187b0c5195efaa6
2026-03-05T06:36:05.128000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Creating AMI: ami-035bf50417603be87
2026-03-05T06:36:05.128000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Waiting for AMI to be available...
2026-03-05T06:36:12.748000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Waiting for new instance... (10s)
2026-03-05T06:36:22.920000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:36:33.082000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:36:43.263000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:36:53.430000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:03.489000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b AMI ami-0c75c75b4f29b190e is available
2026-03-05T06:37:03.598000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:03.889000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Updated Launch Template to use ami-0c75c75b4f29b190e
2026-03-05T06:37:04.054000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Increased capacity to 2
2026-03-05T06:37:04.054000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Waiting for new instance to be healthy...
2026-03-05T06:37:04.212000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:13.770000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:14.379000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:23.922000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:24.542000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:34.083000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:34.704000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:44.269000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:44.869000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:54.661000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: initializing, Check: initializing
2026-03-05T06:37:55.047000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: initializing, Check: initializing
2026-03-05T06:38:04.816000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance: running, System: ok, Check: ok
2026-03-05T06:38:04.816000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Instance i-0638e625f7b42d66a is fully healthy!
2026-03-05T06:38:04.816000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 New instance i-0638e625f7b42d66a is healthy
2026-03-05T06:38:05.113000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Terminated old instance: i-0e187b0c5195efaa6
2026-03-05T06:38:05.229000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance: running, System: ok, Check: ok
2026-03-05T06:38:05.229000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Instance i-0638e625f7b42d66a is fully healthy!
2026-03-05T06:38:05.229000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b New instance i-0638e625f7b42d66a is healthy
2026-03-05T06:38:05.309000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 Restored capacity to 1
2026-03-05T06:38:05.333000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 END RequestId: 7bf945e2-dcbf-4345-a7fb-8e811842cf69
2026-03-05T06:38:05.333000+00:00 2026/03/05/[$LATEST]4e889110fba78fe4e51e9fb26315e4d6 REPORT RequestId: 7bf945e2-dcbf-4345-a7fb-8e811842cf69 Duration: 247229.90 ms Billed Duration: 247521 ms Memory Size: 128 MB Max Memory Used: 98 MB Init Duration: 290.80 ms
2026-03-05T06:38:05.496000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Terminated old instance: i-0e187b0c5195efaa6
2026-03-05T06:38:05.580000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f AMI ami-035bf50417603be87 is available
2026-03-05T06:38:05.630000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b Restored capacity to 1
2026-03-05T06:38:05.647000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b END RequestId: 3e94eeea-257e-4203-b1af-9cd12adab8d4
2026-03-05T06:38:05.647000+00:00 2026/03/05/[$LATEST]4907e5924ad374a64d41ffd64da3c19b REPORT RequestId: 3e94eeea-257e-4203-b1af-9cd12adab8d4 Duration: 186080.32 ms Billed Duration: 186387 ms Memory Size: 128 MB Max Memory Used: 97 MB Init Duration: 306.42 ms
2026-03-05T06:38:05.944000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Updated Launch Template to use ami-035bf50417603be87
2026-03-05T06:38:06.115000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Error: An error occurred (FleetNotInModifiableState) when calling the ModifySpotFleetRequest operation: Fleet Request: sfr-c7ccc145-d71d-4268-8b67-089161e02af6 is not in a modifiable state.
2026-03-05T06:38:06.555000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f Rolled back capacity to 1
2026-03-05T06:38:06.588000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f END RequestId: d9621da3-bd40-4aed-a028-1ac35f2ed22c
2026-03-05T06:38:06.588000+00:00 2026/03/05/[$LATEST]5fd07f95f7fb4ff087fdefaab8e80e9f REPORT RequestId: d9621da3-bd40-4aed-a028-1ac35f2ed22c Duration: 124960.33 ms Billed Duration: 125260 ms Memory Size: 128 MB Max Memory Used: 97 MB Init Duration: 299.41 ms
有併發數量的問題,限制Lambda 只能1個進程跑 –
aws lambda put-function-concurrency \
--function-name RotateSpotInstance \
--reserved-concurrent-executions 1 \
--region ap-northeast-3
{
"ReservedConcurrentExecutions": 1
}
確實清爽多了,也不會有服務中斷的問題。
當然了,為了服務的持續性,您需要使用Dynamic DNS 來將新的instance ip report 到DNS server 並且設置script 在 boot 的時候自動執行。
例如ddclient 更新到Cloudflare –
INFO: [cloudflare][private.bbken.org]> getting Cloudflare Zone ID
INFO: [cloudflare][private.bbken.org]> Zone ID is 0933028cb8e70c5cb4f0c736be6fee37
INFO: [cloudflare][private.bbken.org]> setting IPv4 address to 10.4.41.150
SUCCESS: [cloudflare][private.bbken.org]> IPv4 address set to 10.4.41.150
INFO: [cloudflare][kix.bbken.org]> getting Cloudflare Zone ID
INFO: [cloudflare][kix.bbken.org]> Zone ID is 0933028cb8e70c5cb4f0c736be6fee37
INFO: [cloudflare][kix.bbken.org]> setting IPv4 address to 172.15.168.113
SUCCESS: [cloudflare][kix.bbken.org]> IPv4 address set to 172.15.168.113
INFO: [cloudflare][kix.bbken.org]> setting IPv6 address to 2406:da16:a8d:2bc6:5a03:80fb:3a46:796
SUCCESS: [cloudflare][kix.bbken.org]> IPv6 address set to 2406:da16:a8d:2bc6:5a03:80fb:3a46:796