Lambda からメトリクスを送信し、Alarm 状態になったらメールを送り、Slack にも通知する
https://aws.amazon.com/jp/builders-flash/202207/lambda-powertools-python-4/
https://zenn.dev/metalmental/articles/20241116_aws-power-tools
import json
import os
from aws_lambda_powertools import Logger, Metrics
from aws_lambda_powertools.metrics import MetricUnit
from aws_lambda_powertools.utilities.typing import LambdaContext
# Module-level Powertools Logger and Metrics instances used by the handler's
# decorators and body. Both are constructed with no arguments here.
# NOTE(review): presumably the service name and metrics namespace come from the
# POWERTOOLS_* environment variables set in the Terraform config - confirm.
logger = Logger()
metrics = Metrics()
@metrics.log_metrics(
    capture_cold_start_metric=True,
    # Default dimensions arrive as a JSON object in the METRICS_DIMENSION
    # environment variable; it is parsed once, when the module is imported.
    default_dimensions=json.loads(os.environ["METRICS_DIMENSION"]),
)
@logger.inject_lambda_context
def handler(event: dict, context: LambdaContext) -> int:
    """Log the incoming event and context, then emit the sample count metrics.

    Both ``SuccessfulInvocation`` and ``FailedInvocation`` are recorded with a
    value of 1 on every call; this is example code, so the failure metric is
    emitted unconditionally.

    Args:
        event: The Lambda invocation payload.
        context: The Lambda context object.

    Returns:
        The constant ``42``.
    """
    logger.info(f"Event: {event}")
    logger.info(f"Context: {context}")

    # Example metrics: one Count data point per metric name, in this order.
    for metric_name in ("SuccessfulInvocation", "FailedInvocation"):
        metrics.add_metric(name=metric_name, unit=MetricUnit.Count, value=1)

    return 42
default_dimensions にディメンションと値の組を渡せる
# Shared settings for the example in which the Lambda function emits custom metrics.
locals {
region = "us-east-1"
python_version = "python3.12" # Used as the Lambda runtime and to build the Powertools layer name
function_name = "sample_function"
env = "sample" # Value of the "env" metric dimension
target_content = "target01" # Value of the "target" metric dimension
metrics = {
namespace = "sample_application" # CloudWatch namespace of the custom metrics
}
}
# Lambda function that emits custom metrics through Powertools for AWS Lambda (Python).
# "..." stands for arguments omitted from this excerpt; `each.key` below requires
# a `for_each` among them.
resource "aws_lambda_function" "test_lambda" {
  ...
  runtime = local.python_version

  # Powertools layer. The layer name embeds the runtime with the dot removed
  # ("python3.12" -> "python312").
  layers = [
    "arn:aws:lambda:${local.region}:017000801446:layer:AWSLambdaPowertoolsPythonV3-${replace(local.python_version, ".", "")}-x86_64:7"
  ]

  environment {
    variables = {
      POWERTOOLS_SERVICE_NAME         = "${local.function_name}-${each.key}" # Added to the metrics as the "service" dimension
      POWERTOOLS_METRICS_NAMESPACE    = local.metrics.namespace
      POWERTOOLS_LOG_LEVEL            = "DEBUG"
      POWERTOOLS_TRACER_CAPTURE_ERROR = "True"
      # JSON object that the handler parses and passes as the default metric dimensions.
      METRICS_DIMENSION = jsonencode({ env = local.env, target = local.target_content })
    }
  }
}
1分間 (period = 60) の合計 (Sum) がしきい値 (1) 以上になったら SNS でメールを送る (OK 状態に戻ったときも通知する)
# SNS topic that the CloudWatch alarms publish their notifications to.
resource "aws_sns_topic" "sample_input_topic" {
name = "alarm-notification-topic"
}
# Email subscription on the alarm notification topic.
# NOTE(review): the resource name mentions "sqs" although the protocol is email.
resource "aws_sns_topic_subscription" "user_updates_sqs_target" {
topic_arn = aws_sns_topic.sample_input_topic.arn
protocol = "email"
endpoint = "<通知先のメールアドレス>" # Placeholder: replace with the notification email address
}
# Alarm on the FailedInvocation custom metric, one alarm per Lambda function instance.
# Goes to ALARM when the 60-second Sum is >= 1 and notifies the SNS topic both on
# ALARM and on return to OK.
resource "aws_cloudwatch_metric_alarm" "FailedInvocation" {
  # One alarm per function instance, because each instance reports its own
  # "service" dimension value.
  for_each = aws_lambda_function.test_lambda

  alarm_name        = "failed-invocation-sample-${each.key}"
  alarm_description = "Monitor any errors on function"

  namespace   = local.metrics.namespace
  metric_name = "FailedInvocation"

  # An alarm only sees data points whose dimension set matches exactly, so
  # "service" must equal the function's POWERTOOLS_SERVICE_NAME value.
  dimensions = {
    service = "${local.function_name}-${each.key}"
    env     = local.env
    target  = local.target_content
  }

  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1

  # Periods without data points (no invocations) are treated as not breaching.
  treat_missing_data = "notBreaching"

  insufficient_data_actions = []
  alarm_actions             = [aws_sns_topic.sample_input_topic.arn]
  ok_actions                = [aws_sns_topic.sample_input_topic.arn]
}
https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm
ログから level が ERROR になってる件数を拾ってくる例
# Shared settings for the example that counts ERROR-level log events.
locals {
function_name = "sample_function"
env = "sample"
target_content = "target01"
metrics = {
namespace = "sample_application" # CloudWatch namespace of the log-derived metric
names = {
error_log_count = "ErrorLogCount" # Metric name shared by the metric filter and the alarm
}
dimensions = { env = local.env, target = local.target_content } # Extra dimensions besides "service"
}
functions = toset(["a", "b"]) # Keys used with for_each by the metric filter and the alarm
}
# Lambda function excerpt ("..." marks omitted arguments) showing only the logging setup.
resource "aws_lambda_function" "test_lambda" {
...
logging_config {
log_group = aws_cloudwatch_log_group.test_lambda_log_group[each.key].name # Per-function log group, declared outside this excerpt
log_format = "JSON" # The metric filter pattern matches on the JSON field $.level
system_log_level = "DEBUG"
}
...
}
# Metric filter, one per function, that turns every JSON log event whose "level"
# is "ERROR" into a data point of value 1 on the error-count metric.
resource "aws_cloudwatch_log_metric_filter" "ErrorFilter" {
  for_each = local.functions

  name           = "error-filter-sample-${each.key}"
  log_group_name = aws_cloudwatch_log_group.test_lambda_log_group[each.key].name
  pattern        = "{ $.level = \"ERROR\" }"

  metric_transformation {
    namespace = local.metrics.namespace
    name      = local.metrics.names.error_log_count
    unit      = "Count"
    value     = 1

    # Every dimension value is read from the log event field of the same name:
    # "service" from $.service, plus one "$.<key>" selector per configured dimension key.
    dimensions = merge(
      { service = "$.service" },
      { for k in keys(local.metrics.dimensions) : k => "$.${k}" }
    )
  }
}
# Per-function alarm on the error-count metric produced by the log metric filter.
# Goes to ALARM when the 60-second Sum is >= 1; notifies the SNS topic on ALARM
# and on return to OK.
resource "aws_cloudwatch_metric_alarm" "ErrorLogs" {
for_each = local.functions
alarm_name = "error-logs-${each.key}"
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
metric_name = local.metrics.names.error_log_count
namespace = local.metrics.namespace
period = 60
statistic = "Sum"
threshold = 1
alarm_description = "Monitor any error logs on function"
# NOTE(review): the metric filter fills these dimensions from the log fields
# $.service, $.env and $.target. The alarm only matches if the logged "service"
# equals the Lambda function name and the logs carry "env"/"target" with the
# values below - confirm against the actual log output.
dimensions = merge({
service = aws_lambda_function.test_lambda[each.key].function_name
}, local.metrics.dimensions)
treat_missing_data = "notBreaching" # Periods without data points are treated as not breaching
insufficient_data_actions = []
alarm_actions = [aws_sns_topic.sample_input_topic.arn]
ok_actions = [aws_sns_topic.sample_input_topic.arn]
}
手動でAppを追加する必要がある