Lambda からメトリクスを送信し、Alarm 状態になったらメールと Slack に通知する

メトリクスの送信

https://aws.amazon.com/jp/builders-flash/202207/lambda-powertools-python-4/

https://zenn.dev/metalmental/articles/20241116_aws-power-tools

import json
import os

from aws_lambda_powertools import Logger, Metrics
from aws_lambda_powertools.metrics import MetricUnit
from aws_lambda_powertools.utilities.typing import LambdaContext

# Module-level Powertools Logger and Metrics instances shared by the
# decorators and the handler below.
logger = Logger()
metrics = Metrics()

# Extra dimensions attached to the emitted metrics, supplied as a JSON object
# through the METRICS_DIMENSION environment variable (KeyError if it is unset).
_default_dimensions = json.loads(os.environ["METRICS_DIMENSION"])


@metrics.log_metrics(
    capture_cold_start_metric=True,
    default_dimensions=_default_dimensions,
)
@logger.inject_lambda_context
def handler(event: dict, context: LambdaContext) -> int:
    """Log the invocation and record two sample count metrics.

    Both ``SuccessfulInvocation`` and ``FailedInvocation`` are recorded with a
    value of 1 on every call; this is sample code for exercising the metrics
    pipeline, not real success/failure accounting.

    Args:
        event: The Lambda invocation payload.
        context: The Lambda runtime context object.

    Returns:
        The constant ``42``.
    """
    logger.info(f"Event: {event}")
    logger.info(f"Context: {context}")

    # Example metrics, each incremented by one per invocation.
    for metric_name in ("SuccessfulInvocation", "FailedInvocation"):
        metrics.add_metric(name=metric_name, unit=MetricUnit.Count, value=1)

    return 42

default_dimensions にディメンション名と値の組 (dict) を渡すと、送信するメトリクスに既定のディメンションとして付与される

# Shared values referenced by the Lambda function and the alarm definitions.
locals {
  # Values used as metric dimensions (env / target).
  env            = "sample"
  target_content = "target01"

  # Lambda settings.
  function_name  = "sample_function"
  python_version = "python3.12" # Lambda runtime; also used to build the Powertools layer name
  region         = "us-east-1"

  # CloudWatch namespace the custom metrics are published under.
  metrics = {
    namespace = "sample_application"
  }
}

# Lambda function with the Powertools for AWS Lambda (Python) layer attached.
# NOTE: "..." marks arguments omitted from this excerpt. The use of each.key
# below implies that a for_each is among the omitted arguments.
resource "aws_lambda_function" "test_lambda" {
  ...
  runtime = local.python_version

  # Powertools layer ARN, built from the region and the runtime name with the
  # dot removed (python3.12 -> python312).
  layers = [
    "arn:aws:lambda:${local.region}:017000801446:layer:AWSLambdaPowertoolsPythonV3-${replace(local.python_version, ".", "")}-x86_64:7"
  ]

  environment {
    variables = {
      POWERTOOLS_SERVICE_NAME      = "${local.function_name}-${each.key}" # Added to the metrics as the "service" dimension
      POWERTOOLS_METRICS_NAMESPACE = local.metrics.namespace

      POWERTOOLS_LOG_LEVEL            = "DEBUG"
      POWERTOOLS_TRACER_CAPTURE_ERROR = "True"

      # JSON object that the handler parses and passes as default metric dimensions.
      METRICS_DIMENSION = jsonencode({ env = local.env, target = local.target_content })
    }
  }
}

Alarm の設定とメール通知

1分間 (period = 60) の合計がしきい値 (1) 以上になったら SNS でメールを送る

# SNS topic that the CloudWatch alarms publish to on ALARM and OK transitions.
resource "aws_sns_topic" "sample_input_topic" {
  name = "alarm-notification-topic"
}
# Email subscription on the alarm notification topic.
# NOTE(review): the resource name mentions "sqs" although the protocol is
# email; renaming would change the resource address, so it is left as is.
# NOTE(review): email subscriptions normally have to be confirmed by the
# recipient before messages are delivered — verify after apply.
resource "aws_sns_topic_subscription" "user_updates_sqs_target" {
  protocol  = "email"
  endpoint  = "<通知先のメールアドレス>" # placeholder: replace with the recipient address
  topic_arn = aws_sns_topic.sample_input_topic.arn
}

# Alarm on the custom "FailedInvocation" metric emitted by the Lambda handler.
# One alarm is created per Lambda instance, because each instance publishes
# under its own "service" dimension value (POWERTOOLS_SERVICE_NAME is
# "${local.function_name}-${each.key}"). The alarm dimensions must match the
# published dimensions exactly, otherwise the alarm never receives data.
resource "aws_cloudwatch_metric_alarm" "FailedInvocation" {
  # Reuse the key set of the Lambda resource so the alarms follow the functions.
  # Assumes aws_lambda_function.test_lambda is declared with for_each, as its
  # use of each.key implies.
  for_each = aws_lambda_function.test_lambda

  alarm_name        = "failed-invocation-sample-${each.key}"
  alarm_description = "Monitor any errors on function"

  # Metric identity: namespace + name + the full dimension set.
  namespace   = local.metrics.namespace
  metric_name = "FailedInvocation"
  dimensions = {
    service = "${local.function_name}-${each.key}"
    env     = local.env
    target  = local.target_content
  }

  # ALARM when the one-minute sum is >= 1.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1

  # Periods without datapoints are treated as within the threshold.
  treat_missing_data = "notBreaching"

  # Notify the SNS topic on both ALARM and OK transitions.
  insufficient_data_actions = []
  alarm_actions             = [aws_sns_topic.sample_input_topic.arn]
  ok_actions                = [aws_sns_topic.sample_input_topic.arn]
}

https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm

Metric Filter でメトリクスを作って Alarm 通知

ログのうち level が ERROR のイベント件数をメトリクスとして集計する例

# Shared values for the metric-filter based alarm example.
locals {
  # Values used as metric dimensions (env / target).
  env            = "sample"
  target_content = "target01"

  # Base name of the Lambda functions and the keys used with for_each.
  function_name = "sample_function"
  functions     = toset(["a", "b"])

  metrics = {
    # CloudWatch namespace the filter-generated metrics are published under.
    namespace = "sample_application"

    # Metric names, keyed by purpose.
    names = {
      error_log_count = "ErrorLogCount"
    }

    # Dimensions shared by the metric filter and the alarm.
    dimensions = {
      env    = local.env
      target = local.target_content
    }
  }
}

# Excerpt of the Lambda function showing only its logging configuration.
# "..." marks arguments omitted from this excerpt.
resource "aws_lambda_function" "test_lambda" {
  ...
  # Write logs to the per-function log group (looked up by each.key) in JSON
  # format, with the system log level set to DEBUG.
  logging_config {
    log_group        = aws_cloudwatch_log_group.test_lambda_log_group[each.key].name
    log_format       = "JSON"
    system_log_level = "DEBUG"
  }
  ...
}

# One metric filter per function: every log event whose JSON "level" field is
# "ERROR" adds 1 to the ErrorLogCount metric.
resource "aws_cloudwatch_log_metric_filter" "ErrorFilter" {
  for_each = local.functions

  log_group_name = aws_cloudwatch_log_group.test_lambda_log_group[each.key].name
  name           = "error-filter-sample-${each.key}"
  pattern        = "{ $.level = \"ERROR\" }"

  metric_transformation {
    namespace = local.metrics.namespace
    name      = local.metrics.names.error_log_count
    unit      = "Count"
    value     = 1

    # Dimension values are taken from the log event itself: "$.service" plus
    # one "$.<key>" selector for each key of local.metrics.dimensions.
    # NOTE(review): this requires the log events to carry those keys as
    # top-level JSON fields — verify that the application logs include them.
    dimensions = merge(
      { service = "$.service" },
      { for k in keys(local.metrics.dimensions) : k => "$.${k}" }
    )
  }
}
# One alarm per function on the ErrorLogCount metric produced by the metric
# filter; notifies the SNS topic on ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ErrorLogs" {
  for_each = local.functions

  alarm_name        = "error-logs-${each.key}"
  alarm_description = "Monitor any error logs on function"

  # Metric identity: namespace + name + the full dimension set.
  # NOTE(review): "service" is set to the Lambda function name here, while the
  # metric filter reads it from the "$.service" log field — confirm both
  # resolve to the same value.
  namespace   = local.metrics.namespace
  metric_name = local.metrics.names.error_log_count
  dimensions = merge(
    { service = aws_lambda_function.test_lambda[each.key].function_name },
    local.metrics.dimensions
  )

  # ALARM when the one-minute sum is >= 1.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1

  # Periods without datapoints are treated as within the threshold.
  treat_missing_data = "notBreaching"

  insufficient_data_actions = []
  alarm_actions             = [aws_sns_topic.sample_input_topic.arn]
  ok_actions                = [aws_sns_topic.sample_input_topic.arn]
}

AWS Chatbot + Slack 通知

Slack ワークスペースへの AWS Chatbot の App の追加は手動で行う必要がある