Lework Study hard, improve every day.

使用aws-cli为常用产品批量添加监控告警

2019-09-26
lework
aws
本文 24175 字,阅读全文约需 70 分钟

aws中国竟然没有为产品批量添加监控报警的功能…呵呵呵呵,为了减轻工作,实现批量初始化,没有我们就动手自己写脚本吧,使用aws-cli工具可以获取到各项产品的信息,并且可以管理产品。

安装aws-cli

Winsows使用安装包安装

https://s3.amazonaws.com/aws-cli/AWSCLI64PY3.msi

使用pip安装

pip3 install awscli

更多见安装文档

验证aws版本

aws --version
aws-cli/1.16.239 Python/3.6.0 Windows/10 botocore/1.12.229

配置aws-cli

  1. 在iam中创建访问密钥
  2. 配置aws-cli
      aws configure
      AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE
      AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
      Default region name [None]: cn-north-1
      Default output format [None]: text
    

cli的配置文件存放在当前用户家目录下的.aws/config

cli的凭证文件存放在当前用户家目录下的.aws/credentials

用到的cli命令

获取产品的信息

# ec2
aws-cli ec2 describe-instances
# elb2
aws-cli elbv2 describe-load-balancers
aws-cli elbv2 describe-target-groups --load-balancer-arn [elb-arn]
aws-cli elbv2 describe-listeners --load-balancer-arn [elb-arn]
# redis
aws-cli elasticache describe-cache-clusters
# mysql
aws-cli rds describe-db-instances

更多命令见CLI

添加监控报警

aws-cli cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \              # 报警名称
--alarm-description "aws ec2 {mertic}" \              # 报警描述
--metric-name {mertic} \                      # 指标名称
--namespace AWS/EC2 \                                 # 名称空间
--statistic Average \                                 # 指标的统计方式
--period 60 \                                         # 统计指标的时间长度
--threshold 5000000 \                                 # 阈值
--evaluation-periods 3 \                              # 将数据与指定阈值进行比较的时间段数
--datapoints-to-alarm 3 \                             # 必须突破才能触发警报的数据点数
--comparison-operator GreaterThanOrEqualToThreshold \ # 比较指定的统计信息和阈值时要使用的算术运算
--treat-missing-data notBreaching \                   # 设置此警报如何处理丢失的数据点
--alarm-actions "{action}" \                          # 当此警报从任何其他状态转变为alarm状态时执行的动作, 通常指定SNS的ARN
--ok-actions "{action}" \                             # 当此警报从任何其他状态转变为OK状态时执行的动作, 通常指定SNS的ARN
--unit Percent \                                      # 指定单位
--dimensions "Name=InstanceId,Value={id}"             # 指定ec2的实例id

更多参数见put-metric-alarm

为EC2批量添加监控报警

本次使用windows环境下的python3运行脚本

脚本信息

#!/usr/bin/python
# -*- coding: utf-8 -*-


import json
import subprocess

# 1. 配置cli路径和region
Contants = {
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
    "AWSREGION": ['cn-north-1']  # 北京
}


# 构造字典
class CreateDict(dict):
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value


#########################################################################################################
# 配置告警

# CPUUtilization,3分钟检查3次,平均值大于或等于80%,就告警。
def getCPUUtilizationComm(name, action, instance_id):
    mertic = 'CPUUtilization'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \
--alarm-description "aws ec2 {mertic}" \
--metric-name {mertic} \
--namespace AWS/EC2 \
--statistic Average \
--period 60 \
--threshold 70 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--unit Percent \
--dimensions "Name=InstanceId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action, id=instance_id, mertic=mertic)


# NetworkIn,3分钟检查3次,平均值大于或等于5m,就告警。
def getNetworkInComm(name, action, instance_id):
    mertic = 'NetworkIn'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \
--alarm-description "aws ec2 {mertic}" \
--metric-name {mertic} \
--namespace AWS/EC2 \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=InstanceId,Value=%s"'''.format(cli=Contants['AWSCLI'], name=name, action=action, id=instance_id, mertic=mertic)


# NetworkOut,3分钟检查3次,平均值大于或等于5m,就告警。
def getNetworkOutComm(name, action, instance_id):
    mertic = 'NetworkOut'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \
--alarm-description "aws ec2 {mertic}" \
--metric-name {mertic} \
--namespace AWS/EC2 \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=InstanceId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action, id=instance_id, mertic=mertic)


# 执行命令函数
def execCommand(comm):
    try:
        print(comm)
        (status, stdout) = subprocess.getstatusoutput(comm)
        print(status)
        return stdout
    except Exception as e:
        print(e)


# 获取当前可用区内所有lb2的基础信息
def getAll():
    comm1 = "%s ec2 describe-instances" % Contants['AWSCLI']

    all_data = json.loads(execCommand(comm1))

    instance_list = []

    for r in all_data['Reservations']:
        for i in r['Instances']:
            data = {'id': i['InstanceId']}
            for t in i['Tags']:
                if t['Key'] == 'Name':
                    data['name'] = t['Value']
            if not data['name']:
                data['name'] = i['InstanceId']
            instance_list.append(data)

    return instance_list


# 添加报警
def add_alert(data, action):
    for i in data:
        instance_id = i['id']
        name = i['name']
        execCommand(getCPUUtilizationComm(name, action, instance_id))
        execCommand(getNetworkInComm(name, action, instance_id))
        execCommand(getNetworkOutComm(name, action, instance_id))


if __name__ == '__main__':
    # 2. 配置sns的arn
    sns_arn = "arn:aws-cn:sns:cn-north-1:377051234567:sns-example"

    cli = Contants['AWSCLI']
    for i in Contants['AWSREGION']:
        print('[Region] ', i)
        Contants['AWSCLI'] = cli + ' --region ' + i
        add_alert(getAll(), sns_arn)

ec2默认没有监控内存和磁盘的信息(晕倒),需要手动添加agent才可以

在拿到脚本后

  1. 配置cli路径和region
  2. 配置sns的arn
  3. 运行脚本就行啦
  4. 运行完成后,就可以到cloudwatch中查看添加的报警条目

为ELB2批量添加监控报警

脚本信息

#!/usr/bin/python
# -*- coding: utf-8 -*-


import os, sys, subprocess, re, json

Contants = {
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe"  --output json',
    "AWSREGION": ['cn-north-1']  # 北京
}


# 构造字典
class CreateDict(dict):
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value


#########################################################################################################
# 配置告警
# HealthyHostCount,一分钟检查一次,当健康主机数量等于0,就告警。健康主机的最大值小于等于0就告警
def getHealthyHostCountComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'HealthyHostCount'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name \"AWS_ELB_{name}_{port}_{mertic}\" \
--alarm-description \"aws elb {mertic}\" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Maximum \
--period 60 \
--threshold 0 \
--evaluation-periods 1 \
--datapoints-to-alarm 1 \
--comparison-operator LessThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=TargetGroup,Value=targetgroup/{tag_group}" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# ActiveFlowCount,3分钟检查3次,当链接数平均大于或等于6000,就告警。
def getActiveFlowCountComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'ActiveFlowCount'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_ELB_{name}_{port}_{mertic}" \
--alarm-description "aws elb {mertic}" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Average \
--period 300 \
--threshold 6000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=TargetGroup,Value=targetgroup/{tag_group}" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# ProcessedBytes,3分钟检查3次,当链接数平均大于或等于5M,就告警。
def getProcessedBytesComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'ProcessedBytes'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_ELB_{name}_{port}_{mertic}" \
--alarm-description "aws elb {mertic}" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=TargetGroup,Value=targetgroup/{tag_group}" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# UnHealthyHostCount 五分钟检查一次,当不健康主机数量大于或等于1个,就告警.  不健康主机数量的最小值大于等于1就告警
def getUnHealthyHostCountComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'UnHealthyHostCount'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_ELB_{name}_{port}_{mertic}" \
--alarm-description "aws elb {mertic}" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Minimum \
--period 300 \
--threshold 1 \
--evaluation-periods 1 \
--datapoints-to-alarm 1 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=TargetGroup,Value=targetgroup/{tag_group}" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# HTTPCode_Target_5XX_Count 一分钟采集一次,周期为1分钟,1个数据点中有1次超过阈值就告警,当5xx超过10个为超过阈值
def getHTTPCode_Target_5XX_CountComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'HTTPCode_Target_5XX_Count'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_ELB_{name}_{port}_{mertic}" \
--alarm-description "aws elb {mertic}" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Sum \
--period 60 \
--threshold 10 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--evaluation-periods 1 \
--datapoints-to-alarm 1 \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=TargetGroup,Value=targetgroup/{tag_group}" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# HTTP_4XX 一分钟采集一次,周期为5分钟,5个数据点中有三次超过阈值就告警,当4xx超过10%为超过阈值
def getHTTPCode_Target_4XX_CountComm(elb_name, port, tag_group, elb_arn, sns_arn):
    mertic = 'HTTPCode_Target_4XX_Count'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_ELB_{name}_{port}_{mertic}" \
--alarm-description "aws elb {mertic}" \
--metric-name {mertic} \
--namespace AWS/ApplicationELB \
--statistic Sum \
--period 60 \
--threshold 10 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--evaluation-periods 5 \
--datapoints-to-alarm 3 \
--unit Percent \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions \"Name=TargetGroup,Value=targetgroup/{tag_group}\" "Name=LoadBalancer,Value=app/{elb_arn}"'''.format(
        cli=Contants['AWSCLI'], name=elb_name, port=port, action=sns_arn, tag_group=tag_group, elb_arn=elb_arn, mertic=mertic)


# 执行命令函数
def execCommand(comm):
    try:
        print(comm)
        (status, stdout) = subprocess.getstatusoutput(comm)
        # print(comm)
        print(status)
        return stdout
    except Exception as e:
        print(e)


# 获取当前可用区内所有elb2的信息
def getAll():
    comm1 = "%s elbv2 describe-load-balancers" % Contants['AWSCLI']
    AllLb2Details = json.loads(execCommand(comm1))['LoadBalancers']
    arndict = CreateDict()
    for i in range(0, len(AllLb2Details)):
        lbarn = AllLb2Details[i]["LoadBalancerArn"]
        lbname = AllLb2Details[i]["LoadBalancerName"]
        arndict[lbname]["type"] = AllLb2Details[i]["Type"]
        arndict[lbname]["lbarn"] = lbarn
        if arndict[lbname]["type"] == "network":
            comm2 = "%s elbv2 describe-target-groups --load-balancer-arn %s" % (
                Contants['AWSCLI'], lbarn)
            lb_target = json.loads(execCommand(comm2))
            arndict[lbname]["lbgroup"]['TCP'] = lb_target['TargetGroups'][0]['TargetGroupArn']
        else:
            comm2 = "%s elbv2 describe-listeners --load-balancer-arn %s" % (Contants['AWSCLI'], lbarn)
            alllisten = json.loads(execCommand(comm2))['Listeners']
            for j in range(0, len(alllisten)):
                taggroup = alllisten[j]['DefaultActions'][0]['TargetGroupArn']
                port = alllisten[j]['Port']
                arndict[lbname]["lbgroup"][port] = taggroup
    return arndict


# 添加报警
def add_alert(arn_data, sns_arn):
    for elb_name, elb_value in arn_data.items():
        elb_arn = re.split(r'loadbalancer/app/', elb_value['lbarn'])[-1]
        for port, taggroup in elb_value['lbgroup'].items():
            print("######################################################")
            taggroup = re.split(r':targetgroup/', taggroup)[-1]

            execCommand(getHealthyHostCountComm(elb_name, port, taggroup, elb_arn, sns_arn))
            execCommand(getUnHealthyHostCountComm(elb_name, port, taggroup, elb_arn, sns_arn))
            execCommand(getActiveFlowCountComm(elb_name, port, taggroup, elb_arn, sns_arn))
            execCommand(getProcessedBytesComm(elb_name, port, taggroup, elb_arn, sns_arn))

            if elb_value['type'] != 'network':
              execCommand(getHTTPCode_Target_5XX_CountComm(elb_name, port, taggroup, elb_arn, sns_arn))
              execCommand(getHTTPCode_Target_4XX_CountComm(elb_name, port, taggroup, elb_arn, sns_arn))

if __name__ == '__main__':
    sns_arn = "arn:aws-cn:sns:cn-north-1:377051234567:sns-example"

    cli = Contants['AWSCLI']
    for i in Contants['AWSREGION']:
        print('[Region] ', i)
        Contants['AWSCLI'] = cli + ' --region ' + i
        add_alert(getAll(), sns_arn)

为Redis批量添加监控报警

脚本信息

#!/usr/bin/python
# -*- coding: utf-8 -*-


import json
import subprocess

Contants = {
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe"  --output json',
    "AWSREGION": ['cn-north-1']  # 北京
}


# 构造字典
class CreateDict(dict):
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value


#########################################################################################################
# 配置告警

# CPUUtilization,3分钟检查3次,平均值大于或等于70%,就告警。
def getCPUUtilizationComm(name, action, instance_id):
    mertic = 'CPUUtilization'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 70 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# EngineCPUUtilization,3分钟检查3次,平均值大于或等于70%,就告警。
def getEngineCPUUtilizationComm(name, action, instance_id):
    mertic = 'EngineCPUUtilization'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 70 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# CurrConnections,3分钟检查3次,平均值大于或等于500,就告警。
def getCurrConnectionsComm(name, action, instance_id):
    mertic = 'CurrConnections'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 500 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


#
# FreeableMemory,3分钟检查3次,平均值小于于或等于1g,就告警。
def getFreeableMemoryComm(name, action, instance_id):
    mertic = 'FreeableMemory'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 1000000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator LessThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# NetworkBytesIn,3分钟检查3次,平均值小于于或等于5m,就告警。
def getNetworkBytesInComm(name, action, instance_id):
    mertic = 'NetworkBytesIn'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# NetworkBytesOut,3分钟检查3次,平均值小于于或等于5m,就告警。
def getNetworkBytesOutComm(name, action, instance_id):
    mertic = 'NetworkBytesOut'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# CacheMisses,3分钟检查3次,平均值小于于或等于5000,就告警。
def getCacheMissesComm(name, action, instance_id):
    mertic = 'CacheMisses'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_REDIS_{name}_{id}_{mertic}" \
--alarm-description "aws redis {mertic}" \
--metric-name {mertic} \
--namespace AWS/ElastiCache \
--statistic Average \
--period 60 \
--threshold 5000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=CacheClusterId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action,
                                                        id=instance_id, mertic=mertic)


# 执行命令函数
def execCommand(comm):
    try:
        print(comm)
        (status, stdout) = subprocess.getstatusoutput(comm)
        print(status)
        return stdout
    except Exception as e:
        print(e)


# 获取当前可用区内elasticache的基础信息
def getAll():
    comm1 = "%s elasticache describe-cache-clusters" % Contants['AWSCLI']

    all_data = json.loads(execCommand(comm1))

    instance_list = []

    for r in all_data['CacheClusters']:
        if r['Engine'] == 'redis':
            data = {'id': r['CacheClusterId'], 'name': r['ReplicationGroupId']}
            instance_list.append(data)

    return instance_list


# 添加报警
def add_alert(data, action):
    for i in data:
        id = i['id']
        name = i['name']
        execCommand(getCPUUtilizationComm(name, action, id))
        execCommand(getEngineCPUUtilizationComm(name, action, id))
        execCommand(getCurrConnectionsComm(name, action, id))
        execCommand(getFreeableMemoryComm(name, action, id))
        execCommand(getNetworkBytesInComm(name, action, id))
        execCommand(getNetworkBytesOutComm(name, action, id))
        execCommand(getCacheMissesComm(name, action, id))


if __name__ == '__main__':
    sns_arn = "arn:aws-cn:sns:cn-north-1:377051234567:sns-example"
    cli = Contants['AWSCLI']
    for i in Contants['AWSREGION']:
        print('[Region] ', i)
        Contants['AWSCLI'] = cli + ' --region ' + i
        add_alert(getAll(), sns_arn)

为MySQL批量添加监控报警

脚本信息

#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import subprocess

Contants = {
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe"  --output json',
    "AWSREGION": ['cn-north-1']  # 北京
}


# 构造字典
class CreateDict(dict):
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value


#########################################################################################################
# 配置告警

# CPUUtilization,3分钟检查3次,平均值大于或等于80%,就告警。
def getCPUUtilizationComm(action, instance_id):
    mertic = 'CPUUtilization'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 60 \
--threshold 70 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# DatabaseConnections,3分钟检查3次,平均值大于或等于500,就告警。
def getDatabaseConnectionsComm(action, instance_id):
    mertic = 'DatabaseConnections'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 60 \
--threshold 500 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# FreeableMemory,3分钟检查3次,平均值小于于或等于1g,就告警。
def getFreeableMemoryComm(action, instance_id):
    mertic = 'FreeableMemory'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 60 \
--threshold 1000000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator LessThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# FreeStorageSpace,15分钟检查3次,平均值小于于或等于20g,就告警。
def FreeStorageSpace(action, instance_id):
    mertic = 'FreeStorageSpace'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 300 \
--threshold 20000000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator LessThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# NetworkTransmitThroughput,3分钟检查3次,平均值小于于或等于5m,就告警。
def getNetworkTransmitThroughputComm(action, instance_id):
    mertic = 'NetworkTransmitThroughput'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# NetworkBytesOut,3分钟检查3次,平均值小于于或等于5m,就告警。
def getNetworkReceiveThroughputComm(action, instance_id):
    mertic = 'NetworkReceiveThroughput'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_MYSQL_{id}_{mertic}" \
--alarm-description "aws mysql {mertic}" \
--metric-name {mertic} \
--namespace AWS/RDS \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold  \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=DBInstanceIdentifier,Value={id}"'''.format(cli=Contants['AWSCLI'], action=action,
                                                              id=instance_id, mertic=mertic)


# 执行命令函数
def execCommand(comm):
    try:
        print(comm)
        (status, stdout) = subprocess.getstatusoutput(comm)
        print(status)
        return stdout
    except Exception as e:
        print(e)


# 获取当前可用区内RDS的基础信息
def getAll():
    comm1 = "%s rds describe-db-instances" % Contants['AWSCLI']

    all_data = json.loads(execCommand(comm1))

    instance_list = []

    for r in all_data['DBInstances']:
        if r['Engine'] == 'mysql':
            data = {'id': r['DBInstanceIdentifier']}
            instance_list.append(data)

    return instance_list


# 添加报警
def add_alert(data, action):
    for i in data:
        id = i['id']
        execCommand(getCPUUtilizationComm(action, id))
        execCommand(getDatabaseConnectionsComm(action, id))
        execCommand(getFreeableMemoryComm(action, id))
        execCommand(getNetworkReceiveThroughputComm(action, id))
        execCommand(getNetworkTransmitThroughputComm(action, id))
        execCommand(FreeStorageSpace(action, id))


if __name__ == '__main__':
    sns_arn = "arn:aws-cn:sns:cn-north-1:377051234567:sns-example"
    cli = Contants['AWSCLI']
    for i in Contants['AWSREGION']:
        print('[Region] ', i)
        Contants['AWSCLI'] = cli + ' --region ' + i
        add_alert(getAll(), sns_arn)
原文地址 https://lework.github.io/2019/09/26/aws-alert/

Comments

Content