# run.py

#!/usr/bin/env python3

import boto3
import argparse
import sys
import os
import hashlib
import json
import threading
import zipfile
import gzip
import time
import shutil
import itertools
import dateutil.relativedelta
from datetime import datetime

class Parser(argparse.ArgumentParser):
    """Argument parser whose help ends with a description of the S3 layout.

    The extra ``BILLING PREFIX`` section documents the directory structure
    expected under the prefix given to ``--billing``. On a parsing error the
    message and the full help are printed, then the process exits with
    status 2.
    """

    def print_help(self, file=None):
        """Print the regular argparse help followed by the BILLING PREFIX section.

        Args:
            file: Writable text stream. Defaults to the *current*
                ``sys.stdout``.
        """
        # Resolve the stream at call time: a ``file=sys.stdout`` default is
        # bound once at definition time and would ignore any later
        # redirection of sys.stdout.
        if file is None:
            file = sys.stdout
        super(Parser, self).print_help(file)
        print(
            """
BILLING PREFIX:
  This tool uses AWS's new Cost And Usage Report format for billing data. The
  following structure is expected in S3:

    PREFIX
    `- arbitraryReportName
       |- 20171001-20171101
       |  |- arbitraryReportName-Manifest.json
       |  |- bbe82960-6a1a-47fd-ae59-1e666e2f674a
       |  |  |- arbitraryReportName-Manifest.json
       |  |  |- arbitraryReportName-1.csv.gz
       |  |  `- ...
       |  `- ...
       `- ...

  You can get more information about this at
  https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/billing-reports-costusage.html""",
            file=file,
        )

    def error(self, message):
        """Print *message* and the full help to stdout, then exit with status 2."""
        print(message)
        self.print_help()
        sys.exit(2)


def parse_args():
    """Declare the command-line options and parse the process arguments.

    Returns:
        A ``(namespace, parser)`` tuple: the parsed options and the
        ``Parser`` instance that produced them.
    """
    # Default workbook name carries today's date.
    default_xlsx_name = datetime.now().strftime("trackit_aws_cost_report_%Y_%m_%d")

    cli = Parser()
    cli.add_argument(
        "--no-clear-before",
        action="store_false",
        dest="clear_before",
        default=True,
        help="Do not clear all data before doing anything. Useful when a previous invocation failed or when you add data incrementally before generating the sheet.",
    )
    # The destination is spelled ``generate_xslx`` (sic); main() reads the
    # option under that exact name.
    cli.add_argument(
        "--no-generate-xlsx",
        action="store_false",
        dest="generate_xslx",
        default=True,
        help="Do not generate a XLSX file after all data was retrieved.",
    )
    cli.add_argument(
        "--xlsx-name",
        dest="xlsx_name",
        default=default_xlsx_name,
        help="Name of the XLSX file.",
    )
    cli.add_argument(
        "--generate-gsheet",
        action="store_true",
        dest="generate_gsheet",
        default=False,
        help="Generate a Google Sheet after all data was retrieved.",
    )
    # Repeatable: every occurrence appends a [PROFILE, BUCKET, PREFIX] list.
    cli.add_argument(
        "--billing",
        action="append",
        nargs=3,
        metavar=("PROFILE", "BUCKET", "PREFIX"),
        default=[],
        help="Get billing data from s3:/BUCKET/PREFIX using PROFILE.",
    )
    # Repeatable: every occurrence appends a one-element [PROFILE] list.
    cli.add_argument(
        "--ec2",
        action="append",
        nargs=1,
        metavar="PROFILE",
        default=[],
        help="Get EC2 data for PROFILE.",
    )

    namespace = cli.parse_args()
    return namespace, cli


def try_mkdir(path):
    """Create directory *path* if it does not exist yet.

    Missing parent directories are created as well, so nested paths no
    longer depend on their parents having been created beforehand.

    Args:
        path: Directory to create.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except FileExistsError:
        # Raised only when *path* exists but is not a directory; tolerated
        # silently, as this function always has for an existing path.
        pass


# Create the working directories at import time; parents are listed before
# their children.
for _workdir in (
    "in",
    "in/usagecost",
    "out",
    "out/reservation-usage",
    "out/instance-reservation-usage",
    "out/instance-size-recommendation",
    "out/instance-metadata",
    "out/last-month",
    "out/s3",
):
    try_mkdir(_workdir)

# Region used by get_regions() when the session has no region configured.
default_region = "us-east-1"

def awsenv(profile, region):
    """Return the ``util/awsenv`` command prefix for *profile* and *region*.

    The result is a plain shell string meant to be followed by the command
    to run; the values are inserted without any quoting.
    """
    template = "util/awsenv --profile {} --region {}"
    return template.format(profile, region)


def build_billing_diff():
    """Run ``src/get_bill_diff.py`` through the shell; its exit status is ignored."""
    script = "src/get_bill_diff.py"
    os.system(script)


def build_instance_history():
    """Run ``src/get_ec2_instance_history.py`` through the shell; its exit status is ignored."""
    script = "src/get_ec2_instance_history.py"
    os.system(script)

def build_ec2_last_month_usage():
    """Run ``src/get_last_month_ec2_cost.py`` through the shell; its exit status is ignored."""
    script = "src/get_last_month_ec2_cost.py"
    os.system(script)

def build_ebs_last_month_usage():
    """Run ``src/get_last_month_ebs_cost.py`` through the shell; its exit status is ignored."""
    script = "src/get_last_month_ebs_cost.py"
    os.system(script)

def build_s3_cost():
    """Run ``src/get_s3_cost.py`` through the shell; its exit status is ignored."""
    script = "src/get_s3_cost.py"
    os.system(script)

def build_gsheet():
    """Run ``src/make_gsheet.py`` through the shell; its exit status is ignored."""
    script = "src/make_gsheet.py"
    os.system(script)


def build_xlsx(name):
    """Run ``src/make_xlsx.py`` through the shell with *name* as its argument.

    *name* is inserted into the command line without quoting, and the exit
    status of the script is ignored.
    """
    command = "src/make_xlsx.py {}".format(name)
    os.system(command)

def get_session(profile):
    """Return a boto3 session for *profile*.

    The special name ``'env'`` yields a session built without a profile
    name; any other value is passed to boto3 as ``profile_name``.
    """
    if profile == 'env':
        return boto3.Session()
    return boto3.Session(profile_name=profile)

def do_get_billing_data(profile, bucket, prefix):
    """Download and extract recent Cost And Usage Report files from S3.

    Manifest objects (``*.json`` keys) listed under *prefix* in *bucket* are
    read; every report file they reference is downloaded into
    ``in/usagecost`` (at most four downloads run at a time) and the
    downloaded ``.zip``/``.gz`` archives are then extracted and removed.

    Only manifests whose parent directory name sorts at or after the first
    day of the month six months ago (``YYYYMMDD``) are considered.

    Args:
        profile: Profile name handed to ``get_session``.
        bucket: Bucket in which the manifests are listed and read.
        prefix: Key prefix under which the manifests are listed.

    The process exits through ``sys.exit`` when the session cannot be
    created or the listing fails. A failed download or extraction is printed
    and does not stop the remaining files.
    """
    usagecost_dir = "in/usagecost"
    max_concurrent_downloads = 4

    # Short, stable tag derived from the source location; it prefixes every
    # file downloaded by this call so that several billing sources can share
    # the same directory.
    nonce = hashlib.sha1("{}{}".format(bucket, prefix).encode()).hexdigest()[:12]
    it = 1
    # One slot per allowed concurrent download: acquired before a download
    # thread starts, released by that thread when it finishes.
    download_slots = threading.BoundedSemaphore(max_concurrent_downloads)
    thread = []

    def save_to_file(s3_client, report_bucket, file_name, report_key):
        """Download one report file; always give the download slot back."""
        try:
            s3_client.download_file(Bucket=report_bucket, Key=report_key, Filename=file_name)
        except Exception as e:
            print(e)
        finally:
            download_slots.release()

    def analyze_report(s3_client, report_bucket, report_keys):
        """Start one download thread per report key, throttled by the slots."""
        nonlocal it
        for report_key in report_keys:
            if not download_slots.acquire(blocking=False):
                print("    Waiting to download {}...".format(report_key))
                download_slots.acquire()
            # The local name keeps the extension of the report key so that
            # unzip_obj can tell zip from gzip archives.
            file_name = "in/usagecost/{}.{}.csv.{}".format(nonce, it, report_key.split(".")[-1])
            t = threading.Thread(
                name=report_key,
                target=save_to_file,
                args=(s3_client, report_bucket, file_name, report_key),
            )
            print("    Downloading {}...".format(report_key))
            t.start()
            thread.append(t)
            it += 1

    def analyze_obj(s3_client, objs):
        """Read every manifest, queue its report files and wait for all downloads."""
        total = len(objs)
        for current, obj in enumerate(objs, 1):
            print("  Getting bill files from {} ({}/{})...".format(obj["Key"], current, total))
            content = s3_client.get_object(Bucket=bucket, Key=obj["Key"])["Body"].read().decode("utf-8")
            content_json = json.loads(content)
            # Manifests without a "bucket" entry are skipped.
            if "bucket" in content_json:
                analyze_report(s3_client, content_json["bucket"], content_json["reportKeys"])
        for t in thread:
            t.join()

    def unzip_obj():
        """Extract, then delete, the archives downloaded by this call."""
        for file_name in os.listdir(usagecost_dir):
            # Leave everything that is not one of this call's archives
            # alone. Deleting unconditionally would wipe the CSV files
            # extracted for another billing source or by an earlier run.
            if not (file_name.startswith(nonce) and file_name.endswith((".zip", ".gz"))):
                continue
            archive_path = os.path.join(usagecost_dir, file_name)
            try:
                print("Extracting {}...".format(file_name))
                if file_name.endswith(".zip"):
                    with zipfile.ZipFile(archive_path, "r") as z:
                        z.extractall(usagecost_dir)
                else:
                    with gzip.GzipFile(archive_path, "r") as z:
                        # Strip the trailing ".gz" to name the extracted file.
                        with open(os.path.join(usagecost_dir, file_name[:-3]), "wb+") as f:
                            shutil.copyfileobj(z, f)
            except Exception as e:
                print("Failed to extract {}: {}".format(file_name, e))
            finally:
                os.remove(archive_path)

    def is_recent_manifest(key, min_date):
        """Tell whether *key* is a top-level manifest of a recent billing period."""
        parts = key.split('/')
        # NOTE(review): the four-segment check appears to assume that PREFIX
        # is a single path segment (PREFIX/report/period/manifest.json) -
        # confirm against real bucket layouts.
        return key.endswith(".json") and len(parts) == 4 and parts[-2] >= min_date

    try:
        session = get_session(profile)
        s3_client = session.client("s3")
        pages = s3_client.get_paginator("list_objects").paginate(Bucket=bucket, Prefix=prefix)
        min_date = (datetime.now() + dateutil.relativedelta.relativedelta(months=-6)).replace(day=1).strftime('%Y%m%d')
        objs = [
            obj
            for p in pages
            # A page without any matching key carries no "Contents" entry.
            for obj in p.get("Contents", [])
            if is_recent_manifest(obj["Key"], min_date)
        ]
    except Exception as e:
        sys.exit(e)
    analyze_obj(s3_client, objs)
    unzip_obj()


def do_get_instance_data(profile, region):
    """Run the EC2 recommendation and metadata scripts in parallel.

    Both scripts are launched with ``os.system`` behind the ``awsenv``
    command prefix for *profile* and *region*, each in its own thread; the
    call returns once both have finished.
    """
    env_prefix = awsenv(profile, region)
    commands = [
        "{} src/get_ec2_recommendations.py".format(env_prefix),
        "{} src/get_ec2_metadata.py".format(env_prefix),
    ]
    workers = [
        threading.Thread(target=os.system, args=(command,))
        for command in commands
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


def recursively_remove_file(path):
    """Delete *path* if it is a file, or every file below it if it is a directory.

    Directories themselves are kept; only their file contents are removed.
    """
    if not os.path.isdir(path):
        os.remove(path)
        return
    for entry in os.listdir(path):
        recursively_remove_file(os.path.join(path, entry))


def clear_data():
    """Remove the files below ``out`` and ``in``, keeping ``in/persistent``.

    Only files are deleted; directories stay in place. The ``in/persistent``
    entry is skipped when it is a directory.
    """
    for entry in os.listdir("out"):
        recursively_remove_file(os.path.join("out", entry))

    for entry in os.listdir("in"):
        candidate = os.path.join("in", entry)
        if os.path.isdir(candidate) and candidate == "in/persistent":
            continue
        recursively_remove_file(candidate)

def get_regions(session):
    """Return the region names reported by EC2 ``describe_regions``.

    The EC2 client is created in the session's own region, falling back to
    ``default_region`` when the session has none.
    """
    ec2 = session.client('ec2', region_name=session.region_name or default_region)
    names = []
    for entry in ec2.describe_regions()['Regions']:
        names.append(entry['RegionName'])
    return names

def main():
    """Run the whole pipeline according to the command-line options.

    Steps, in order: optionally clear previous data, fetch the on-demand
    cost file when it is missing, download billing data for every
    ``--billing`` source, collect EC2 data and metadata for every ``--ec2``
    profile in every region, then run the processing scripts and build the
    requested spreadsheet(s).
    """
    args, _parser = parse_args()
    # Every --ec2 occurrence is parsed as a one-element list; keep the bare
    # profile names only.
    args.ec2 = [entry[0] for entry in args.ec2]
    # NOTE: running with neither --billing nor --ec2 is accepted; the
    # download steps are then skipped.

    if args.clear_before:
        clear_data()

    if not os.path.isfile("in/ondemandcosts.json"):
        os.system("src/get_ec2_costs.sh")

    for profile, bucket, prefix in args.billing:
        print("Download billings for {}...".format(profile))
        do_get_billing_data(profile, bucket, prefix)

    if args.ec2:
        # The region list is resolved with the first profile only.
        regions = get_regions(get_session(args.ec2[0]))
        all_profiles = ' '.join(args.ec2)

        # One get_ec2_data.py run per region, covering all profiles at once.
        data_jobs = []
        for region in regions:
            print("Fetching ec2 data for all accounts in {}...".format(region))
            command = "src/get_ec2_data.py --region {} --profile {}".format(region, all_profiles)
            job = threading.Thread(target=os.system, args=(command,))
            data_jobs.append((region, job))
            job.start()
        for region, job in data_jobs:
            job.join()
            print("Fetched ec2 data for all accounts in {}".format(region))

        # Metadata is fetched profile by profile, all regions in parallel.
        for ec in args.ec2:
            metadata_jobs = []
            for region in regions:
                print("Fetching ec2 metadata for {} in {}...".format(ec, region))
                job = threading.Thread(target=do_get_instance_data, args=(ec, region))
                metadata_jobs.append((region, job))
                job.start()
            for region, job in metadata_jobs:
                job.join()
                print("Fetched ec2 metadata for {} in {}".format(ec, region))

    if not (args.generate_gsheet or args.generate_xslx):
        return

    steps = [
        ("billing diff", build_billing_diff),
        ("instance history", build_instance_history),
        ("ec2 last month", build_ec2_last_month_usage),
        ("ebs last month", build_ebs_last_month_usage),
        ("s3 cost", build_s3_cost),
    ]
    for position, (label, build) in enumerate(steps, 1):
        print("Processing billing data ({}/{} - {})...".format(position, len(steps), label))
        build()

    if args.generate_gsheet:
        build_gsheet()
    if args.generate_xslx:
        build_xlsx(args.xlsx_name)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()