Skip to content

Commit

Permalink
add raw dictionary upload automation
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Aug 3, 2023
1 parent 2ef2223 commit 32420e5
Show file tree
Hide file tree
Showing 4 changed files with 359 additions and 0 deletions.
71 changes: 71 additions & 0 deletions scripts/01_upload_raw_dictionaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from pathlib import Path

from classopt import classopt

from aws_common import CredentialCache
from raw_listing import generate_raw_listing


@classopt(default_long=True)
class Opts:
    # Command-line options of the raw dictionary upload script.
    # (Indentation restored; the field declarations themselves are unchanged.)
    input: Path  # directory that must contain small_lex.zip, core_lex.zip and notcore_lex.zip
    version: str  # key component placed between s3_prefix and the file name on upload
    aws_profile: str  # handed to CredentialCache as the profile
    aws_mfa: str  # handed to CredentialCache as the MFA argument
    aws_region: str = "ap-northeast-1"  # region_name used for the S3 resource
    s3_bucket: str = "sudachi"  # bucket receiving the uploads and index.html
    s3_prefix: str = "sudachidict-raw"  # key prefix for the uploads and index.html


def validate_file(file: Path) -> Path:
    """Return *file* unchanged if it is an existing regular file.

    Args:
        file: path of a file the upload requires.

    Returns:
        The same path, so calls can be used inline while building a list.

    Raises:
        FileNotFoundError: if *file* is missing or is not a regular file.
    """
    # is_file() rather than exists(): a directory with the expected name
    # would pass an existence check and only fail later when opened for upload.
    if file.is_file():
        return file
    raise FileNotFoundError(f"required file {file} was not present")


def validate_files(args: Opts) -> list[Path]:
    """Check the three required dictionary archives under ``args.input``.

    Returns the paths in the order small, core, notcore; the first missing
    archive makes ``validate_file`` raise ``FileNotFoundError``.
    """
    required_names = ("small_lex.zip", "core_lex.zip", "notcore_lex.zip")
    return [validate_file(args.input / name) for name in required_names]


def make_client(args):
    """Build an S3 resource from the session held by ``CredentialCache``.

    Uses ``args.aws_profile`` and ``args.aws_mfa`` for the credentials and
    ``args.aws_region`` as the region name.
    """
    credentials = CredentialCache(args.aws_profile, args.aws_mfa)
    session = credentials.session
    return session.resource("s3", region_name=args.aws_region)


def upload_files(client, args: Opts, files: list[Path]):
    """Upload every path in *files* to ``<s3_prefix>/<version>/<file name>``.

    Each file is sent with the ``application/zip`` content type to the bucket
    named by ``args.s3_bucket``; one summary line is printed per upload.
    """
    target_bucket = client.Bucket(args.s3_bucket)
    for path in files:
        destination = f"{args.s3_prefix}/{args.version}/{path.name}"
        with path.open("rb") as stream:
            uploaded = target_bucket.put_object(
                Body=stream,
                Key=destination,
                ContentType="application/zip",
            )
        print(
            "put", path,
            "size", uploaded.content_length,
            "to", destination,
            "etag", uploaded.e_tag,
        )


def regenerate_index(s3, args: Opts):
    """Rebuild the listing page and store it as ``<s3_prefix>/index.html``.

    The page text comes from ``generate_raw_listing`` for the configured
    bucket and prefix and is uploaded as UTF-8 encoded HTML.
    """
    page = generate_raw_listing(s3, args.s3_bucket, args.s3_prefix)
    index_key = f"{args.s3_prefix}/index.html"

    s3.Bucket(args.s3_bucket).put_object(
        Body=page.encode("utf-8"),
        Key=index_key,
        ContentType="text/html; charset=utf-8",
    )
    print("updated index.html")


def main(args: Opts):
    """Run the upload: validate inputs, upload them, then refresh the index."""
    archives = validate_files(args)
    s3 = make_client(args)
    upload_files(s3, args, archives)
    regenerate_index(s3, args)


if __name__ == "__main__":
    options = Opts.from_args()
    main(options)
87 changes: 87 additions & 0 deletions scripts/aws_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import datetime
import json
from pathlib import Path
from typing import Optional

import boto3

# Directory containing this script.
THIS_DIR = Path(__file__).parent
# "build" directory next to the scripts directory; the credential cache lives below it.
BUILD_DIR = THIS_DIR.parent / "build"


class CredentialCache(object):
    """Provide a boto3 session in ``self.session``, reusing cached MFA credentials.

    Resolution order:

    1. still-valid credentials from the on-disk cache (``CachedCredentials``);
    2. without an MFA serial, the plain session of *profile*;
    3. otherwise a one-hour STS session token obtained with an MFA code read
       from standard input, which is then written to the cache.
    """

    def __init__(self, profile: Optional[str] = None, mfa: Optional[str] = None):
        """Resolve the session.

        Args:
            profile: profile name passed to ``boto3.Session``.
            mfa: serial number of the MFA device passed to STS; ``None`` skips
                the MFA exchange.
        """
        cached_creds = CachedCredentials()
        if cached_creds.valid():
            self.session = cached_creds.session()
            return

        session = boto3.Session(
            profile_name=profile
        )

        if mfa is None:
            # Nothing to exchange or cache. Previously this path built a
            # session from an empty cache object and failed.
            self.session = session
            return

        sts = session.client("sts")
        mfa_code = input("Input your MFA code: ").strip()
        response = sts.get_session_token(
            DurationSeconds=60 * 60,
            SerialNumber=mfa,
            TokenCode=mfa_code
        )
        creds = response['Credentials']
        cached_creds.cache(creds)

        if cached_creds.valid():
            self.session = cached_creds.session()
        else:
            # cache() declined to store the token (it treats tokens close to
            # expiry as unusable); the freshly issued token is still used
            # for this run instead of crashing.
            self.session = boto3.Session(
                aws_access_key_id=creds['AccessKeyId'],
                aws_secret_access_key=creds['SecretAccessKey'],
                aws_session_token=creds['SessionToken']
            )


class CachedCredentials(object):
    """Temporary AWS credentials persisted in ``BUILD_DIR / "cache/aws/creds.json"``.

    After construction ``valid()`` tells whether usable credentials were
    loaded; ``cache()`` stores new ones and ``session()`` builds a boto3
    session from them.
    """

    def __init__(self):
        """Load credentials from the cache file, if present and still usable."""
        creds_file = BUILD_DIR / "cache/aws/creds.json"
        self.access_key_id = None
        self.secret_access_key = None
        self.session_token = None
        if not creds_file.exists():
            return

        try:
            with creds_file.open("rt", encoding='utf-8') as f:
                record = self._normalize(json.load(f))
            if record is not None:
                self._adopt(record)
        except (ValueError, KeyError, TypeError):
            # A corrupt or incomplete cache file is the same as no cache:
            # the caller falls back to requesting fresh credentials.
            self.access_key_id = None

    def valid(self) -> bool:
        """Return True when usable credentials are loaded."""
        return self.access_key_id is not None

    @staticmethod
    def _normalize(data):
        """Return a JSON-serializable copy of *data*, or None if it expires too soon.

        ``Expiration`` may be a ``datetime`` or an ISO-8601 string; in the
        copy it is always a string. A timestamp without timezone is taken to
        be UTC. The input mapping is never modified.
        """
        record = dict(data)
        expiration = record['Expiration']
        if isinstance(expiration, datetime.datetime):
            record['Expiration'] = expiration.isoformat()
        elif isinstance(expiration, str):
            expiration = datetime.datetime.fromisoformat(expiration)
        else:
            raise TypeError(f"unsupported Expiration value: {expiration!r}")

        if expiration.tzinfo is None:
            expiration = expiration.replace(tzinfo=datetime.timezone.utc)
        # Aware arithmetic: an offset other than UTC is converted, not dropped.
        remaining = expiration - datetime.datetime.now(datetime.timezone.utc)

        # if session token validity < 60 sec, reject it
        if remaining.total_seconds() < 60:
            return None
        return record

    def _adopt(self, record):
        """Copy the key fields of *record* onto this instance (all or nothing)."""
        key_id = record['AccessKeyId']
        secret = record['SecretAccessKey']
        token = record['SessionToken']
        self.access_key_id = key_id
        self.secret_access_key = secret
        self.session_token = token

    def cache(self, data):
        """Store *data* in the cache file and on this instance.

        Args:
            data: mapping with ``AccessKeyId``, ``SecretAccessKey``,
                ``SessionToken`` and ``Expiration`` entries.

        Credentials with less than 60 seconds of validity left are ignored:
        nothing is written and the instance is left unchanged.
        """
        record = self._normalize(data)
        if record is None:
            return

        creds_file = BUILD_DIR / "cache/aws/creds.json"
        creds_file.parent.mkdir(parents=True, exist_ok=True)
        # The file holds secrets: restrict it to the owner before writing.
        creds_file.touch(mode=0o600, exist_ok=True)
        creds_file.chmod(0o600)

        with creds_file.open("wt", encoding='utf-8') as of:
            json.dump(record, of)

        self._adopt(record)

    def session(self):
        """Return a ``boto3.Session`` built from the loaded credentials."""
        return boto3.Session(
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
            aws_session_token=self.session_token
        )


if __name__ == "__main__":
    import sys

    # Manual check: resolve a session for <profile> <mfa serial>.
    profile_arg = sys.argv[1]
    mfa_arg = sys.argv[2]
    cache = CredentialCache(profile_arg, mfa_arg)
199 changes: 199 additions & 0 deletions scripts/raw_listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import io
from typing import IO

# Tag names that block_tag() does not count as block-level.
INLINE_TAGS = {"a", "span", "i"}


def block_tag(name) -> bool:
    """Tell whether *name* is a block-level tag, i.e. not in ``INLINE_TAGS``."""
    is_inline = name in INLINE_TAGS
    return not is_inline


class DocumentRenderer(object):
    """Write a nested ``(tag, children, attributes)`` structure to a buffer as HTML.

    Every node is a 3-tuple ``(name, args, kwargs)``: ``name`` is the tag
    name, ``args`` holds the children (nested node tuples, or any other value,
    which is written as text via ``str``) and ``kwargs`` maps attribute names
    to values. Text and attribute values are HTML-escaped on output.
    """

    # HTML void elements: they cannot have content and take no end tag.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "source", "track", "wbr",
    })

    def __init__(self, buf: IO[str]):
        """Create a renderer that writes to *buf*."""
        self.buf = buf
        self._indent_base = 2  # indentation added per nesting level
        self._indent = 0  # current indentation width
        self._in_render = False  # True while render() runs; not read by this class

    @staticmethod
    def _escape(value) -> str:
        """Return ``str(value)`` with ``&``, ``<``, ``>`` and both quote kinds escaped."""
        # Local import keeps the class usable without touching the module imports.
        from html import escape

        return escape(str(value), quote=True)

    def _start_tag(self, name: str, attrs: dict[str, str]):
        """Write ``<name key='value' ...>`` with escaped attribute values."""
        buf = self.buf
        buf.write("<")
        buf.write(name)
        for akey, aval in attrs.items():
            buf.write(" ")
            buf.write(akey)
            buf.write("='")
            # Values are single-quoted, so an unescaped quote would end the attribute.
            buf.write(self._escape(aval))
            buf.write("'")
        buf.write(">")

    def _end_tag(self, name):
        """Write ``</name>``."""
        buf = self.buf
        buf.write("</")
        buf.write(name)
        buf.write(">")

    def _render_first(self, tag, args, kwargs):
        """Render the root node, keeping ``_in_render`` set for the duration."""
        self._in_render = True
        try:
            self._render(tag, args, kwargs)
        finally:
            self._in_render = False

    def _newline(self):
        """Start a new line and indent it to the current level."""
        buf = self.buf
        buf.write("\n")
        for _ in range(self._indent):
            buf.write(" ")

    def _render(self, tag, args, kwargs):
        """Render one node and, recursively, its children."""
        # Children get their own indented lines only when at least one of
        # them is a block-level tag; otherwise everything stays on one line.
        has_children = any(isinstance(x, tuple) and block_tag(x[0]) for x in args)

        self._start_tag(tag, kwargs)
        if not args and tag in self._VOID_TAGS:
            # e.g. <link> / <meta>: an end tag here would be invalid HTML.
            return

        if has_children:
            self._indent += self._indent_base
        for child in args:
            if isinstance(child, tuple):
                tag_name, tag_args, tag_kwargs = child
                if has_children:
                    self._newline()
                self._render(tag_name, tag_args, tag_kwargs)
            else:
                self.buf.write(self._escape(child))
        if has_children:
            self._indent -= self._indent_base
            self._newline()
        self._end_tag(tag)

    def render(self, structure):
        """Render *structure*, a ``(name, args, kwargs)`` node, into the buffer."""
        name, args, kwargs = structure
        self._render_first(name, args, kwargs)


class DocStructure(object):
    """Factory whose attributes are tag builders: ``r.td`` is ``TagWrapper("td")``."""

    def __getattr__(self, item):
        # Only reached for names not found normally, i.e. any tag name.
        builder = TagWrapper(item)
        return builder


class TagWrapper(object):
    """Callable that packs its arguments into a ``(name, args, kwargs)`` node."""

    def __init__(self, name: str):
        self._name = name

    def __call__(self, *args, **kwargs):
        # Positional arguments are the children, keyword arguments the attributes.
        node = (self._name, args, kwargs)
        return node


def gather_files(s3, bucket, prefix):
    """Collect the ``*_lex.zip`` objects below *prefix*, grouped by version.

    Args:
        s3: S3 resource providing ``Bucket(name).objects.filter(...)``.
        bucket: bucket name.
        prefix: key prefix, with or without a trailing slash.

    Returns:
        ``{version: {file name: "version/file name"}}`` for every key of the
        form ``<prefix>/<version>/<name>_lex.zip``. Keys of any other shape
        (directly under the prefix, nested deeper, or under a sibling prefix
        that merely starts with the same characters) are skipped.
    """
    bucket_obj = s3.Bucket(bucket)
    # Compare against "<prefix>/" so that neither a trailing slash on the
    # argument nor a sibling such as "<prefix>-old/" skews the key slicing.
    key_prefix = f"{prefix.rstrip('/')}/" if prefix else ""

    by_version = dict()

    for obj in bucket_obj.objects.filter(Prefix=prefix, MaxKeys=5000):
        key = obj.key
        if not key.endswith("_lex.zip") or not key.startswith(key_prefix):
            continue
        key_wo_prefix = key[len(key_prefix):]
        parts = key_wo_prefix.split("/")
        # Exactly "<version>/<file>", both non-empty; unpacking anything
        # else used to raise ValueError.
        if len(parts) != 2 or not all(parts):
            continue
        version, file = parts
        by_version.setdefault(version, dict())[file] = key_wo_prefix

    return by_version


def render_table(data: dict[str, dict[str, str]]):
    """Build the release table node from ``{version: {file name: key}}``.

    Rows are the versions in descending string order. Each row has the
    version followed by one cell per known archive, holding a link to the
    key or an empty string when that archive is absent.
    """
    doc = DocStructure()
    columns = ["small_lex.zip", "core_lex.zip", "notcore_lex.zip"]

    def file_cell(files, column):
        # Link to the archive when present, otherwise an empty cell.
        key = files.get(column)
        content = "" if key is None else doc.a(column, href=key)
        return doc.td(content)

    # Header: a "Release" label followed by one blank cell per archive column.
    head_row = doc.tr(
        doc.td("Release"),
        *[doc.td() for _ in columns]
    )

    body_rows = []
    for version in sorted(data.keys(), reverse=True):
        files = data.get(version)
        cells = [doc.td(version)]
        cells.extend(file_cell(files, column) for column in columns)
        body_rows.append(doc.tr(*cells))

    return doc.table(
        doc.thead(head_row),
        doc.tbody(*body_rows),
    )


def compute_cols(data):
    """Return the sorted list of all keys used by the inner mappings of *data*."""
    names = set().union(*data.values())
    return sorted(names)


def render_doc(table=""):
    """Render the complete listing page as an HTML string.

    Args:
        table: node (or text) placed at the end of the body, normally the
            result of ``render_table``.

    Returns:
        The document text, starting with the ``<!doctype html>`` line.
    """
    r = DocStructure()
    structure = r.html(
        r.head(
            r.link(rel="stylesheet", href="/css/style.css"),
            r.title("Sudachi Dictionary Sources (CSV)"),
            r.meta(name="viewport", content="width=device-width, initial-scale=1"),
        ),
        r.body(
            r.h1(
                "Sudachi Dictionary Sources"
            ),
            r.p(
                "You may also need the ",
                r.a(
                    "matrix.def",
                    href="matrix.def.zip",
                ),
                " file to build the binary dictionary"
            ),
            table
        ),
        # The document language belongs on <html>; it was previously
        # attached to <head> by mistake.
        lang="en",
    )
    iobj = io.StringIO()
    iobj.write("<!doctype html>\n")
    DocumentRenderer(iobj).render(structure)

    return iobj.getvalue()


def generate_raw_listing(s3, bucket="sudachi", prefix="sudachidict-raw") -> str:
    """Return the HTML listing page for the dictionary archives in S3.

    Args:
        s3: S3 resource used to list the bucket.
        bucket: bucket name.
        prefix: key prefix below which the archives are stored. The default
            now matches the uploader's ``sudachidict-raw``; the previous
            default was misspelled and pointed at a different prefix.

    Returns:
        The complete HTML document as a string.
    """
    files = gather_files(s3, bucket, prefix)
    return render_doc(render_table(files))


def _main():
    """Print the listing page for the default bucket and prefix.

    Usage: ``raw_listing.py AWS_PROFILE MFA_SERIAL``.
    """
    import sys
    from aws_common import CredentialCache

    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} AWS_PROFILE MFA_SERIAL")

    s3 = CredentialCache(sys.argv[1], sys.argv[2]).session.resource("s3")
    # generate_raw_listing() already returns the complete document; passing
    # it through render_doc() again nested one page inside another.
    print(generate_raw_listing(s3))


if __name__ == '__main__':
    _main()
2 changes: 2 additions & 0 deletions scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
boto3
classopt

0 comments on commit 32420e5

Please sign in to comment.