From 2d822d8c8f705a8bdedb90ff850046375c6c6ee8 Mon Sep 17 00:00:00 2001
From: lovit
Date: Mon, 25 Jan 2021 04:54:17 +0900
Subject: [PATCH] Implement `web_download_unxz` (#184)

---
 Korpora/utils.py | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/Korpora/utils.py b/Korpora/utils.py
index 0a3d28c..33442e1 100644
--- a/Korpora/utils.py
+++ b/Korpora/utils.py
@@ -142,7 +142,7 @@ def web_download_untar(url, tar_path, corpus_name='', force_download=False):
     data_root = os.path.dirname(tar_path)
     with tarfile.open(tar_path) as tar:
         tar.extractall(data_root)
-    print(f'decompress {tar_path}')
+    print(f'untar {tar_path}')
 
 
 def web_download_ungzip(url, gzip_path, corpus_name='', force_download=False):
@@ -155,7 +155,44 @@ def web_download_ungzip(url, gzip_path, corpus_name='', force_download=False):
     with gzip.open(gzip_path, 'rb') as fi:
         with open(data_path, 'wb') as fo:
             shutil.copyfileobj(fi, fo)
-    print(f'decompress {gzip_path}')
+    print(f'ungzip {gzip_path}')
+
+
+def web_download_unxz(url, xz_path, corpus_name='', force_download=False):
+    """Fetch an xz-compressed UTF-8 text file and unpack it beside the archive.
+
+    The archive is obtained through `web_download` and its decompressed text
+    is written to `xz_path` minus its last three characters, i.e. the `.xz`
+    suffix is assumed (`path/to/abc.xz` -> `path/to/abc`).
+
+    Args:
+        url: Remote location, passed through to `web_download`.
+        xz_path: Local path of the `.xz` archive.
+        corpus_name: Name shown in the "already installed" message.
+        force_download: If False and the unpacked file already exists, the
+            decompression is skipped; also passed through to `web_download`.
+
+    Returns:
+        None.
+    """
+    web_download(url, xz_path, corpus_name, force_download)
+    xz_path = os.path.abspath(xz_path)
+    # assume that path/to/abc.xz unpacks to path/to/abc
+    data_path = xz_path[:-3]
+    if (not force_download) and os.path.exists(data_path):
+        print(f'[Korpora] Corpus `{corpus_name}` is already installed at {data_path}')
+        return None
+    # Progress label `<parent dir>/<file name>`; os.path helpers also cope with
+    # shallow paths (e.g. `/abc`) where a fixed three-way split would raise.
+    parent_dir, filename = os.path.split(data_path)
+    parent = os.path.basename(parent_dir)
+    # Decode explicitly as UTF-8: without `encoding`, text-mode `lzma.open`
+    # uses the locale's preferred encoding, which may not match the output.
+    with lzma.open(xz_path, mode='rt', encoding='utf-8') as fi:
+        with open(data_path, 'w', encoding='utf-8') as fo:
+            for line in tqdm(fi, desc=f'Unpacking {parent}/{filename}'):
+                fo.write(line)
+    print(f'unxz {xz_path}')
 
 
 def google_drive_download(file_id, local_path, corpus_name='', force_download=False):
@@ -222,5 +259,7 @@ def fetch(remote_path, local_path, corpus_name=None, force_download=False, metho
         web_download_untar(remote_path, destination, corpus_name, force_download)
     elif method == "download & ungzip":
         web_download_ungzip(remote_path, destination, corpus_name, force_download)
+    elif method == "download & unxz":
+        web_download_unxz(remote_path, destination, corpus_name, force_download)
     else:
         print(f'download method is not valid ({method})')