Skip to content

Commit

Permalink
Implement web_download_unxz (#184)
Browse files Browse the repository at this point in the history
  • Loading branch information
lovit committed Jan 24, 2021
1 parent e592a79 commit 2d822d8
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions Korpora/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def web_download_untar(url, tar_path, corpus_name='', force_download=False):
data_root = os.path.dirname(tar_path)
with tarfile.open(tar_path) as tar:
tar.extractall(data_root)
print(f'decompress {tar_path}')
print(f'untar {tar_path}')


def web_download_ungzip(url, gzip_path, corpus_name='', force_download=False):
Expand All @@ -155,7 +155,23 @@ def web_download_ungzip(url, gzip_path, corpus_name='', force_download=False):
with gzip.open(gzip_path, 'rb') as fi:
with open(data_path, 'wb') as fo:
shutil.copyfileobj(fi, fo)
print(f'decompress {gzip_path}')
print(f'ungzip {gzip_path}')


def web_download_unxz(url, xz_path, corpus_name='', force_download=False):
    """Download an ``.xz`` archive and decompress it next to the archive.

    The archive is fetched with ``web_download`` and then unpacked, line by
    line, into a file whose path is ``xz_path`` with its last three
    characters (the ``.xz`` suffix) removed.

    Args:
        url: Remote location, passed through to ``web_download``.
        xz_path: Local path of the archive; expected to end with ``.xz``.
        corpus_name: Name used in the "already installed" message and
            passed through to ``web_download``.
        force_download: When false and the decompressed file already
            exists, decompression is skipped.

    Returns:
        None.
    """
    # Function-scope import so this block does not rely on a module-level
    # `import lzma` that is not visible from here.
    import lzma

    web_download(url, xz_path, corpus_name, force_download)
    # Assume that path/to/abc.xz decompresses to path/to/abc.
    xz_path = os.path.abspath(xz_path)
    data_path = xz_path[:-3]
    if (not force_download) and os.path.exists(data_path):
        print(f'[Korpora] Corpus `{corpus_name}` is already installed at {data_path}')
        return None
    # Derive the progress-bar label with os.path helpers: unpacking
    # `rsplit(os.path.sep, 2)` into three names raises ValueError when the
    # absolute path has fewer than two separators (e.g. `/abc`).
    filename = os.path.basename(data_path)
    parent = os.path.basename(os.path.dirname(data_path))
    # Decode explicitly as UTF-8 so the read side matches the UTF-8 write
    # side instead of depending on the locale's default encoding.
    with lzma.open(xz_path, mode='rt', encoding='utf-8') as fi:
        with open(data_path, 'w', encoding='utf-8') as fo:
            for line in tqdm(fi, desc=f'Unpacking {parent}/{filename}'):
                fo.write(line)
    print(f'unxz {xz_path}')


def google_drive_download(file_id, local_path, corpus_name='', force_download=False):
Expand Down Expand Up @@ -222,5 +238,7 @@ def fetch(remote_path, local_path, corpus_name=None, force_download=False, metho
web_download_untar(remote_path, destination, corpus_name, force_download)
elif method == "download & ungzip":
web_download_ungzip(remote_path, destination, corpus_name, force_download)
elif method == "download & unxz":
web_download_unxz(remote_path, destination, corpus_name, force_download)
else:
print(f'download method is not valid ({method})')

0 comments on commit 2d822d8

Please sign in to comment.