Skip to content

Commit

Permalink
Update links in TLC to be GitHub links (huggingface#1737)
Browse files Browse the repository at this point in the history
* Update links to be GitHub links

* format code
  • Loading branch information
chameleonTK authored and eusip committed Jan 21, 2021
1 parent 6cd4e40 commit 34d34a7
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion datasets/tlc/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"tlcv2.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"ch_num": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tlcv2.0", "version": {"version_str": "2.0.0", "description": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32498, "num_examples": 1, "dataset_name": "tlc"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1S2T72b3Kkcvy4XZcxwIipoRn6ELa4hhV": {"num_bytes": 5551710, "checksum": "56aff251d17a33b4c6a12d2b41c9fe6aaa676d08a87d12e253f465107b13ae88"}}, "download_size": 5551710, "post_processing_size": null, "dataset_size": 32498, "size_in_bytes": 5584208}, "tlcv1.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of 
two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"ch_num": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tlcv1.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32498, "num_examples": 1, "dataset_name": "tlc"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=15E64fwMeAff0bAsFGaSsv9NYeVHn1drE": {"num_bytes": 2904472, "checksum": "9b27c79bfde97960f4cb3530a2e6a89360c219c0530e8aea593883f4847214df"}}, "download_size": 2904472, "post_processing_size": null, "dataset_size": 32498, "size_in_bytes": 2936970}, "tnhcv1.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 
(6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tnhcv1.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 25198, "num_examples": 152, "dataset_name": "tlc"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1T_ib-NOwQV6O6lEjCjvZReUA3pQ4h-gD": {"num_bytes": 1465403, "checksum": "ba534fb18bb07595a82e3981a164384494b01d49918939b11fe417423a637e6f"}}, "download_size": 1465403, "post_processing_size": null, "dataset_size": 25198, "size_in_bytes": 1490601}}
{"tlcv1.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"ch_num": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tlcv1.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32498, "num_examples": 1, "dataset_name": "tlc"}}, "download_checksums": {"https://github.com/jitkapat/thailitcorpus/releases/download/v.1.0/tlc_v.1.0.tar.gz": {"num_bytes": 2904472, "checksum": "9b27c79bfde97960f4cb3530a2e6a89360c219c0530e8aea593883f4847214df"}}, "download_size": 2904472, "post_processing_size": null, "dataset_size": 32498, "size_in_bytes": 2936970}, "tlcv2.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of 
two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"ch_num": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tlcv2.0", "version": {"version_str": "2.0.0", "description": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32498, "num_examples": 1, "dataset_name": "tlc"}}, "download_checksums": {"https://github.com/jitkapat/thailitcorpus/releases/download/v.2.0/tlc_v.2.0.tar.gz": {"num_bytes": 5551710, "checksum": "56aff251d17a33b4c6a12d2b41c9fe6aaa676d08a87d12e253f465107b13ae88"}}, "download_size": 5551710, "post_processing_size": null, "dataset_size": 32498, "size_in_bytes": 5584208}, "tnhcv1.0": {"description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc 
v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)\n", "citation": "@misc{\n author={Sawatphol, Jitkapat},\n title={Thai Literature Corpora},\n year={2019},\n howpublished={\\url{https://attapol.github.io/tlc.html}}\n}\n", "homepage": "https://attapol.github.io/tlc.html", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tlc", "config_name": "tnhcv1.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 25198, "num_examples": 152, "dataset_name": "tlc"}}, "download_checksums": {"https://github.com/jitkapat/thailitcorpus/releases/download/v.1.0/tnhc_v.1.0.tar.gz": {"num_bytes": 1465403, "checksum": "ba534fb18bb07595a82e3981a164384494b01d49918939b11fe417423a637e6f"}}, "download_size": 1465403, "post_processing_size": null, "dataset_size": 25198, "size_in_bytes": 1490601}}
7 changes: 4 additions & 3 deletions datasets/tlc/tlc.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
"""

_URLs = {
"tlcv1.0": "https://drive.google.com/uc?export=download&id=15E64fwMeAff0bAsFGaSsv9NYeVHn1drE",
"tlcv2.0": "https://drive.google.com/uc?export=download&id=1S2T72b3Kkcvy4XZcxwIipoRn6ELa4hhV",
"tnhcv1.0": "https://drive.google.com/uc?export=download&id=1T_ib-NOwQV6O6lEjCjvZReUA3pQ4h-gD",
"tlcv1.0": "https://github.com/jitkapat/thailitcorpus/releases/download/v.1.0/tlc_v.1.0.tar.gz",
"tlcv2.0": "https://github.com/jitkapat/thailitcorpus/releases/download/v.2.0/tlc_v.2.0.tar.gz",
"tnhcv1.0": "https://github.com/jitkapat/thailitcorpus/releases/download/v.1.0/tnhc_v.1.0.tar.gz",
}


Expand Down Expand Up @@ -104,6 +104,7 @@ def _info(self):

def _split_generators(self, dl_manager):
data_path = dl_manager.download_and_extract(_URLs[self.config.name])

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand Down

0 comments on commit 34d34a7

Please sign in to comment.