Commit

Merge branch 'master' into add_item2

lhoestq committed Apr 27, 2021
2 parents f834767 + 909c58f commit d93bc76
Showing 269 changed files with 2,928 additions and 445 deletions.
5 changes: 3 additions & 2 deletions .circleci/config.yml
@@ -15,7 +15,7 @@ jobs:
       - run: source venv/bin/activate
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
-      - run: pip install pyarrow --upgrade
+      - run: pip install pyarrow==3.0.0
       - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/
 
   run_dataset_script_tests_pyarrow_1:
@@ -47,7 +47,7 @@ jobs:
       - run: "& venv/Scripts/activate.ps1"
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
-      - run: pip install pyarrow --upgrade
+      - run: pip install pyarrow==3.0.0
       - run: $env:HF_SCRIPTS_VERSION="master"
       - run: python -m pytest -sv ./tests/
 
@@ -81,6 +81,7 @@ jobs:
       - run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
       - run: isort --check-only tests src benchmarks datasets metrics
      - run: flake8 tests src benchmarks datasets metrics
+      - run: ./scripts/datasets_metadata_validator.py
 
   build_doc:
     working_directory: ~/datasets
1 change: 1 addition & 0 deletions .circleci/deploy.sh
@@ -34,6 +34,7 @@ deploy_doc "master" master
 
 # Example of how to deploy a doc on a certain commit (the commit doesn't have to be on the master branch).
 # The following commit would live on huggingface.co/docs/datasets/v1.0.0
+deploy_doc "e8fc41f" v1.6.1
 deploy_doc "40bb9e6" v1.6.0
 deploy_doc "f256b77" v1.5.0
 deploy_doc "ca41320" v1.4.1
2 changes: 1 addition & 1 deletion datasets/afrikaans_ner_corpus/afrikaans_ner_corpus.py
@@ -59,7 +59,7 @@ def __init__(self, **kwargs):
 
 
 class AfrikaansNerCorpus(datasets.GeneratorBasedBuilder):
-    """ Afrikaans Ner dataset"""
+    """Afrikaans Ner dataset"""
 
     BUILDER_CONFIGS = [
         AfrikaansNerCorpusConfig(
2 changes: 1 addition & 1 deletion datasets/air_dialogue/air_dialogue.py
@@ -213,7 +213,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
         # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
         # The key is not important, it's more here for legacy reason (legacy from tfds)
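The TODO comments in this hunk describe the `datasets` builder contract: `_split_generators` returns `SplitGenerator` objects whose `gen_kwargs` are unpacked as the arguments of `_generate_examples`, which then yields `(key, example)` tuples. As a rough illustration only (this builder, its URL and its fields are hypothetical and not part of this commit), a minimal sketch of that pattern could look like:

import json

import datasets


class MinimalDataset(datasets.GeneratorBasedBuilder):
    """Hypothetical builder sketching the contract described in the TODO comments."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(features=datasets.Features({"text": datasets.Value("string")}))

    def _split_generators(self, dl_manager):
        # The `gen_kwargs` declared here are unpacked as the arguments of `_generate_examples`.
        path = dl_manager.download_and_extract("https://example.com/train.jsonl")  # hypothetical URL
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": path})]

    def _generate_examples(self, filepath):
        """Yields examples."""
        # The key only needs to be unique within the split; it exists for legacy (tfds) reasons.
        with open(filepath, encoding="utf-8") as f:
            for id_, line in enumerate(f):
                row = json.loads(line)
                yield id_, {"text": row["text"]}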
2 changes: 1 addition & 1 deletion datasets/allegro_reviews/allegro_reviews.py
@@ -99,7 +99,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filepath, encoding="utf-8") as f:
             reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
             for id_, row in enumerate(reader):
2 changes: 1 addition & 1 deletion datasets/amazon_polarity/amazon_polarity.py
@@ -113,7 +113,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         with open(filepath, encoding="utf-8") as f:
             data = csv.reader(f, delimiter=",", quoting=csv.QUOTE_ALL)
2 changes: 1 addition & 1 deletion datasets/aqua_rat/aqua_rat.py
@@ -118,7 +118,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filepath, encoding="utf-8") as f:
             for id_, row in enumerate(f):
                 data = json.loads(row)
2 changes: 1 addition & 1 deletion datasets/aquamuse/aquamuse.py
@@ -140,7 +140,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         filepath = [join(filepath, f) for f in listdir(filepath) if isfile(join(filepath, f))]
         filepath = sorted(filepath)
         raw_dataset = tf.data.TFRecordDataset(filepath)
2 changes: 1 addition & 1 deletion datasets/ar_cov19/ar_cov19.py
@@ -124,7 +124,7 @@ def _split_generators(self, dl_manager):
         return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"data_dir": data_dir})]
 
     def _generate_examples(self, data_dir):
-        """ Yields examples. """
+        """Yields examples."""
         # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
         # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
         # The key is not important, it's more here for legacy reason (legacy from tfds)
2 changes: 1 addition & 1 deletion datasets/arabic_billion_words/arabic_billion_words.py
@@ -144,7 +144,7 @@ def _clean_text(self, text):
         return text.replace("?", "")
 
     def _generate_examples(self, filepath):
-        """ Yields examples. """
+        """Yields examples."""
         current_multi_line = ""
         _idx = 0
         data_tag = self.config.name
4 changes: 2 additions & 2 deletions datasets/arabic_pos_dialect/arabic_pos_dialect.py
@@ -45,7 +45,7 @@
 
 
 class ArabicPosDialectConfig(datasets.BuilderConfig):
-    """ BuilderConfig for ArabicPosDialect"""
+    """BuilderConfig for ArabicPosDialect"""
 
     def __init__(self, dialect=None, **kwargs):
         """
@@ -112,7 +112,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath):
-        """ Yields examples in the raw (text) form. """
+        """Yields examples in the raw (text) form."""
         with open(filepath, encoding="utf-8") as csv_file:
             reader = csv.DictReader(csv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
             fold = -1
2 changes: 1 addition & 1 deletion datasets/arsentd_lev/arsentd_lev.py
@@ -40,7 +40,7 @@
 
 
 class ArsentdLev(datasets.GeneratorBasedBuilder):
-    """"ArSenTD-Lev Dataset"""
+    """ "ArSenTD-Lev Dataset"""
 
     VERSION = datasets.Version("1.1.0")
 
2 changes: 1 addition & 1 deletion datasets/arxiv_dataset/arxiv_dataset.py
@@ -109,7 +109,7 @@ def _split_generators(self, dl_manager):
         return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"path": path_to_manual_file})]
 
     def _generate_examples(self, path=None, title_set=None):
-        """ Yields examples. """
+        """Yields examples."""
         with open(path, encoding="utf8") as f:
             for i, entry in enumerate(f):
                 data = dict(json.loads(entry))
2 changes: 1 addition & 1 deletion datasets/aslg_pc12/aslg_pc12.py
@@ -70,7 +70,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, gloss_path, text_path):
-        """ Yields examples. """
+        """Yields examples."""
 
         gloss_f = open(gloss_path, "r", encoding="utf-8")
         text_f = open(text_path, "r", encoding="utf-8")
2 changes: 1 addition & 1 deletion datasets/asset/asset.py
@@ -140,7 +140,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepaths, split):
-        """ Yields examples. """
+        """Yields examples."""
         if self.config.name == "simplification":
             files = [open(filepaths[f"asset.{split}.orig"], encoding="utf-8")] + [
                 open(filepaths[f"asset.{split}.simp.{i}"], encoding="utf-8") for i in range(10)
2 changes: 1 addition & 1 deletion datasets/assin/assin.py
@@ -151,7 +151,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepaths, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         id_ = 0
 
2 changes: 1 addition & 1 deletion datasets/assin2/assin2.py
@@ -103,7 +103,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         id_ = 0
 
2 changes: 1 addition & 1 deletion datasets/atomic/atomic.py
@@ -116,7 +116,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples from the Atomic dataset. """
+        """Yields examples from the Atomic dataset."""
 
         with open(filepath, encoding="utf-8") as f:
             for id_, row in enumerate(f):
4 changes: 2 additions & 2 deletions datasets/autshumato/autshumato.py
@@ -39,7 +39,7 @@
 
 
 class AutshumatoConfig(datasets.BuilderConfig):
-    """ BuilderConfig for NewDataset"""
+    """BuilderConfig for NewDataset"""
 
     def __init__(self, langs, zip_file, **kwargs):
         """
@@ -206,7 +206,7 @@ def _split_generators_translation(self, dl_manager):
         ]
 
     def _generate_examples(self, source_files, target_files, split):
-        """ Yields examples. """
+        """Yields examples."""
         if len(self.config.langs) == 2:
             return self._generate_examples_translation(source_files, target_files, split)
         elif len(self.config.langs) == 1:
2 changes: 1 addition & 1 deletion datasets/banking77/banking77.py
@@ -159,7 +159,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath):
-        """ Yields examples as (key, example) tuples. """
+        """Yields examples as (key, example) tuples."""
         with open(filepath, encoding="utf-8") as f:
             csv_reader = csv.reader(f, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True)
             # call next to skip header
2 changes: 1 addition & 1 deletion datasets/bbc_hindi_nli/bbc_hindi_nli.py
@@ -144,7 +144,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath):
-        """ Yields examples. """
+        """Yields examples."""
 
         with open(filepath, encoding="utf-8") as tsv_file:
             tsv_reader = csv.reader(tsv_file, delimiter="\t")
2 changes: 1 addition & 1 deletion datasets/bprec/bprec.py
@@ -178,7 +178,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filedirs, split="tele"):
-        """ Yields examples. """
+        """Yields examples."""
         # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
         # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
         # The key is not important, it's more here for legacy reason (legacy from tfds)
2 changes: 1 addition & 1 deletion datasets/brwac/brwac.py
@@ -97,7 +97,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         with open(filepath, encoding="utf-8") as f:
 
4 changes: 2 additions & 2 deletions datasets/bsd_ja_en/bsd_ja_en.py
@@ -78,7 +78,7 @@
 
 
 class BsdJaEn(datasets.GeneratorBasedBuilder):
-    """Japanese-English Business Scene Dialogue (BSD) dataset. """
+    """Japanese-English Business Scene Dialogue (BSD) dataset."""
 
     VERSION = datasets.Version("1.0.0")
 
@@ -131,7 +131,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         with open(filepath, encoding="utf-8") as f:
             data = json.load(f)
4 changes: 2 additions & 2 deletions datasets/c3/c3.py
@@ -39,7 +39,7 @@
 
 
 class C3Config(datasets.BuilderConfig):
-    """ BuilderConfig for NewDataset"""
+    """BuilderConfig for NewDataset"""
 
     def __init__(self, type_, **kwargs):
         """
@@ -138,7 +138,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filename, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filename, "r", encoding="utf-8") as sf:
             data = json.load(sf)
             for id_, (documents, questions, document_id) in enumerate(data):
2 changes: 1 addition & 1 deletion datasets/caner/caner.py
@@ -112,7 +112,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         with open(filepath, encoding="utf-8") as csv_file:
             reader = csv.reader(csv_file, delimiter=",")
2 changes: 1 addition & 1 deletion datasets/cbt/cbt.py
@@ -154,7 +154,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(
         self, filepath  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
     ):
-        """ Yields examples as (key, example) tuples. """
+        """Yields examples as (key, example) tuples."""
         # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
         # The `key` is here for legacy reason (tfds) and is not important in itself.
 
2 changes: 1 addition & 1 deletion datasets/ccaligned_multilingual/ccaligned_multilingual.py
@@ -189,7 +189,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, from_english=False):
-        """ Yields examples. """
+        """Yields examples."""
         lc = self.config.language_code
         reverse = lc in reverse_mapped_sentences
         with open(filepath, encoding="utf-8") as f:
2 changes: 1 addition & 1 deletion datasets/cdsc/cdsc.py
@@ -123,7 +123,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filepath, encoding="utf-8") as f:
             reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
             for id_, row in enumerate(reader):
2 changes: 1 addition & 1 deletion datasets/cdt/cdt.py
@@ -81,7 +81,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filepath, encoding="utf-8") as f:
             reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
             for id_, row in enumerate(reader):
2 changes: 1 addition & 1 deletion datasets/clickbait_news_bg/clickbait_news_bg.py
@@ -96,7 +96,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         keys = [
             "fake_news_score",
             "click_bait_score",
2 changes: 1 addition & 1 deletion datasets/climate_fever/climate_fever.py
@@ -106,7 +106,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
         with open(filepath, encoding="utf-8") as f:
             for id_, row in enumerate(f):
                 doc = json.loads(row)
2 changes: 1 addition & 1 deletion datasets/coached_conv_pref/coached_conv_pref.py
@@ -136,7 +136,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         # Empty Segment list with annotations dictionary
         # First prompt of a conversation does not contain the segment dictionary
2 changes: 1 addition & 1 deletion datasets/code_search_net/code_search_net.py
@@ -57,7 +57,7 @@
 
 
 class CodeSearchNet(datasets.GeneratorBasedBuilder):
-    """"CodeSearchNet corpus: proxy dataset for semantic code search."""
+    """ "CodeSearchNet corpus: proxy dataset for semantic code search."""
 
     VERSION = datasets.Version("1.0.0", "Add CodeSearchNet corpus dataset")
     BUILDER_CONFIGS = [
2 changes: 1 addition & 1 deletion datasets/common_voice/common_voice.py
@@ -696,7 +696,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, path_to_clips):
-        """ Yields examples. """
+        """Yields examples."""
         data_fields = list(self._info().features.keys())
         path_idx = data_fields.index("path")
 
2 changes: 1 addition & 1 deletion datasets/compguesswhat/compguesswhat.py
@@ -6,7 +6,7 @@
 
 
 class CompguesswhatConfig(datasets.BuilderConfig):
-    """ BuilderConfig for CompGuessWhat?!"""
+    """BuilderConfig for CompGuessWhat?!"""
 
     def __init__(self, data_url, splits, gameplay_scenario, **kwargs):
         """
2 changes: 1 addition & 1 deletion datasets/conceptnet5/conceptnet5.py
@@ -147,7 +147,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples from the conceptnet5 graph if the config is 'conceptnet5', otherwise yields the sentences for omcs. """
+        """Yields examples from the conceptnet5 graph if the config is 'conceptnet5', otherwise yields the sentences for omcs."""
 
         with open(filepath, "rb") as f:
             for id_, row in enumerate(f):
2 changes: 1 addition & 1 deletion datasets/cord19/cord19.py
@@ -147,7 +147,7 @@ def _split_generators(self, dl_manager):
         ]
 
     def _generate_examples(self, filepath, split):
-        """ Yields examples. """
+        """Yields examples."""
 
         metadata_filepath = filepath["metadata"]
 
(The remaining changed files of the 269 in this commit are not shown here.)

1 comment on commit d93bc76

@github-actions

PyArrow==1.0.0


Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.021622 / 0.011353 (0.010269)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.015321 / 0.011008 (0.004313)
read_batch_formatted_as_numpy after write_nested_sequence | 0.051922 / 0.038508 (0.013414)
read_batch_unformated after write_array2d | 0.040442 / 0.023109 (0.017332)
read_batch_unformated after write_flattened_sequence | 0.347329 / 0.275898 (0.071431)
read_batch_unformated after write_nested_sequence | 0.364161 / 0.323480 (0.040681)
read_col_formatted_as_numpy after write_array2d | 0.010763 / 0.007986 (0.002777)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004761 / 0.004328 (0.000432)
read_col_formatted_as_numpy after write_nested_sequence | 0.012775 / 0.004250 (0.008525)
read_col_unformated after write_array2d | 0.051991 / 0.037052 (0.014939)
read_col_unformated after write_flattened_sequence | 0.343607 / 0.258489 (0.085118)
read_col_unformated after write_nested_sequence | 0.376649 / 0.293841 (0.082808)
read_formatted_as_numpy after write_array2d | 0.151284 / 0.128546 (0.022737)
read_formatted_as_numpy after write_flattened_sequence | 0.116948 / 0.075646 (0.041302)
read_formatted_as_numpy after write_nested_sequence | 0.431733 / 0.419271 (0.012462)
read_unformated after write_array2d | 0.390267 / 0.043533 (0.346734)
read_unformated after write_flattened_sequence | 0.341459 / 0.255139 (0.086320)
read_unformated after write_nested_sequence | 0.347615 / 0.283200 (0.064415)
write_array2d | 1.536845 / 0.141683 (1.395162)
write_flattened_sequence | 1.828867 / 1.452155 (0.376712)
write_nested_sequence | 1.849931 / 1.492716 (0.357215)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.005922 / 0.018006 (-0.012085)
get_batch_of_1024_rows | 0.000510 / 0.000490 (0.000021)
get_first_row | 0.000185 / 0.000200 (-0.000015)
get_last_row | 0.000048 / 0.000054 (-0.000007)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.046130 / 0.037411 (0.008719)
shard | 0.022746 / 0.014526 (0.008220)
shuffle | 0.030060 / 0.176557 (-0.146497)
sort | 0.053706 / 0.737135 (-0.683430)
train_test_split | 0.030431 / 0.296338 (-0.265908)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.389394 / 0.215209 (0.174185)
read 50000 | 3.976949 / 2.077655 (1.899295)
read_batch 50000 10 | 2.161953 / 1.504120 (0.657833)
read_batch 50000 100 | 2.000292 / 1.541195 (0.459098)
read_batch 50000 1000 | 2.019339 / 1.468490 (0.550849)
read_formatted numpy 5000 | 6.703728 / 4.584777 (2.118951)
read_formatted pandas 5000 | 5.983091 / 3.745712 (2.237379)
read_formatted tensorflow 5000 | 8.555531 / 5.269862 (3.285669)
read_formatted torch 5000 | 7.419470 / 4.565676 (2.853794)
read_formatted_batch numpy 5000 10 | 0.677247 / 0.424275 (0.252972)
read_formatted_batch numpy 5000 1000 | 0.011905 / 0.007607 (0.004298)
shuffled read 5000 | 0.528703 / 0.226044 (0.302658)
shuffled read 50000 | 5.379914 / 2.268929 (3.110985)
shuffled read_batch 50000 10 | 2.792306 / 55.444624 (-52.652319)
shuffled read_batch 50000 100 | 2.383555 / 6.876477 (-4.492922)
shuffled read_batch 50000 1000 | 2.276879 / 2.142072 (0.134806)
shuffled read_formatted numpy 5000 | 6.845389 / 4.805227 (2.040161)
shuffled read_formatted_batch numpy 5000 10 | 3.773110 / 6.500664 (-2.727554)
shuffled read_formatted_batch numpy 5000 1000 | 6.407706 / 0.075469 (6.332236)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 9.996840 / 1.841788 (8.155052)
map fast-tokenizer batched | 13.133721 / 8.074308 (5.059413)
map identity | 28.282241 / 10.191392 (18.090849)
map identity batched | 0.835173 / 0.680424 (0.154750)
map no-op batched | 0.584922 / 0.534201 (0.050721)
map no-op batched numpy | 0.751204 / 0.579283 (0.171921)
map no-op batched pandas | 0.573799 / 0.434364 (0.139435)
map no-op batched pytorch | 0.665540 / 0.540337 (0.125203)
map no-op batched tensorflow | 1.428110 / 1.386936 (0.041174)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.022443 / 0.011353 (0.011090)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.016252 / 0.011008 (0.005244)
read_batch_formatted_as_numpy after write_nested_sequence | 0.058435 / 0.038508 (0.019927)
read_batch_unformated after write_array2d | 0.040611 / 0.023109 (0.017502)
read_batch_unformated after write_flattened_sequence | 0.336274 / 0.275898 (0.060376)
read_batch_unformated after write_nested_sequence | 0.358299 / 0.323480 (0.034819)
read_col_formatted_as_numpy after write_array2d | 0.012146 / 0.007986 (0.004161)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004700 / 0.004328 (0.000372)
read_col_formatted_as_numpy after write_nested_sequence | 0.015446 / 0.004250 (0.011196)
read_col_unformated after write_array2d | 0.064327 / 0.037052 (0.027275)
read_col_unformated after write_flattened_sequence | 0.341741 / 0.258489 (0.083252)
read_col_unformated after write_nested_sequence | 0.395650 / 0.293841 (0.101809)
read_formatted_as_numpy after write_array2d | 0.154481 / 0.128546 (0.025935)
read_formatted_as_numpy after write_flattened_sequence | 0.114876 / 0.075646 (0.039229)
read_formatted_as_numpy after write_nested_sequence | 0.444684 / 0.419271 (0.025412)
read_unformated after write_array2d | 0.417719 / 0.043533 (0.374186)
read_unformated after write_flattened_sequence | 0.370613 / 0.255139 (0.115474)
read_unformated after write_nested_sequence | 0.386113 / 0.283200 (0.102913)
write_array2d | 1.579327 / 0.141683 (1.437644)
write_flattened_sequence | 1.909627 / 1.452155 (0.457473)
write_nested_sequence | 1.951554 / 1.492716 (0.458837)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.006282 / 0.018006 (-0.011725)
get_batch_of_1024_rows | 0.000386 / 0.000490 (-0.000103)
get_first_row | 0.000168 / 0.000200 (-0.000032)
get_last_row | 0.000040 / 0.000054 (-0.000015)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.040312 / 0.037411 (0.002900)
shard | 0.021213 / 0.014526 (0.006687)
shuffle | 0.030371 / 0.176557 (-0.146185)
sort | 0.052322 / 0.737135 (-0.684813)
train_test_split | 0.028694 / 0.296338 (-0.267645)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.447297 / 0.215209 (0.232088)
read 50000 | 4.186948 / 2.077655 (2.109293)
read_batch 50000 10 | 1.988206 / 1.504120 (0.484086)
read_batch 50000 100 | 1.723884 / 1.541195 (0.182690)
read_batch 50000 1000 | 1.821423 / 1.468490 (0.352933)
read_formatted numpy 5000 | 6.694562 / 4.584777 (2.109785)
read_formatted pandas 5000 | 6.056178 / 3.745712 (2.310466)
read_formatted tensorflow 5000 | 8.451270 / 5.269862 (3.181409)
read_formatted torch 5000 | 7.194381 / 4.565676 (2.628705)
read_formatted_batch numpy 5000 10 | 0.685583 / 0.424275 (0.261308)
read_formatted_batch numpy 5000 1000 | 0.010513 / 0.007607 (0.002906)
shuffled read 5000 | 0.518007 / 0.226044 (0.291963)
shuffled read 50000 | 5.264349 / 2.268929 (2.995420)
shuffled read_batch 50000 10 | 2.511626 / 55.444624 (-52.932999)
shuffled read_batch 50000 100 | 2.146985 / 6.876477 (-4.729491)
shuffled read_batch 50000 1000 | 2.143325 / 2.142072 (0.001253)
shuffled read_formatted numpy 5000 | 6.697004 / 4.805227 (1.891777)
shuffled read_formatted_batch numpy 5000 10 | 4.082467 / 6.500664 (-2.418197)
shuffled read_formatted_batch numpy 5000 1000 | 7.943819 / 0.075469 (7.868350)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 10.503748 / 1.841788 (8.661960)
map fast-tokenizer batched | 13.371891 / 8.074308 (5.297583)
map identity | 28.441390 / 10.191392 (18.249998)
map identity batched | 0.913963 / 0.680424 (0.233540)
map no-op batched | 0.594829 / 0.534201 (0.060628)
map no-op batched numpy | 0.755456 / 0.579283 (0.176173)
map no-op batched pandas | 0.581949 / 0.434364 (0.147585)
map no-op batched pytorch | 0.690802 / 0.540337 (0.150465)
map no-op batched tensorflow | 1.535115 / 1.386936 (0.148179)
