
Updated scraper_bot to handle scraping in chunks #2

Merged
merged 26 commits into main from update-scraper-to-handle-chunks on Oct 20, 2023

Commits (26)
c9cb368
Updated scraper_bot to handle scraping in chunks
TwoAbove Oct 12, 2023
04dffff
Added filter_messages
TwoAbove Oct 14, 2023
5c340e7
Added optional toggle to not download images
TwoAbove Oct 15, 2023
56b72ea
Fixed type issue and added optional 'scrape all' env
TwoAbove Oct 15, 2023
ad55e82
Merge branch 'main' into update-scraper-to-handle-chunks
TwoAbove Oct 15, 2023
d398e6b
Merge branch 'main' into update-scraper-to-handle-chunks
TwoAbove Oct 15, 2023
ec71538
Fixed chunk update logic
TwoAbove Oct 16, 2023
a8485d6
Fixed update_chunk when empty repo
TwoAbove Oct 16, 2023
0fd28e4
Pinned huggingface_hub to at least 0.18
TwoAbove Oct 16, 2023
f40edae
Fixed race condition
TwoAbove Oct 16, 2023
bbb8ba4
Merge branch 'main' into update-scraper-to-handle-chunks
ZachNagengast Oct 19, 2023
209b6a4
Merge branch 'update-scraper-to-handle-chunks' of github.com:LAION-AI…
ZachNagengast Oct 19, 2023
3600241
Update append logic
ZachNagengast Oct 20, 2023
6959598
Fix config
ZachNagengast Oct 20, 2023
0d5a608
Cleanup
ZachNagengast Oct 20, 2023
50b2478
Update scraper/scraper_bot.py
ZachNagengast Oct 20, 2023
593ca10
Update scraper/scraper_bot.py
ZachNagengast Oct 20, 2023
e9b111b
Update scraper/scraper_bot.py
ZachNagengast Oct 20, 2023
f8fe5ea
Update scraper/scraper_bot.py
ZachNagengast Oct 20, 2023
dfe957a
Update fingerprint handling if missing
ZachNagengast Oct 20, 2023
2b29fdd
Freeze dataset version until fixed
ZachNagengast Oct 20, 2023
e17ca5e
Update dataset for test
ZachNagengast Oct 20, 2023
ff8542b
Freeze dataset version until fixed
ZachNagengast Oct 20, 2023
df94f2f
Add missing upload_file import
ZachNagengast Oct 20, 2023
54ce773
Test for dalle ci
ZachNagengast Oct 20, 2023
5ef842d
Revert test dataset names
ZachNagengast Oct 20, 2023
4 changes: 3 additions & 1 deletion .env.example
@@ -1,2 +1,4 @@
HF_TOKEN=
DISCORD_TOKEN=
DATASET_CHUNK_SIZE=300
@TwoAbove (Collaborator, Author) commented on Oct 12, 2023:

This is derived empirically by looking at chunks from https://huggingface.co/datasets/laion/dalle-3-dataset/tree/main/data
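
For context, a minimal sketch of what chunked writing with that value could look like — the `chunked` helper and the file-naming comment are illustrative, not the PR's actual code; only `DATASET_CHUNK_SIZE=300` comes from the diff:

```python
import os
from typing import Any, Dict, Iterator, List

# Chunk size chosen empirically; see the comment above.
CHUNK_SIZE = int(os.environ.get("DATASET_CHUNK_SIZE", "300"))


def chunked(rows: List[Dict[str, Any]], size: int = CHUNK_SIZE) -> Iterator[List[Dict[str, Any]]]:
    """Yield consecutive slices of at most `size` rows, one per dataset file."""
    for start in range(0, len(rows), size):
        yield rows[start:start + size]
```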

@ZachNagengast (Member) commented on Oct 12, 2023:

Does this change as the dataset gets bigger? Wondering what effect changing it has.

Also I'd probably move this to config.json files because it doesn't need to be a secret

@TwoAbove (Collaborator, Author) replied:

You're right, for some reason I thought that we would aggregate all of these datasets into one - that's not the case - so I'll change this. Thanks!

FETCH_ALL=
5 changes: 3 additions & 2 deletions .gitignore
@@ -99,7 +99,8 @@ ipython_config.py
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
poetry.lock
@TwoAbove (Collaborator, Author) commented:

I use poetry as my python package manager and I didn't want to include that in this repo

pyproject.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
@@ -163,4 +164,4 @@ cython_debug/
.vscode

# macOS
.DS_Store
1 change: 1 addition & 0 deletions helpers/__init__.py
@@ -0,0 +1 @@
from helpers.helpers import *
40 changes: 40 additions & 0 deletions helpers/helpers.py
@@ -0,0 +1,40 @@
from typing import Tuple


start_quotes = [
    '"',
    '“',
    "'",
    '«',
    '„',
]

end_quotes = [
    '"',
    '”',
    "'",
    '»',
    '“',
]


def starts_with_quotes(string: str) -> bool:
    if len(string) == 0:
        return False
    return string[0] in start_quotes


def get_start_end_quotes(string: str) -> Tuple[int, int]:
    """Return the index of the first start quote and of the last end quote
    in `string`, or -1 for whichever is absent (mirrors str.find / str.rfind)."""
    first_quote_index = -1
    last_quote_index = -1

    for i, char in enumerate(string):
        # Record only the first opening quote encountered.
        if first_quote_index == -1 and char in start_quotes:
            first_quote_index = i
            continue
        # Keep updating so the last closing quote wins.
        if char in end_quotes:
            last_quote_index = i

    return (first_quote_index, last_quote_index)
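
A quick usage sketch for the two helpers above (the message string is made up for illustration):

```python
from helpers import starts_with_quotes, get_start_end_quotes

content = '“A watercolor fox in the snow” posted with two attachments'
if starts_with_quotes(content):
    start, end = get_start_end_quotes(content)
    # Slice between the opening and closing quote characters to get the prompt.
    prompt = content[start + 1:end].strip()
    print(prompt)  # -> A watercolor fox in the snow
```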
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,3 +1,5 @@
requests==2.31.0
datasets==2.14.5
git+https://github.com/huggingface/datasets.git@a6bd7b4a268dbda6b86d4ca59f5d2a78848b0199
Pillow==10.0.1
huggingface_hub>=0.18
numpy
3 changes: 2 additions & 1 deletion scrape_dalle/config.json
@@ -2,6 +2,7 @@
"base_url": "https://discord.com/api/v9",
"channel_id": "1158354590463447092",
"limit": 100,
"max_chunk_size": 300,
A contributor commented:

Is this the same "300" defined as DATASET_CHUNK_SIZE above? If yes, let's reuse it maybe?

@TwoAbove (Collaborator, Author) replied on Oct 20, 2023:

The DATASET_CHUNK_SIZE env was my first iteration of the feature, but it makes more sense for it to be repo-dependent, so I think it should be deleted.
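
A minimal sketch of the repo-dependent approach — the dataclass fields mirror the config.json keys shown in this diff, but the repo's real `ScraperBotConfig` may differ:

```python
import json
from dataclasses import dataclass


@dataclass
class ScraperBotConfig:
    base_url: str
    channel_id: str
    limit: int
    max_chunk_size: int  # per-repo replacement for the DATASET_CHUNK_SIZE env var
    embed_images: bool
    hf_dataset_name: str

    @classmethod
    def from_json(cls, path: str) -> "ScraperBotConfig":
        # Each scraper directory ships its own config.json, so the
        # chunk size can be tuned per dataset repo.
        with open(path, "r") as f:
            return cls(**json.load(f))
```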

@ZachNagengast (Member) replied:

Thanks for the in-depth review! These comments are very helpful.

  • I'm not using pre-upload anymore since I moved the upload step into the append function, which uses the HfFileSystem to upload. If this is recommended I can take a look.
  • We store raw images only for some datasets, based on the config.

Here is one with images: https://huggingface.co/datasets/laion/dalle-3-dataset. I still need to get the README updated as well for the dataset viewer there.
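
For reference, a rough sketch of appending a chunk through HfFileSystem as described above — the chunk-file naming and the DataFrame input are assumptions, not the PR's exact implementation:

```python
import pandas as pd
from huggingface_hub import HfFileSystem  # requires huggingface_hub>=0.18, as pinned


def append_chunk(df: pd.DataFrame, repo_id: str, chunk_index: int, token: str) -> None:
    """Write one chunk of scraped rows as a new parquet file in the dataset repo."""
    fs = HfFileSystem(token=token)
    # Hypothetical naming scheme matching the data/train-* layout in the repo.
    path = f"datasets/{repo_id}/data/train-{chunk_index:05d}.parquet"
    with fs.open(path, "wb") as f:
        df.to_parquet(f)
```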

"embed_images": true,
"hf_dataset_name": "laion/dalle-3-dataset"
}
20 changes: 10 additions & 10 deletions scrape_dalle/scrape.py
@@ -5,6 +5,8 @@
sys.path.append("..")

from scraper import ScraperBot, ScraperBotConfig, HFDatasetScheme
from helpers import starts_with_quotes, get_start_end_quotes


def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
"""Parses a message into a list of Hugging Face Dataset Schemes.
@@ -20,22 +22,19 @@ def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
A list of Hugging Face Dataset Schemes.
"""
content = message["content"]

# Find the index of the first quote in the content
first_quote_index = content.find('"')

# Find the index of the last quote in the content
last_quote_index = content.rfind('"')


(first_quote_index, last_quote_index) = get_start_end_quotes(content)

# Extract the text between the first and last quotes to get the complete prompt
prompt = content[first_quote_index + 1:last_quote_index].strip()
image_urls = [attachment["url"] for attachment in message["attachments"]]
timestamp = message["timestamp"]
message_id = message["id"]

return [HFDatasetScheme(caption=prompt, image=None, link=image_url, message_id=message_id, timestamp=timestamp)
for image_url in image_urls]


def condition_fn(message: Dict[str, Any]) -> bool:
"""Checks if a message meets the condition to be parsed.

@@ -49,11 +48,12 @@ def condition_fn(message: Dict[str, Any]) -> bool:
bool
True if the message meets the condition, False otherwise.
"""
return len(message["attachments"]) > 0 and message["content"].startswith('"')
return len(message["attachments"]) > 0 and starts_with_quotes(message["content"])


if __name__ == "__main__":
config_path = os.path.join(os.path.dirname(__file__), "config.json")
config = ScraperBotConfig.from_json(config_path)

bot = ScraperBot(config=config, parse_fn=parse_fn, condition_fn=condition_fn)
bot.scrape(fetch_all=True)
bot.scrape(fetch_all=os.environ.get("FETCH_ALL", "false").lower() == "true")
3 changes: 2 additions & 1 deletion scrape_gpt4v/config.json
@@ -2,6 +2,7 @@
"base_url": "https://discord.com/api/v9",
"channel_id": "1159217496390389801",
"limit": 100,
"max_chunk_size": 300,
"embed_images": false,
"hf_dataset_name": "laion/gpt4v-dataset"
}
17 changes: 8 additions & 9 deletions scrape_gpt4v/scrape.py
@@ -3,6 +3,7 @@
from typing import Any, Dict, List

from scraper import ScraperBot, ScraperBotConfig, HFDatasetScheme
from helpers import starts_with_quotes, get_start_end_quotes

url_pattern = re.compile(r'https?://\S+')

@@ -20,13 +21,9 @@ def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
A list of Hugging Face Dataset Schemes.
"""
content = message["content"]

# Find the index of the first quote in the content
first_quote_index = content.find('"')

# Find the index of the last quote in the content
last_quote_index = content.rfind('"')


(first_quote_index, last_quote_index) = get_start_end_quotes(content)

# Extract the text between the first and last quotes to get the complete prompt
prompt = content[first_quote_index + 1:last_quote_index].strip()
image_urls = url_pattern.findall(content)
@@ -36,6 +33,7 @@ def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
return [HFDatasetScheme(caption=prompt, image=None, link=image_url, message_id=message_id, timestamp=timestamp)
for image_url in image_urls]


def condition_fn(message: Dict[str, Any]) -> bool:
"""Checks if a message meets the condition to be parsed.

@@ -49,11 +47,12 @@ def condition_fn(message: Dict[str, Any]) -> bool:
bool
True if the message meets the condition, False otherwise.
"""
return url_pattern.search(message["content"]) and message["content"].startswith('"')
return url_pattern.search(message["content"]) and starts_with_quotes(message["content"])


if __name__ == "__main__":
config_path = os.path.join(os.path.dirname(__file__), "config.json")
config = ScraperBotConfig.from_json(config_path)

bot = ScraperBot(config=config, parse_fn=parse_fn, condition_fn=condition_fn)
bot.scrape(fetch_all=True)
bot.scrape(fetch_all=os.environ.get("FETCH_ALL", "false").lower() == "true")
1 change: 1 addition & 0 deletions scrape_gpt4v_emotion/config.json
@@ -2,6 +2,7 @@
"base_url": "https://discord.com/api/v9",
"channel_id": "1162094554472788029",
"limit": 100,
"max_chunk_size": 300,
"embed_images": false,
"hf_dataset_name": "laion/gpt4v-emotion-dataset"
}
11 changes: 4 additions & 7 deletions scrape_gpt4v_emotion/scrape.py
@@ -3,6 +3,7 @@
from typing import Any, Dict, List

from scraper import ScraperBot, ScraperBotConfig, HFDatasetScheme
from helpers import starts_with_quotes, get_start_end_quotes

url_pattern = re.compile(r'https?://\S+')

@@ -22,11 +23,7 @@ def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
"""
content = message["content"]

# Find the index of the first quote in the content
first_quote_index = content.find('"')

# Find the index of the last quote in the content
last_quote_index = content.rfind('"')
(first_quote_index, last_quote_index) = get_start_end_quotes(content)

# Extract the text between the first and last quotes to get the complete prompt
prompt = content[first_quote_index + 1:last_quote_index].strip()
@@ -51,12 +48,12 @@ def condition_fn(message: Dict[str, Any]) -> bool:
bool
True if the message meets the condition, False otherwise.
"""
return url_pattern.search(message["content"]) and message["content"].startswith('"')
return url_pattern.search(message["content"]) and starts_with_quotes(message["content"])


if __name__ == "__main__":
config_path = os.path.join(os.path.dirname(__file__), "config.json")
config = ScraperBotConfig.from_json(config_path)

bot = ScraperBot(config=config, parse_fn=parse_fn, condition_fn=condition_fn)
bot.scrape(fetch_all=True)
bot.scrape(fetch_all=os.environ.get("FETCH_ALL", "false").lower() == "true")
3 changes: 2 additions & 1 deletion scrape_wuerstchen/config.json
@@ -2,6 +2,7 @@
"base_url": "https://discord.com/api/v9",
"channel_id": "1161398740595265626",
"limit": 100,
"max_chunk_size": 300,
"embed_images": true,
"hf_dataset_name": "laion/wuerstchen-dataset"
}
9 changes: 6 additions & 3 deletions scrape_wuerstchen/scrape.py
@@ -3,6 +3,7 @@

from scraper import ScraperBot, ScraperBotConfig, HFDatasetScheme


def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
"""Parses a message into a list of Hugging Face Dataset Schemes.

@@ -21,9 +22,10 @@ def parse_fn(message: Dict[str, Any]) -> List[HFDatasetScheme]:
timestamp = message["timestamp"]
message_id = message["id"]

return [HFDatasetScheme(caption=prompt, image=None, link=image_url, message_id=message_id, timestamp=timestamp)
for image_url in image_urls]


def condition_fn(message: Dict[str, Any]) -> bool:
"""Checks if a message meets the condition to be parsed.

@@ -37,11 +39,12 @@ def condition_fn(message: Dict[str, Any]) -> bool:
bool
True if the message meets the condition, False otherwise.
"""
return len(message["attachments"]) > 0


if __name__ == "__main__":
config_path = os.path.join(os.path.dirname(__file__), "config.json")
config = ScraperBotConfig.from_json(config_path)

bot = ScraperBot(config=config, parse_fn=parse_fn, condition_fn=condition_fn)
bot.scrape(fetch_all=True)
bot.scrape(fetch_all=os.environ.get("FETCH_ALL", "false").lower() == "true")
27 changes: 27 additions & 0 deletions scraper/dataset_readme_template.md
@@ -0,0 +1,27 @@
---
dataset_info:
features:
- name: caption
dtype: string
- name: image
dtype: image
- name: link
dtype: string
- name: message_id
dtype: string
- name: timestamp
dtype: string
splits:
- name: train
num_bytes: 0
num_examples: 0
download_size: 0
dataset_size: 0
configs:
- config_name: default
data_files:
- split: train
path: data/train-*
---

Use the Edit dataset card button to edit.