Merge pull request #1 from WilliamJlvt/dev

merge main from dev
WilliamJlvt · Oct 3, 2024 · 397184f · 397184f
2 parents 7e71492 + 7bb1012
commit 397184f
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 87 deletions.
diff --git a/.coverage b/.coverage
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@ __pycache__
 *.pyc
 *.pyo
 *.pyd
+.temp
diff --git a/README.md b/README.md
@@ -1,5 +1,10 @@
+from llm_pricing_sdk.enums import DataSources
+
 # LLM Pricing SDK
-LLM Pricing SDK is a Python package designed to scrape and organize pricing information for large language models (LLMs) from the following webpage: https://www.botgenuity.com/tools/llm-pricing.
+LLM Pricing SDK is a Python package designed to scrape and organize pricing information for large language models (LLMs)
+from the following sources: 
+- https://huggingface.co/spaces/philschmid/llm-pricing
+- https://www.botgenuity.com/tools/llm-pricing
 
 ## Installation
 ~~You can install the package using pip:~~ (not yet available)
@@ -17,7 +22,7 @@ pip install .
 ## Usage
 Once you have installed the SDK, you can use it to quickly retrieve the current pricing information from the website.
 ```python
-from llm_pricing_sdk.llm_pricing import LlmPricingScraper
+from llm_pricing_sdk.scrapers import LlmPricingScraper
 
 # Get the pricing information
 pricing_data = LlmPricingScraper.scrape()
@@ -46,6 +51,12 @@ for entry in gpt_4o_models:
     print(f"Updated: {entry.updated}")
     print("-" * 40)
 ```
+You can also choose the source of the data you want to scrape by passing the source as an argument to the `scrape` method. The available sources are defined in the `DataSources` enum.
+```python
+from llm_pricing_sdk.scrapers import LlmPricingScraper, DataSources
+
+pricing_data = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
+```
 
 ### Example Output
 After running the above code, you should see an output like this:

diff --git a/llm_pricing_sdk/enums.py b/llm_pricing_sdk/enums.py
@@ -0,0 +1,5 @@
+from enum import Enum
+
+class DataSources(Enum):
+    """Identifiers for the pricing data sources a scrape can target."""
+
+    BOTGENUITY = "botgenuity"
+    HUGGINGFACE = "huggingface"
diff --git a/llm_pricing_sdk/llm_pricing.py b/llm_pricing_sdk/llm_pricing.py
diff --git a/llm_pricing_sdk/models.py b/llm_pricing_sdk/models.py
@@ -0,0 +1,21 @@
+class LLMModelPricing:
+    """Pricing record describing one LLM model as reported by a source."""
+
+    def __init__(self, model, provider, input_tokens_price,
+                 output_tokens_price, context, source, updated):
+        self.model = model
+        self.provider = provider
+        # Both prices are expressed per 1M tokens, in USD.
+        self.input_tokens_price = input_tokens_price
+        self.output_tokens_price = output_tokens_price
+        # Context for the model.
+        self.context = context
+        # Where the pricing information came from.
+        self.source = source
+        self.updated = updated
+
+    def __str__(self):
+        # Label/value pairs in the order they appear in the rendered text.
+        labelled = (
+            ("Model", self.model),
+            ("Provider", self.provider),
+            ("Input Price", self.input_tokens_price),
+            ("Output Price", self.output_tokens_price),
+            ("Context", self.context),
+            ("Source", self.source),
+            ("Updated", self.updated),
+        )
+        return ", ".join(f"{label}: {value}" for label, value in labelled)
diff --git a/llm_pricing_sdk/scrapers/__init__.py b/llm_pricing_sdk/scrapers/__init__.py
@@ -0,0 +1,18 @@
+from llm_pricing_sdk.enums import DataSources
+from llm_pricing_sdk.scrapers.botgenuity import BotgenuityScraper
+from llm_pricing_sdk.scrapers.huggingface import HuggingfaceScraper
+
+class LlmPricingScraper:
+    """Entry point that dispatches a scrape to the selected data source."""
+
+    @staticmethod
+    def scrape(source: DataSources = DataSources.HUGGINGFACE):
+        """
+        Scrape the LLM pricing information from the specified source.
+
+        :param source: A DataSources member or its string value
+            (e.g. "botgenuity"). Defaults to DataSources.HUGGINGFACE.
+        :returns: A list of LLMModelPricing objects.
+        :raises ValueError: If the source is not supported.
+        """
+        try:
+            # Accept the enum's plain value as well as the member itself.
+            selected = DataSources(source)
+        except ValueError:
+            raise ValueError(f"Source '{source}' is not supported.") from None
+
+        if selected is DataSources.BOTGENUITY:
+            return BotgenuityScraper.scrape()
+        if selected is DataSources.HUGGINGFACE:
+            return HuggingfaceScraper.scrape()
+        # Reached only if a member is added to the enum without a scraper.
+        raise ValueError(f"Source '{source}' is not supported.")
diff --git a/llm_pricing_sdk/scrapers/base.py b/llm_pricing_sdk/scrapers/base.py
@@ -0,0 +1,6 @@
+class BaseScraper:
+    """
+    Common base class for scrapers.
+
+    Subclass it to share helper methods or validation logic between scrapers.
+    """
diff --git a/llm_pricing_sdk/scrapers/botgenuity.py b/llm_pricing_sdk/scrapers/botgenuity.py
@@ -0,0 +1,45 @@
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+from llm_pricing_sdk.models import LLMModelPricing
+
+
+class BotgenuityScraper:
+    """Scraper for the pricing table published on botgenuity.com."""
+
+    @staticmethod
+    def scrape():
+        """
+        Fetch the Botgenuity pricing page and parse its first table.
+
+        :returns: A list of LLMModelPricing objects, one per table row that
+            has all the cells this parser reads.
+        :raises Exception: If the page does not answer with HTTP 200 or
+            contains no table.
+        :raises ValueError: If a row's date cell is not formatted like
+            "October 3, 2024".
+        """
+        url = "https://www.botgenuity.com/tools/llm-pricing"
+        # Bound the request so an unresponsive server cannot block forever.
+        response = requests.get(url, timeout=30)
+
+        if response.status_code != 200:
+            raise Exception(f"Failed to retrieve the webpage. Status code: {response.status_code}")
+
+        soup = BeautifulSoup(response.content, "html.parser")
+        table = soup.find("table")
+        if table is None:
+            raise Exception("No table found on the page.")
+
+        rows = []
+        for tr in table.find_all("tr")[1:]:  # Skip the header row
+            cells = tr.find_all("td")
+            # The date is read from cells[6], so a usable row needs at least
+            # seven cells; shorter rows are skipped instead of raising
+            # IndexError.
+            if len(cells) < 7:
+                continue
+
+            provider = cells[0].text.strip()
+            model = cells[1].text.strip()
+            context = cells[2].text.strip()
+            input_tokens_price = cells[3].text.strip().replace("$", "")
+            output_tokens_price = cells[4].text.strip().replace("$", "")
+            # Parsing validates the date; formatting it back re-emits it
+            # with a zero-padded day.
+            updated = datetime.strptime(cells[6].text.strip(), "%B %d, %Y").strftime("%B %d, %Y")
+
+            rows.append(LLMModelPricing(
+                provider=provider,
+                model=model,
+                context=context,
+                input_tokens_price=input_tokens_price,
+                output_tokens_price=output_tokens_price,
+                source=url,
+                updated=updated
+            ))
+
+        return rows
diff --git a/llm_pricing_sdk/scrapers/huggingface.py b/llm_pricing_sdk/scrapers/huggingface.py
@@ -0,0 +1,44 @@
+import re
+from datetime import datetime
+
+from llm_pricing_sdk.utils import fetch_ts_file
+from llm_pricing_sdk.models import LLMModelPricing
+
+
+class HuggingfaceScraper:
+    """Scraper for the data.ts file of the philschmid/llm-pricing Space."""
+
+    @staticmethod
+    def scrape():
+        """
+        Download the TypeScript data file and extract model prices from it.
+
+        :returns: A list of LLMModelPricing objects, one per matched model
+            entry. `context` is left empty and `updated` is today's date.
+        """
+        url = "https://huggingface.co/spaces/philschmid/llm-pricing/resolve/main/src/lib/data.ts"
+
+        provider_pattern = re.compile(r"provider: '(.*?)',")
+        uri_pattern = re.compile(r"uri: '(.*?)',")
+        model_pattern = re.compile(r"\{ name: '(.*?)', inputPrice: ([\d.]+), outputPrice: ([\d.]+) \}")
+
+        results = []
+
+        source_text = fetch_ts_file(url)
+
+        # The file is cut into chunks at the separator between entries.
+        for chunk in source_text.split('},\n  {'):
+            name_hit = provider_pattern.search(chunk)
+            uri_hit = uri_pattern.search(chunk)
+
+            # Chunks missing either a provider name or a uri are ignored.
+            if not (name_hit and uri_hit):
+                continue
+
+            provider_name = name_hit.group(1)
+            provider_uri = uri_hit.group(1)
+
+            for model_name, input_price, output_price in model_pattern.findall(chunk):
+                entry = LLMModelPricing(
+                    model=model_name,
+                    provider=provider_name,
+                    input_tokens_price=float(input_price),
+                    output_tokens_price=float(output_price),
+                    context="",
+                    source=provider_uri,
+                    updated=str(datetime.now().date())
+                )
+                results.append(entry)
+
+        return results
diff --git a/llm_pricing_sdk/utils.py b/llm_pricing_sdk/utils.py
@@ -0,0 +1,10 @@
+import requests
+
+def fetch_ts_file(url, timeout=30):
+    """
+    Download a file over HTTP and return its body as text.
+
+    :param url: Address of the file to fetch.
+    :param timeout: Seconds to wait for the server before requests gives up.
+    :returns: The response body decoded as text.
+    :raises Exception: If the response status code is not 200.
+    """
+    # Without a timeout, requests waits indefinitely on a stalled server.
+    response = requests.get(url, timeout=timeout)
+
+    if response.status_code != 200:
+        raise Exception(
+            f"Cannot fetch the file, status code: {response.status_code}")
+
+    return response.text
diff --git a/tests/test_llm_pricing.py b/tests/test_llm_pricing.py
@@ -1,21 +1,23 @@
 import unittest
 import requests_mock
-from llm_pricing_sdk.llm_pricing import LlmPricingScraper
+from llm_pricing_sdk.scrapers import LlmPricingScraper, DataSources
 
 
 class TestLlmPricingScraper(unittest.TestCase):
 
     def test_scrape_returns_at_least_one_result(self):
         pricing_data = LlmPricingScraper.scrape()
         self.assertTrue(len(pricing_data) > 0)
+        pricing_data = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
+        self.assertTrue(len(pricing_data) > 0)
 
     @requests_mock.Mocker()
     def test_scrape_empty_table(self, mock_request):
         # Mock an empty table scenario
         mock_request.get('https://www.botgenuity.com/tools/llm-pricing', text="""
         <html><body><table></table></body></html>
         """)
-        results = LlmPricingScraper.scrape()
+        results = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
         self.assertEqual(len(results),
                          0)  # No data in table, expect empty list
 
@@ -26,7 +28,7 @@ def test_scrape_raises_error_on_failure(self, mock_request):
 
         # Expect the scrape method to raise an exception
         with self.assertRaises(Exception) as context:
-            LlmPricingScraper.scrape()
+            LlmPricingScraper.scrape(DataSources.BOTGENUITY)
 
         self.assertTrue('Failed to retrieve the webpage' in str(context.exception))
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,3 +6,4 @@ __pycache__ @@
     *.pyc
     *.pyo
     *.pyd
+    .temp