Skip to content

Commit

Permalink
refactor project structure and update doc
Browse files Browse the repository at this point in the history
  • Loading branch information
WilliamJlvt committed Oct 3, 2024
1 parent b3bf0ff commit 7bb1012
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 152 deletions.
Binary file removed .coverage
Binary file not shown.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from llm_pricing_sdk.enums import DataSources

# LLM Pricing SDK
LLM Pricing SDK is a Python package designed to scrape and organize pricing information for large language models (LLMs) from the following webpage: https://www.botgenuity.com/tools/llm-pricing.
LLM Pricing SDK is a Python package designed to scrape and organize pricing information for large language models (LLMs)
from the following sources:
- https://huggingface.co/spaces/philschmid/llm-pricing
- https://www.botgenuity.com/tools/llm-pricing

## Installation
~~You can install the package using pip:~~ (not yet available)
Expand All @@ -17,7 +22,7 @@ pip install .
## Usage
Once you have installed the SDK, you can use it to quickly retrieve the current pricing information from the website.
```python
from llm_pricing_sdk.llm_pricing import LlmPricingScraper
from llm_pricing_sdk.scrapers import LlmPricingScraper

# Get the pricing information
pricing_data = LlmPricingScraper.scrape()
Expand Down Expand Up @@ -46,6 +51,12 @@ for entry in gpt_4o_models:
print(f"Updated: {entry.updated}")
print("-" * 40)
```
You can also choose the source of the data you want to scrape by passing the source as an argument to the `scrape` method. The available sources are defined in the `DataSources` enum.
```python
from llm_pricing_sdk.scrapers import LlmPricingScraper, DataSources

pricing_data = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
```

### Example Output
After running the above code, you should see an output like this:
Expand Down
5 changes: 5 additions & 0 deletions llm_pricing_sdk/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from enum import Enum

class DataSources(Enum):
    """Enumerates the pricing-data sources the SDK can scrape."""

    BOTGENUITY = "botgenuity"
    HUGGINGFACE = "huggingface"
147 changes: 0 additions & 147 deletions llm_pricing_sdk/llm_pricing.py

This file was deleted.

21 changes: 21 additions & 0 deletions llm_pricing_sdk/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
class LLMModelPricing:
    """Holds the pricing details of a single LLM offering."""

    def __init__(self, model, provider, input_tokens_price,
                 output_tokens_price, context, source, updated):
        self.model = model
        self.provider = provider
        # Token prices are per 1M tokens, in USD.
        self.input_tokens_price = input_tokens_price
        self.output_tokens_price = output_tokens_price
        # Context window advertised for the model.
        self.context = context
        # Where this pricing entry was obtained from.
        self.source = source
        self.updated = updated

    def __str__(self):
        fields = (
            f"Model: {self.model}",
            f"Provider: {self.provider}",
            f"Input Price: {self.input_tokens_price}",
            f"Output Price: {self.output_tokens_price}",
            f"Context: {self.context}",
            f"Source: {self.source}",
            f"Updated: {self.updated}",
        )
        return ", ".join(fields)
18 changes: 18 additions & 0 deletions llm_pricing_sdk/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from llm_pricing_sdk.enums import DataSources
from llm_pricing_sdk.scrapers.botgenuity import BotgenuityScraper
from llm_pricing_sdk.scrapers.huggingface import HuggingfaceScraper

class LlmPricingScraper:
    """Facade that routes a scrape request to the source-specific scraper."""

    @staticmethod
    def scrape(source: DataSources = DataSources.HUGGINGFACE):
        """
        Scrape the LLM pricing information from the specified source.
        :returns: A list of LLMModelPricing objects.
        """
        # Dispatch table instead of an if/elif chain; unknown sources
        # fall through to the same error as before.
        scrapers = {
            DataSources.BOTGENUITY: BotgenuityScraper.scrape,
            DataSources.HUGGINGFACE: HuggingfaceScraper.scrape,
        }
        if source not in scrapers:
            raise Exception(f"Source '{source}' is not supported.")
        return scrapers[source]()
6 changes: 6 additions & 0 deletions llm_pricing_sdk/scrapers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class BaseScraper:
    """
    Common ancestor for all pricing scrapers.

    Serves as an extension point for shared helpers or validation logic;
    it currently defines no behaviour of its own.
    """
45 changes: 45 additions & 0 deletions llm_pricing_sdk/scrapers/botgenuity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from llm_pricing_sdk.models import LLMModelPricing


class BotgenuityScraper:
    """Scrapes LLM pricing rows from the botgenuity.com pricing table."""

    @staticmethod
    def scrape():
        """
        Fetch and parse the botgenuity pricing page.

        :returns: A list of LLMModelPricing objects, one per table row.
        :raises Exception: If the page cannot be retrieved or contains
            no pricing table.
        """
        url = "https://www.botgenuity.com/tools/llm-pricing"
        # Time out instead of hanging forever if the site is unresponsive.
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            raise Exception(f"Failed to retrieve the webpage. Status code: {response.status_code}")

        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table")
        if not table:
            raise Exception("No table found on the page.")

        rows = []
        for tr in table.find_all("tr")[1:]:  # Skip the header row
            cells = tr.find_all("td")
            # BUGFIX: this row reads up to cells[6], so at least 7 cells are
            # required; the previous `>= 5` guard allowed an IndexError on
            # rows with 5 or 6 cells.
            if len(cells) >= 7:
                provider = cells[0].text.strip()
                model = cells[1].text.strip()
                context = cells[2].text.strip()
                input_tokens_price = cells[3].text.strip().replace("$", "")
                output_tokens_price = cells[4].text.strip().replace("$", "")
                # Round-trip through strptime to validate the date format.
                updated = datetime.strptime(cells[6].text.strip(), "%B %d, %Y").strftime("%B %d, %Y")

                pricing_info = LLMModelPricing(
                    provider=provider,
                    model=model,
                    context=context,
                    input_tokens_price=input_tokens_price,
                    output_tokens_price=output_tokens_price,
                    source=url,
                    updated=updated
                )

                rows.append(pricing_info)

        return rows
44 changes: 44 additions & 0 deletions llm_pricing_sdk/scrapers/huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re
from datetime import datetime

from llm_pricing_sdk.utils import fetch_ts_file
from llm_pricing_sdk.models import LLMModelPricing


class HuggingfaceScraper:
    """Extracts LLM pricing from the philschmid/llm-pricing space's data.ts."""

    @staticmethod
    def scrape():
        """
        Download the space's TypeScript data file and pull provider and
        model pricing entries out of it with regular expressions.

        :returns: A list of LLMModelPricing objects.
        """
        url = "https://huggingface.co/spaces/philschmid/llm-pricing/resolve/main/src/lib/data.ts"

        provider_pattern = re.compile(r"provider: '(.*?)',")
        uri_pattern = re.compile(r"uri: '(.*?)',")
        model_pattern = re.compile(r"\{ name: '(.*?)', inputPrice: ([\d.]+), outputPrice: ([\d.]+) \}")

        results = []

        # The TS file declares an array of provider objects; splitting on
        # the object delimiter yields one text chunk per provider.
        for chunk in fetch_ts_file(url).split('},\n {'):
            found_provider = provider_pattern.search(chunk)
            found_uri = uri_pattern.search(chunk)
            if not (found_provider and found_uri):
                continue

            provider_name = found_provider.group(1)
            provider_uri = found_uri.group(1)

            for model_name, price_in, price_out in model_pattern.findall(chunk):
                results.append(LLMModelPricing(
                    model=model_name,
                    provider=provider_name,
                    input_tokens_price=float(price_in),
                    output_tokens_price=float(price_out),
                    context="",
                    source=provider_uri,
                    updated=str(datetime.now().date())
                ))

        return results
8 changes: 5 additions & 3 deletions tests/test_llm_pricing.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
import unittest
import requests_mock
from llm_pricing_sdk.llm_pricing import LlmPricingScraper
from llm_pricing_sdk.scrapers import LlmPricingScraper, DataSources


class TestLlmPricingScraper(unittest.TestCase):

def test_scrape_returns_at_least_one_result(self):
pricing_data = LlmPricingScraper.scrape()
self.assertTrue(len(pricing_data) > 0)
pricing_data = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
self.assertTrue(len(pricing_data) > 0)

@requests_mock.Mocker()
def test_scrape_empty_table(self, mock_request):
# Mock an empty table scenario
mock_request.get('https://www.botgenuity.com/tools/llm-pricing', text="""
<html><body><table></table></body></html>
""")
results = LlmPricingScraper.scrape()
results = LlmPricingScraper.scrape(DataSources.BOTGENUITY)
self.assertEqual(len(results),
0) # No data in table, expect empty list

Expand All @@ -26,7 +28,7 @@ def test_scrape_raises_error_on_failure(self, mock_request):

# Expect the scrape method to raise an exception
with self.assertRaises(Exception) as context:
LlmPricingScraper.scrape()
LlmPricingScraper.scrape(DataSources.BOTGENUITY)

self.assertTrue('Failed to retrieve the webpage' in str(context.exception))

Expand Down

0 comments on commit 7bb1012

Please sign in to comment.