lxml, Google description, Qwant, and removing Yandex. #7

Open: wants to merge 5 commits into master
14 changes: 8 additions & 6 deletions README.md
@@ -1,6 +1,6 @@
# searchit
Searchit is a library for async scraping of search engines. The library supports multiple search engines
(currently Google, Yandex, and Bing) with support for other search engines to come.
(currently Bing, Google, and Qwant) with support for other search engines to come.

# Install
```
@@ -12,17 +12,19 @@ Can be installed using pip, by running the above command.
```python
import asyncio

from searchit import GoogleScraper, YandexScraper, BingScraper
from searchit import BingScraper, GoogleScraper, QwantScraper
from searchit import ScrapeRequest

request = ScrapeRequest("watch movies online", 30)
google = GoogleScraper(max_results_per_page=10) # max_results = Number of results per page
yandex = YandexScraper(max_results_per_page=10)
bing = BingScraper(max_results_per_page=10) # max_results = Number of results per page
google = GoogleScraper(max_results_per_page=10)
qwant = QwantScraper(max_results_per_page=10)

loop = asyncio.get_event_loop()

results = loop.run_until_complete(bing.scrape(request))
results = loop.run_until_complete(google.scrape(request))
results = loop.run_until_complete(yandex.scrape(request))
results = loop.run_until_complete(qwant.scrape(request))
```
To use Searchit, users first create a ScrapeRequest object, with the search term and the number of results as required fields.
This object can then be passed to multiple different search engines and scraped asynchronously.
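
The example above awaits each engine one after another with run_until_complete; because scrape is a coroutine, the engines can also be gathered concurrently. A minimal sketch, assuming only the public API shown in the snippet above:

```python
import asyncio

from searchit import BingScraper, GoogleScraper, QwantScraper
from searchit import ScrapeRequest


async def scrape_all():
    request = ScrapeRequest("watch movies online", 30)
    scrapers = [
        BingScraper(max_results_per_page=10),
        GoogleScraper(max_results_per_page=10),
        QwantScraper(max_results_per_page=10),
    ]
    # asyncio.gather schedules the three scrape coroutines on the same event loop,
    # so the engines are queried concurrently rather than back to back.
    return await asyncio.gather(*(s.scrape(request) for s in scrapers))


bing_results, google_results, qwant_results = asyncio.run(scrape_all())
```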
@@ -35,7 +37,7 @@ domain - Optional[str] - the domain to search i.e. .com or .com
sleep - Optional[int] - time to wait between paginating pages - important to prevent getting blocked
proxy - Optional[str] - proxy to be used to make request - default none
language - Optional[str] - language to conduct search in (only Google atm)
geo - Optional[str] - Geo location to conduct search from Yandex, and Qwant
geo - Optional[str] - Geo location to conduct search from Qwant
```

## Roadmap
12 changes: 6 additions & 6 deletions searchit/__init__.py
@@ -1,12 +1,12 @@
from searchit.scrapers.google import GoogleScraper
from searchit.scrapers.yandex import YandexScraper
from searchit.scrapers.bing import BingScraper
from searchit.scrapers.scraper import ScrapeRequest
from searchit.scrapers.bing import BingScraper
from searchit.scrapers.google import GoogleScraper
from searchit.scrapers.qwant import QwantScraper


__all__ = [
"GoogleScraper",
"YandexScraper",
"BingScraper",
"ScrapeRequest",
"BingScraper",
"GoogleScraper",
"QwantScraper",
]
2 changes: 1 addition & 1 deletion searchit/scrapers/bing.py
@@ -22,7 +22,7 @@ def __init__(self, max_results_per_page: int = 10):

def _parse_page(self, results: List[SearchResult], resp: ScrapeResponse) -> None:
rank = len(results) + 1
soup = bs4.BeautifulSoup(resp.html)
soup = bs4.BeautifulSoup(resp.html, features="lxml")
for block in soup.find_all("li", attrs={"class": "b_algo"}):
link = block.find("a", href=True)
if link:
5 changes: 2 additions & 3 deletions searchit/scrapers/google.py
@@ -6,7 +6,6 @@
from searchit.scrapers import SearchScraper, ScrapeRequest, SearchResult, ScrapeResponse
from searchit.exceptions import BlockedException


class GoogleScraper(SearchScraper):

BASE_URL = "https://www.google{}/search?q={}&num={}&hl={}&start={}&filter=0"
@@ -16,7 +15,7 @@ def __init__(self, max_results_per_page: int = 100):

def _parse_page(self, results: List[SearchResult], res: ScrapeResponse) -> None:
rank = len(results) + 1
soup = bs4.BeautifulSoup(res.html)
soup = bs4.BeautifulSoup(res.html, features="lxml")
for block in soup.find_all("div", attrs={"class": "g"}):
link = block.find("a", href=True)
if link:
@@ -29,7 +28,7 @@ def _parse_page(self, results: List[SearchResult], res: ScrapeResponse) -> None:
if title:
title = title.get_text()

description = block.find("span", {"class": "st"})
description = block.find("div", { "data-content-feature": "1" })
if description:
description = description.get_text()
results.append(SearchResult(rank, link, title, description))
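
Google rotates its result markup, which is why this diff swaps the old span.st snippet container for div[data-content-feature="1"]. A small sketch of a more defensive extraction that tries the new selector first and falls back to the old one (both selectors are taken from the two sides of the diff above; the helper name is hypothetical):

```python
from typing import Optional

import bs4


def extract_description(block: bs4.element.Tag) -> Optional[str]:
    """Return the snippet text for one result block, trying the newer
    container first and falling back to the one it replaced."""
    candidates = (
        ("div", {"data-content-feature": "1"}),  # selector introduced in this PR
        ("span", {"class": "st"}),               # selector it replaces
    )
    for name, attrs in candidates:
        node = block.find(name, attrs)
        if node:
            return node.get_text()
    return None
```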
33 changes: 25 additions & 8 deletions searchit/scrapers/qwant.py
@@ -6,7 +6,6 @@
from searchit.scrapers import SearchScraper, ScrapeRequest, SearchResult, ScrapeResponse
from searchit.exceptions import BlockedException, ConfigException


def _check_config(max_results: int):
if max_results > 10:
raise ConfigException("Qwant max results per page cannot be larger than 10")
@@ -16,8 +15,8 @@ def _check_config(max_results: int):
class QwantScraper(SearchScraper):

BASE_URL = (
"https://api.qwant.com/api/search/web?count={}&offset={}"
"&q={}&t=web&device=tablet&extensionDisabled=true&safesearch=0&locale={}&uiv=4"
"https://api.qwant.com/v3/search/web?count={}&offset={}"
"&q={}&t=web&device=desktop&extensionDisabled=true&safesearch=0&locale={}&uiv=4"
)

def __init__(self, max_results_per_page: int = 10):
@@ -36,13 +35,14 @@ async def _scrape_one(
except ClientError as err:
raise err

def _parse_json(self, results: List[SearchResult], resp: ScrapeResponse) -> None:
data = resp.json["data"]["result"]["items"]
def _parse_json(self, results: List[SearchResult], resp: ScrapeResponse, n: int) -> None:
data = resp.json['data']['result']['items']['mainline'][-1]['items']
for search_result in data:
n += 1
title = search_result.get("title")
url = search_result.get("url")
description = search_result.get("description")
position = search_result.get("position")
description = search_result.get("desc")
position = n
results.append(SearchResult(position, url, title, description))

def _paginate(self, term: str, _: str, geo: str, count: int):
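
The new _parse_json walks resp.json['data']['result']['items']['mainline'][-1]['items'] and reads the 'title', 'url', and 'desc' keys. A tiny self-contained illustration of that path on a hand-built payload (the payload shape is inferred from the code above, not from Qwant's documentation, so treat it as an assumption):

```python
# Hypothetical payload mimicking the shape the parser expects; only the keys
# the parser actually touches are included.
fake_response = {
    "data": {"result": {"items": {"mainline": [
        {"type": "ads", "items": []},
        {"type": "web", "items": [
            {"title": "Example", "url": "https://example.com", "desc": "An example result"},
        ]},
    ]}}}
}

# The [-1] index assumes the web results are the last "mainline" block.
items = fake_response["data"]["result"]["items"]["mainline"][-1]["items"]
for position, item in enumerate(items, start=1):
    print(position, item["url"], item["title"], item["desc"])
```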
@@ -62,12 +62,29 @@ def _check_exceptions(self, res: ScrapeResponse) -> None:
async def scrape(self, req: ScrapeRequest) -> List[SearchResult]:
geo = req.geo if req.geo else "en_GB"
urls = self._paginate(req.term, "", geo, req.count)
headers = {
Owner comment: This doesn't actually get applied to the request: as on line 83, the headers are overridden by the call to self.user_agent(), which is probably what should be providing the headers (a merging approach is sketched after this diff).

'authority': 'api.qwant.com',
'accept': 'application/json, text/plain, */*',
'accept-language': 'fr,fr-FR;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,es-CO;q=0.5,es;q=0.4',
'cache-control': 'no-cache',
'dnt': '1',
'origin': 'https://www.qwant.com',
'pragma': 'no-cache',
'referer': 'https://www.qwant.com/',
'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70'
}
headers = self.user_agent()
results = []
for idx, uri in enumerate(urls):
response = await self._scrape_one(uri, headers, req.proxy)
self._check_exceptions(response)
self._parse_json(results, response)
self._parse_json(results, response, idx * 10)
if not idx == len(urls) - 1:
await asyncio.sleep(req.sleep)
return results
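
Following the owner's comment above: rather than building a header dict that is then thrown away by headers = self.user_agent(), the two could be merged. A sketch, assuming self.user_agent() returns a plain dict of request headers (the helper name is made up for illustration):

```python
from typing import Dict


def build_headers(user_agent_headers: Dict[str, str]) -> Dict[str, str]:
    """Combine Qwant's browser-like request headers with the scraper's
    rotating user agent instead of discarding one of the two."""
    headers = {
        "accept": "application/json, text/plain, */*",
        "origin": "https://www.qwant.com",
        "referer": "https://www.qwant.com/",
    }
    headers.update(user_agent_headers)  # the user-agent entry wins on any clash
    return headers


# Inside scrape() this would replace the two assignments to `headers`:
# headers = build_headers(self.user_agent())
```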
90 changes: 0 additions & 90 deletions searchit/scrapers/yandex.py

This file was deleted.

1 change: 1 addition & 0 deletions setup.py
@@ -14,6 +14,7 @@
install_requires = [
'aiohttp>=3.6.2',
'beautifulsoup4>=4.8.2',
'lxml'
Owner comment: This is an extra dependency which is probably not strictly required to use the package, and the version is not pinned. It would probably be better to allow the user to provide the HTML parser implementation they want and default to 'html.parser' if one is not provided (see the sketch after this diff).

]


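
Picking up the review comment on the lxml dependency, a minimal sketch of letting callers choose the BeautifulSoup backend while defaulting to the stdlib html.parser (the parser keyword on the scraper constructor is an assumption, not existing API):

```python
import bs4


def make_soup(html: str, parser: str = "html.parser") -> bs4.BeautifulSoup:
    """Parse with a caller-chosen backend; defaulting to html.parser keeps
    lxml an optional speed-up rather than a hard install requirement."""
    return bs4.BeautifulSoup(html, features=parser)


# Hypothetical usage if the constructors grew a parser argument:
# GoogleScraper(max_results_per_page=10, parser="lxml")
```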
17 changes: 0 additions & 17 deletions tests/test_yandex.py

This file was deleted.