EdmundMartin · fliot · Feb 4, 2023 · Feb 4, 2023 · Feb 4, 2023 · Feb 4, 2023
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # searchit
 Searchit is a library for async scraping of search engines. The library supports multiple search engines 
-(currently Google, Yandex, and Bing) with support for other search engines to come.
+(currently Bing, Google, and Qwant) with support for other search engines to come.
 
 # Install
 ```
@@ -12,17 +12,19 @@ Can be installed using pip, by running the above command.
 ```python
 import asyncio
 
-from searchit import GoogleScraper, YandexScraper, BingScraper
+from searchit import BingScraper, GoogleScraper, QwantScraper
 from searchit import ScrapeRequest
 
 request = ScrapeRequest("watch movies online", 30)
-google = GoogleScraper(max_results_per_page=10) # max_results = Number of results per page
-yandex = YandexScraper(max_results_per_page=10)
+bing = BingScraper(max_results_per_page=10)  # max_results_per_page = number of results per page
+google = GoogleScraper(max_results_per_page=10)
+qwant = QwantScraper(max_results_per_page=10)
 
 loop = asyncio.get_event_loop()
 
+results = loop.run_until_complete(bing.scrape(request))
 results = loop.run_until_complete(google.scrape(request))
-results = loop.run_until_complete(yandex.scrape(request))
+results = loop.run_until_complete(qwant.scrape(request))
 ```
 To use Searchit users first create a ScrapeRequest object, with term and number of results as required fields. 
 This object can then be passed to multiple different search engines and scraped asynchronously.
@@ -35,7 +37,7 @@ domain - Optional[str] - the domain to search, e.g. .com or .co.uk
 sleep - Optional[int] - time to wait between paginating pages - important to prevent getting blocked
 proxy - Optional[str] - proxy to be used to make request - default none
 language - Optional[str] - language to conduct search in (only Google atm)
-geo - Optional[str] - Geo location to conduct search from Yandex, and Qwant
+geo - Optional[str] - geo location to conduct the search from (only Qwant atm)
 ```
 
 ## Roadmap

diff --git a/searchit/__init__.py b/searchit/__init__.py
@@ -1,12 +1,12 @@
-from searchit.scrapers.google import GoogleScraper
-from searchit.scrapers.yandex import YandexScraper
-from searchit.scrapers.bing import BingScraper
 from searchit.scrapers.scraper import ScrapeRequest
+from searchit.scrapers.bing import BingScraper
+from searchit.scrapers.google import GoogleScraper
+from searchit.scrapers.qwant import QwantScraper
 
 
 __all__ = [
-    "GoogleScraper",
-    "YandexScraper",
-    "BingScraper",
     "ScrapeRequest",
+    "BingScraper",
+    "GoogleScraper",
+    "QwantScraper",
 ]
diff --git a/searchit/scrapers/bing.py b/searchit/scrapers/bing.py
@@ -22,7 +22,7 @@ def __init__(self, max_results_per_page: int = 10):
 
     def _parse_page(self, results: List[SearchResult], resp: ScrapeResponse) -> None:
         rank = len(results) + 1
-        soup = bs4.BeautifulSoup(resp.html)
+        soup = bs4.BeautifulSoup(resp.html, features="lxml")
         for block in soup.find_all("li", attrs={"class": "b_algo"}):
             link = block.find("a", href=True)
             if link:

diff --git a/searchit/scrapers/google.py b/searchit/scrapers/google.py
@@ -6,7 +6,6 @@
 from searchit.scrapers import SearchScraper, ScrapeRequest, SearchResult, ScrapeResponse
 from searchit.exceptions import BlockedException
 
-
 class GoogleScraper(SearchScraper):
 
     BASE_URL = "https://www.google{}/search?q={}&num={}&hl={}&start={}&filter=0"
@@ -16,7 +15,7 @@ def __init__(self, max_results_per_page: int = 100):
 
     def _parse_page(self, results: List[SearchResult], res: ScrapeResponse) -> None:
         rank = len(results) + 1
-        soup = bs4.BeautifulSoup(res.html)
+        soup = bs4.BeautifulSoup(res.html, features="lxml")
         for block in soup.find_all("div", attrs={"class": "g"}):
             link = block.find("a", href=True)
             if link:
@@ -29,7 +28,7 @@ def _parse_page(self, results: List[SearchResult], res: ScrapeResponse) -> None:
             if title:
                 title = title.get_text()
 
-            description = block.find("span", {"class": "st"})
+            description = block.find("div", { "data-content-feature": "1" })
             if description:
                 description = description.get_text()
             results.append(SearchResult(rank, link, title, description))

diff --git a/searchit/scrapers/qwant.py b/searchit/scrapers/qwant.py
@@ -6,7 +6,6 @@
 from searchit.scrapers import SearchScraper, ScrapeRequest, SearchResult, ScrapeResponse
 from searchit.exceptions import BlockedException, ConfigException
 
-
 def _check_config(max_results: int):
     if max_results > 10:
         raise ConfigException("Qwant max results per page cannot be larger than 10")
@@ -16,8 +15,8 @@ def _check_config(max_results: int):
 class QwantScraper(SearchScraper):
 
     BASE_URL = (
-        "https://api.qwant.com/api/search/web?count={}&offset={}"
-        "&q={}&t=web&device=tablet&extensionDisabled=true&safesearch=0&locale={}&uiv=4"
+        "https://api.qwant.com/v3/search/web?count={}&offset={}"
+        "&q={}&t=web&device=desktop&extensionDisabled=true&safesearch=0&locale={}&uiv=4"
     )
 
     def __init__(self, max_results_per_page: int = 10):
@@ -36,13 +35,14 @@ async def _scrape_one(
             except ClientError as err:
                 raise err
 
-    def _parse_json(self, results: List[SearchResult], resp: ScrapeResponse) -> None:
-        data = resp.json["data"]["result"]["items"]
+    def _parse_json(self, results: List[SearchResult], resp: ScrapeResponse, n: int) -> None:
+        data = resp.json['data']['result']['items']['mainline'][-1]['items']
         for search_result in data:
+            n += 1
             title = search_result.get("title")
             url = search_result.get("url")
-            description = search_result.get("description")
-            position = search_result.get("position")
+            description = search_result.get("desc")
+            position = n
             results.append(SearchResult(position, url, title, description))
 
     def _paginate(self, term: str, _: str, geo: str, count: int):
@@ -62,12 +62,48 @@ def _check_exceptions(self, res: ScrapeResponse) -> None:
     async def scrape(self, req: ScrapeRequest) -> List[SearchResult]:
+        """Scrape every Qwant result page for ``req`` and return the results.
+
+        Pages are requested one at a time, sleeping ``req.sleep`` between
+        requests but not after the last one. ``req.geo`` (``en_GB`` when
+        unset) is handed to ``_paginate`` along with the term and count.
+        """
         geo = req.geo if req.geo else "en_GB"
         urls = self._paginate(req.term, "", geo, req.count)
+        headers = {
+            'authority': 'api.qwant.com',
+            'accept': 'application/json, text/plain, */*',
+            'accept-language': 'fr,fr-FR;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,es-CO;q=0.5,es;q=0.4',
+            'cache-control': 'no-cache',
+            'dnt': '1',
+            'origin': 'https://www.qwant.com',
+            'pragma': 'no-cache',
+            'referer': 'https://www.qwant.com/',
+            'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-site',
+            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70'
+        }
-        headers = self.user_agent()
+        # The browser-like defaults above used to be discarded by rebinding
+        # ``headers``. Merge the scraper's own header(s) over them instead,
+        # dropping same-named defaults case-insensitively so nothing is sent
+        # twice. Assumes user_agent() returns a header mapping -- TODO confirm.
+        overrides = self.user_agent()
+        override_names = {name.lower() for name in overrides}
+        headers = {
+            name: value
+            for name, value in headers.items()
+            if name.lower() not in override_names
+        }
+        headers.update(overrides)
         results = []
         for idx, uri in enumerate(urls):
             response = await self._scrape_one(uri, headers, req.proxy)
             self._check_exceptions(response)
-            self._parse_json(results, response)
+            # Continue numbering from the results gathered so far; ``idx * 10``
+            # is only right when every page yields exactly ten items.
+            self._parse_json(results, response, len(results))
             if not idx == len(urls) - 1:
                 await asyncio.sleep(req.sleep)
         return results
diff --git a/searchit/scrapers/yandex.py b/searchit/scrapers/yandex.py
diff --git a/setup.py b/setup.py
@@ -14,6 +14,7 @@
 install_requires = [
     'aiohttp>=3.6.2',
     'beautifulsoup4>=4.8.2',
+    'lxml'
 ]
 
 

diff --git a/tests/test_yandex.py b/tests/test_yandex.py