diff --git a/config.json b/config.json index 7a7e025..4139f9f 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { "quizlet": true, "quizizz": true, - "brainly": false, + "search_engine": 0, "hide_show_key": "Ctrl+D", "ocr_key": "Ctrl+Shift+X", "paste_key": "Ctrl+Shift+V", diff --git a/gui.pyw b/gui.pyw index bc92d8b..ae81d4a 100644 --- a/gui.pyw +++ b/gui.pyw @@ -19,7 +19,7 @@ import tkinter as tk root = tk.Tk() root.withdraw() -from scraper import Searchify +from scraper import Searchify, SearchEngine from textshot import * from windoweffect import WindowEffect @@ -96,17 +96,13 @@ class UI(QMainWindow): self.status_label = self.findChild(QtWidgets.QLabel, "status_label") self.quizlet_button = self.findChild(QtWidgets.QPushButton, "quizlet_button") self.quizizz_button = self.findChild(QtWidgets.QPushButton, "quizizz_button") - self.brainly_button = self.findChild(QtWidgets.QPushButton, "brainly_button") self.settings_button = self.findChild(QtWidgets.QPushButton, "settings_button") self.quizlet_button.setChecked(self.conf['quizlet']) self.quizizz_button.setChecked(self.conf['quizizz']) - self.brainly_button.setChecked(self.conf['brainly']) self.quizlet_button.toggled.connect(lambda: self.updatejson('quizlet')) self.quizizz_button.toggled.connect(lambda: self.updatejson('quizizz')) - self.brainly_button.toggled.connect(lambda: self.updatejson('brainly')) - self.settings_button.clicked.connect(lambda: self.stackedWidget.setCurrentIndex(1)) @@ -115,7 +111,6 @@ class UI(QMainWindow): self.quizizz_button.setIcon(QtGui.QIcon(resource_path("img\\quizizz.png"))) self.quizlet_button.setIcon(QtGui.QIcon(resource_path("img\\quizlet.png"))) - self.brainly_button.setIcon(QtGui.QIcon(resource_path("img\\brainly.png"))) self.titleIcon.setPixmap(QtGui.QPixmap(resource_path("img\\search.png"))) @@ -203,6 +198,11 @@ class UI(QMainWindow): self.setting_on_top.setChecked(self.conf['on_top']) self.setting_on_top.toggled.connect(lambda: self.set_window_on_top()) + + self.search_engine_combo = self.findChild(QtWidgets.QComboBox, "search_engine_combo") + self.search_engine_combo.setCurrentIndex(self.conf['search_engine']) + self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower()) + self.search_engine_combo.currentIndexChanged.connect(lambda: self.run_search_engine()) # window theme self.themeInput = self.findChild(QtWidgets.QComboBox, "themeInput") @@ -291,7 +291,7 @@ class UI(QMainWindow): font_size = self.font_size.value() # icon sizes - for obj in [self.quizizz_button, self.quizlet_button, self.brainly_button]: + for obj in [self.quizizz_button, self.quizlet_button]: obj.setIconSize(QtCore.QSize(font_size*2, font_size*2)) @@ -313,6 +313,10 @@ class UI(QMainWindow): # calling scraper and adding to ui + def run_search_engine(self): + self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower()) + self.updatejson('search_engine') + def run_searcher(self): query = self.search_bar.text().strip() @@ -328,14 +332,13 @@ class UI(QMainWindow): if self.quizizz_button.isChecked(): sites.append('quizizz') if self.quizlet_button.isChecked(): sites.append('quizlet') - if self.brainly_button.isChecked(): sites.append('brainly') if not sites: self.status_label.setText('Please select at least one site.') self.search_frame.setEnabled(True) return - searchify = Searchify(query, sites) + searchify = Searchify(query, sites, self.search_engine) t = Thread(target=searchify.main) t.daemon = True @@ -530,7 +533,6 @@ class UI(QMainWindow): # keybinds "quizlet": lambda: self.quizlet_button.isChecked(), "quizizz": lambda: self.quizizz_button.isChecked(), - "brainly": lambda: self.brainly_button.isChecked(), "hide_show_key": lambda: self.hide_show_key.keySequence().toString(), "ocr_key": lambda: self.ocr_key.keySequence().toString(), "paste_key": lambda: self.paste_key.keySequence().toString(), @@ -550,6 +552,7 @@ class UI(QMainWindow): "hide_taskbar": lambda: self.setting_hide_taskbar.isChecked(), "theme": lambda: self.themeInput.currentIndex(), "font_size": lambda: self.font_size.value(), + "search_engine": lambda: self.search_engine_combo.currentIndex(), } def updatejson(self, key): diff --git a/img/brainly.png b/img/brainly.png deleted file mode 100644 index 8c1df69..0000000 Binary files a/img/brainly.png and /dev/null differ diff --git a/requirements.txt b/requirements.txt index b49f6b6..701b68a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,44 @@ +appdirs==1.4.4 beautifulsoup4==4.10.0 +bs4==0.0.1 certifi==2021.10.8 cffi==1.15.0 charset-normalizer==2.0.9 +colorama==0.4.4 +cssselect==1.1.0 +fake-headers==1.0.2 +fake-useragent==0.1.11 gevent==21.12.0 greenlet==1.1.2 grequests==0.6.0 +html5lib==1.1 idna==3.3 +importlib-metadata==4.11.3 keyboard==0.13.5 -Pillow==8.4.0 +lxml==4.8.0 +packaging==21.3 +parse==1.19.0 +Pillow==9.1.0 pycparser==2.21 +pyee==8.2.2 +pyparsing==3.0.8 pyperclip==1.8.2 +pyppeteer==1.0.2 PyQt5==5.15.6 PyQt5-Qt5==5.15.2 PyQt5-sip==12.9.0 -pytesseract==0.3.8 +pyquery==1.4.3 +pytesseract==0.3.9 pywin32==303 requests==2.26.0 +requests-html==0.10.0 +six==1.16.0 soupsieve==2.3.1 +tqdm==4.64.0 urllib3==1.26.7 +w3lib==1.22.0 +webencodings==0.5.1 +websockets==10.2 +zipp==3.8.0 zope.event==4.5.0 zope.interface==5.4.0 diff --git a/scraper.py b/scraper.py index f923bfb..659bfed 100644 --- a/scraper.py +++ b/scraper.py @@ -1,71 +1,124 @@ +import grequests import json from bs4 import BeautifulSoup from difflib import SequenceMatcher import json -import grequests +from requests_html import HTMLSession +from fake_headers import Headers import re import sys import time from urllib.parse import urlencode from threading import Thread - headers = { - "Connection": "keep-alive", - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53', - "Sec-Fetch-Site": "same-origin", + "Sec-Ch-Ua": "\"(Not(A:Brand\";v=\"8\", \"Chromium\";v=\"99\"", + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": "\"Windows\"", + "Upgrade-Insecure-Requests": "1", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", - "Referer": "https://www.bing.com/", - "Accept-Language": "en-US,en;q=0.9" + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "en-US,en;q=0.9", + "Connection": "close" } - get_text = lambda x: BeautifulSoup(x, features='lxml').get_text().strip() sluggify = lambda a: ' '.join(re.sub(r'[^\\sa-z0-9\\.,\\(\\)]+', ' ', a.lower()).split()) similar = lambda a, b: SequenceMatcher(None, sluggify(a), sluggify(b)).ratio() remove_duplicates = lambda a: list(set(a)) +def _make_headers(): + return {**headers, **Headers(headers=True, browser='chrome', os='windows').generate()} -class SearchBing: +class SearchEngine: + headers = headers.copy() + def __init__(self, engine_name): + self.sess = HTMLSession() + self.engine_name = engine_name + self._web_engines = { # simple scrapers using get requests + 'google': ('https://www.google.com/search?', 'q', {'aqs': 'chrome..69i57.888j0j1', 'sourceid': 'chrome', 'ie': 'UTF-8'}), + 'bing': ('https://www.bing.com/search?', 'q', {'pq': ''}), + } + if engine_name in self._web_engines: + return + elif engine_name == 'startpage': + print('Starting startpage instance...') + self.t = Thread(target=self._init_startpage) + self.t.daemon = True + self.t.start() + + def find_items(self, soup, args): + return {i: soup.find('input', {'type': 'hidden', 'name': i})['value'] for i in args} + + def get_startpage_items(self, r): + soup = BeautifulSoup(r.text, 'lxml') + return {'query': None, 'cat': 'web', **self.find_items(soup, ['lui', 'language', 'sc', 'abp'])} + + def _init_startpage(self): + self._startpage_data = self.get_startpage_items(self.sess.get('https://www.startpage.com/', headers=self.headers)) + self.headers.update({"Sec-Fetch-Site": "same-origin", 'Referer': 'https://www.startpage.com/'}) + + def startpage_get_page(self, query, sites): + self.t.join() + resps = grequests.map([ + grequests.post('https://www.startpage.com/sp/search', + headers=self.headers, + data={**self._startpage_data, **{'query': f'{query} site:{site}.com'}} + ) + for site in sites + ]) + self.t = Thread(target=self.get_startpage_items, args=(resps[-1],)) + self.t.daemon = True + self.t.start() + return dict(zip(sites, resps)) + + def get_page(self, query, sites): + if self.engine_name == 'startpage': + return self.startpage_get_page(query, sites) + return dict(zip( + sites, + grequests.map([ + grequests.get( + (web_engine := self._web_engines[self.engine_name])[0] + + urlencode({web_engine[1]: f'{query} site:{site}.com', **web_engine[2]}), + headers=self.headers, session=self.sess + ) + for site in sites + ], size=len(sites)) + )) + + +class SearchWeb: """ - search bing for query + search web for query """ - def __init__(self, query, sites): + def __init__(self, query, sites, engine): self.query = query self.links = None self.sites = sites + self.engine = engine self._regex_objs = { 'quizlet': re.compile('https?://quizlet.com/\d+/[a-z0-9\\-]+/'), 'quizizz': re.compile('https?://quizizz.com/admin/quiz/[a-f0-9]+/[a-z\\-]+'), - 'brainly': re.compile('https?://brainly.com/question/\d+'), } def search(self): """ - search bing for query + search web for query """ - resps = dict(zip( - self.sites, - grequests.map([ - grequests.get( - 'https://www.bing.com/search?' - + urlencode({'q': self.query + f' site:{site}.com'}), - headers=headers, - ) - for site in self.sites - ], size=len(self.sites)) - )) - + resps = self.engine.get_page(self.query, self.sites) self.links = { site: remove_duplicates(re.findall(self._regex_objs[site], resps[site].text)) for site in self.sites } - class QuizizzScraper: def __init__(self, links, query): self.links = links @@ -74,7 +127,7 @@ def __init__(self, links, query): self.query = query def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] + reqs = [grequests.get(u, headers=_make_headers()) for u in links] self.resps = grequests.map(reqs, size=len(reqs)) def parse_links(self): @@ -136,7 +189,7 @@ def __init__(self, links, query): self._regex_obj = re.compile('\\= \\{"alphabeticalIsDifferent.*\\}; QLoad\\(') def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] + reqs = [grequests.get(u, headers=_make_headers()) for u in links] self.resps = grequests.map(reqs, size=len(reqs)) def parse_links(self): @@ -170,61 +223,6 @@ def quizlet_parser(self, resp): ) - -class BrainlyScraper: - def __init__(self, links, query): - self.links = links - self.resps = None - self.brainlys = [] - self.query = query - - def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] - self.resps = grequests.map(reqs, size=len(reqs)) - - def parse_links(self): - self.async_requests(self.links) - for resp in self.resps: - try: - self.brainlys.append(self.brainly_parser(resp)) - except Exception as e: - print('exception', e, resp.url) - # pass # skip over any errors - return self.brainlys - - - def brainly_parser(self, resp): - data = json.loads(BeautifulSoup(resp.text, features='lxml').find('script', type="application/ld+json").string)[0] - answers = [] - if 'acceptedAnswer' in data['mainEntity']: - answers += data['mainEntity']['acceptedAnswer'] - if 'suggestedAnswer' in data['mainEntity']: - answers += data['mainEntity']['suggestedAnswer'] - - return max( - ( - { - 'question': data['name'].strip(), - 'answer': get_text(i['text']) - .replace('Answer:', 'Answer: ') - .replace('Explanation:', '\nExplanation: ') - + '\nUpvotes: ' - + str(i['upvoteCount']), - 'similarity': ( - similar(data['name'], self.query), - True, - i['upvoteCount'], - ), - 'url': resp.url, - } - for i in answers - ), - key=lambda x: x['similarity'], - ) - - - - class TimeLogger: def __init__(self): self.elapsed_total = time.time() @@ -261,19 +259,17 @@ def print_timers(self): - - class Searchify: - def __init__(self, query, sites): + def __init__(self, query, sites, engine): self.query = query self.sites = sites + self.engine = engine self.timer = TimeLogger() self.flashcards = [] self.links = [] self.site_scrapers = { 'quizlet': QuizletScraper, 'quizizz': QuizizzScraper, - 'brainly': BrainlyScraper, } def main(self): @@ -306,8 +302,8 @@ def _flashcard_thread(self, site_scraper, links, site_name): def get_links(self): - self.timer.start('bing search') - search_bing = SearchBing(self.query, self.sites) + self.timer.start('web search') + search_bing = SearchWeb(self.query, self.sites, self.engine) search_bing.search() self.timer.end() self.links = search_bing.links @@ -328,10 +324,11 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement if __name__ == '__main__' and len(sys.argv) > 1: # argument parsing import argparse - parser = argparse.ArgumentParser(description='Search Bing for flashcards') + parser = argparse.ArgumentParser(description='Search the web for flashcards') parser.add_argument('--query', '-q', help='query to search for', default=None) parser.add_argument('--output', '-o', help='output file', default=None) - parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz,brainly (comma seperated list)', default='quizlet,quizizz,brainly') + parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz (comma seperated list)', default='quizlet,quizizz') + parser.add_argument('--engine', '-e', help='search engine to use (google, bing)', default='bing') args = parser.parse_args() if args.output: @@ -348,15 +345,20 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement flashcards = [] # create flashcard list sites = args.sites.lower().split(',') # get list of sites + engine_name = args.engine.lower().strip() # get search engine + # start search engine + engine = SearchEngine(engine_name) + # run search s = Searchify( query=args.query, sites=sites, + engine=engine, ) s.main() write(json.dumps(s.flashcards, indent=4)) - print(str(len(s.flashcards))+ ' flashcards found') + print(f'{len(s.flashcards)} flashcards found') s.timer.print_timers() \ No newline at end of file diff --git a/textshot.py b/textshot.py index 1c215e0..cae8cc3 100644 --- a/textshot.py +++ b/textshot.py @@ -134,8 +134,15 @@ def mouseReleaseEvent(self, event): return super().mouseReleaseEvent(event) self.hide() + QtWidgets.QApplication.restoreOverrideCursor() QtWidgets.QApplication.processEvents() - shot = self.screen.copy(QtCore.QRect(self.start, self.end)) + + shot = self.screen.copy( + min(self.start.x(), self.end.x()), + min(self.start.y(), self.end.y()), + abs(self.start.x() - self.end.x()), + abs(self.start.y() - self.end.y()), + ) self.processImage(shot) self.quit_app() print('done') diff --git a/window.ui b/window.ui index ae39845..9473920 100644 --- a/window.ui +++ b/window.ui @@ -204,28 +204,6 @@ - - - - - 30 - 30 - - - - Brainly - - - true - - - false - - - true - - - @@ -404,9 +382,9 @@ 0 - 0 - 379 - 631 + -77 + 656 + 465 @@ -775,26 +753,97 @@ true - - + + - Search after running OCR + Search after pasting true - - + + - Search after pasting + Search after running OCR true + + + + + + + 150 + 0 + + + + + Bing + + + + + Google + + + + + Startpage + + + + + + + + + 100 + 0 + + + + Search engine: + + + + + + + Qt::Horizontal + + + + 40 + 20 + + + + + + + + Qt::Horizontal + + + QSizePolicy::Fixed + + + + 70 + 20 + + + + + + @@ -972,7 +1021,7 @@ Window transparency - 15 + 5 100