diff --git a/config.json b/config.json
index 7a7e025..4139f9f 100644
--- a/config.json
+++ b/config.json
@@ -1,7 +1,7 @@
{
"quizlet": true,
"quizizz": true,
- "brainly": false,
+ "search_engine": 0,
"hide_show_key": "Ctrl+D",
"ocr_key": "Ctrl+Shift+X",
"paste_key": "Ctrl+Shift+V",
diff --git a/gui.pyw b/gui.pyw
index bc92d8b..ae81d4a 100644
--- a/gui.pyw
+++ b/gui.pyw
@@ -19,7 +19,7 @@ import tkinter as tk
root = tk.Tk()
root.withdraw()
-from scraper import Searchify
+from scraper import Searchify, SearchEngine
from textshot import *
from windoweffect import WindowEffect
@@ -96,17 +96,13 @@ class UI(QMainWindow):
self.status_label = self.findChild(QtWidgets.QLabel, "status_label")
self.quizlet_button = self.findChild(QtWidgets.QPushButton, "quizlet_button")
self.quizizz_button = self.findChild(QtWidgets.QPushButton, "quizizz_button")
- self.brainly_button = self.findChild(QtWidgets.QPushButton, "brainly_button")
self.settings_button = self.findChild(QtWidgets.QPushButton, "settings_button")
self.quizlet_button.setChecked(self.conf['quizlet'])
self.quizizz_button.setChecked(self.conf['quizizz'])
- self.brainly_button.setChecked(self.conf['brainly'])
self.quizlet_button.toggled.connect(lambda: self.updatejson('quizlet'))
self.quizizz_button.toggled.connect(lambda: self.updatejson('quizizz'))
- self.brainly_button.toggled.connect(lambda: self.updatejson('brainly'))
-
self.settings_button.clicked.connect(lambda: self.stackedWidget.setCurrentIndex(1))
@@ -115,7 +111,6 @@ class UI(QMainWindow):
self.quizizz_button.setIcon(QtGui.QIcon(resource_path("img\\quizizz.png")))
self.quizlet_button.setIcon(QtGui.QIcon(resource_path("img\\quizlet.png")))
- self.brainly_button.setIcon(QtGui.QIcon(resource_path("img\\brainly.png")))
self.titleIcon.setPixmap(QtGui.QPixmap(resource_path("img\\search.png")))
@@ -203,6 +198,11 @@ class UI(QMainWindow):
self.setting_on_top.setChecked(self.conf['on_top'])
self.setting_on_top.toggled.connect(lambda: self.set_window_on_top())
+
+ self.search_engine_combo = self.findChild(QtWidgets.QComboBox, "search_engine_combo")
+ self.search_engine_combo.setCurrentIndex(self.conf['search_engine'])
+ self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower())
+ self.search_engine_combo.currentIndexChanged.connect(lambda: self.run_search_engine())
# window theme
self.themeInput = self.findChild(QtWidgets.QComboBox, "themeInput")
@@ -291,7 +291,7 @@ class UI(QMainWindow):
font_size = self.font_size.value()
# icon sizes
- for obj in [self.quizizz_button, self.quizlet_button, self.brainly_button]:
+ for obj in [self.quizizz_button, self.quizlet_button]:
obj.setIconSize(QtCore.QSize(font_size*2, font_size*2))
@@ -313,6 +313,10 @@ class UI(QMainWindow):
# calling scraper and adding to ui
+ def run_search_engine(self):
+ self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower())
+ self.updatejson('search_engine')
+
def run_searcher(self):
query = self.search_bar.text().strip()
@@ -328,14 +332,13 @@ class UI(QMainWindow):
if self.quizizz_button.isChecked(): sites.append('quizizz')
if self.quizlet_button.isChecked(): sites.append('quizlet')
- if self.brainly_button.isChecked(): sites.append('brainly')
if not sites:
self.status_label.setText('Please select at least one site.')
self.search_frame.setEnabled(True)
return
- searchify = Searchify(query, sites)
+ searchify = Searchify(query, sites, self.search_engine)
t = Thread(target=searchify.main)
t.daemon = True
@@ -530,7 +533,6 @@ class UI(QMainWindow):
# keybinds
"quizlet": lambda: self.quizlet_button.isChecked(),
"quizizz": lambda: self.quizizz_button.isChecked(),
- "brainly": lambda: self.brainly_button.isChecked(),
"hide_show_key": lambda: self.hide_show_key.keySequence().toString(),
"ocr_key": lambda: self.ocr_key.keySequence().toString(),
"paste_key": lambda: self.paste_key.keySequence().toString(),
@@ -550,6 +552,7 @@ class UI(QMainWindow):
"hide_taskbar": lambda: self.setting_hide_taskbar.isChecked(),
"theme": lambda: self.themeInput.currentIndex(),
"font_size": lambda: self.font_size.value(),
+ "search_engine": lambda: self.search_engine_combo.currentIndex(),
}
def updatejson(self, key):
diff --git a/img/brainly.png b/img/brainly.png
deleted file mode 100644
index 8c1df69..0000000
Binary files a/img/brainly.png and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index b49f6b6..701b68a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,44 @@
+appdirs==1.4.4
beautifulsoup4==4.10.0
+bs4==0.0.1
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.9
+colorama==0.4.4
+cssselect==1.1.0
+fake-headers==1.0.2
+fake-useragent==0.1.11
gevent==21.12.0
greenlet==1.1.2
grequests==0.6.0
+html5lib==1.1
idna==3.3
+importlib-metadata==4.11.3
keyboard==0.13.5
-Pillow==8.4.0
+lxml==4.8.0
+packaging==21.3
+parse==1.19.0
+Pillow==9.1.0
pycparser==2.21
+pyee==8.2.2
+pyparsing==3.0.8
pyperclip==1.8.2
+pyppeteer==1.0.2
PyQt5==5.15.6
PyQt5-Qt5==5.15.2
PyQt5-sip==12.9.0
-pytesseract==0.3.8
+pyquery==1.4.3
+pytesseract==0.3.9
pywin32==303
requests==2.26.0
+requests-html==0.10.0
+six==1.16.0
soupsieve==2.3.1
+tqdm==4.64.0
urllib3==1.26.7
+w3lib==1.22.0
+webencodings==0.5.1
+websockets==10.2
+zipp==3.8.0
zope.event==4.5.0
zope.interface==5.4.0
diff --git a/scraper.py b/scraper.py
index f923bfb..659bfed 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,71 +1,124 @@
+import grequests
import json
from bs4 import BeautifulSoup
from difflib import SequenceMatcher
import json
-import grequests
+from requests_html import HTMLSession
+from fake_headers import Headers
import re
import sys
import time
from urllib.parse import urlencode
from threading import Thread
-
headers = {
- "Connection": "keep-alive",
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53',
- "Sec-Fetch-Site": "same-origin",
+ "Sec-Ch-Ua": "\"(Not(A:Brand\";v=\"8\", \"Chromium\";v=\"99\"",
+ "Sec-Ch-Ua-Mobile": "?0",
+ "Sec-Ch-Ua-Platform": "\"Windows\"",
+ "Upgrade-Insecure-Requests": "1",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+ "Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
- "Referer": "https://www.bing.com/",
- "Accept-Language": "en-US,en;q=0.9"
+ "Accept-Encoding": "gzip, deflate",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Connection": "close"
}
-
get_text = lambda x: BeautifulSoup(x, features='lxml').get_text().strip()
sluggify = lambda a: ' '.join(re.sub(r'[^\\sa-z0-9\\.,\\(\\)]+', ' ', a.lower()).split())
similar = lambda a, b: SequenceMatcher(None, sluggify(a), sluggify(b)).ratio()
remove_duplicates = lambda a: list(set(a))
+def _make_headers():
+ return {**headers, **Headers(headers=True, browser='chrome', os='windows').generate()}
-class SearchBing:
+class SearchEngine:
+ headers = headers.copy()
+ def __init__(self, engine_name):
+ self.sess = HTMLSession()
+ self.engine_name = engine_name
+ self._web_engines = { # simple scrapers using get requests
+ 'google': ('https://www.google.com/search?', 'q', {'aqs': 'chrome..69i57.888j0j1', 'sourceid': 'chrome', 'ie': 'UTF-8'}),
+ 'bing': ('https://www.bing.com/search?', 'q', {'pq': ''}),
+ }
+ if engine_name in self._web_engines:
+ return
+ elif engine_name == 'startpage':
+ print('Starting startpage instance...')
+ self.t = Thread(target=self._init_startpage)
+ self.t.daemon = True
+ self.t.start()
+
+ def find_items(self, soup, args):
+ return {i: soup.find('input', {'type': 'hidden', 'name': i})['value'] for i in args}
+
+ def get_startpage_items(self, r):
+ soup = BeautifulSoup(r.text, 'lxml')
+ return {'query': None, 'cat': 'web', **self.find_items(soup, ['lui', 'language', 'sc', 'abp'])}
+
+ def _init_startpage(self):
+ self._startpage_data = self.get_startpage_items(self.sess.get('https://www.startpage.com/', headers=self.headers))
+ self.headers.update({"Sec-Fetch-Site": "same-origin", 'Referer': 'https://www.startpage.com/'})
+
+ def startpage_get_page(self, query, sites):
+ self.t.join()
+ resps = grequests.map([
+ grequests.post('https://www.startpage.com/sp/search',
+ headers=self.headers,
+ data={**self._startpage_data, **{'query': f'{query} site:{site}.com'}}
+ )
+ for site in sites
+ ])
+ self.t = Thread(target=self.get_startpage_items, args=(resps[-1],))
+ self.t.daemon = True
+ self.t.start()
+ return dict(zip(sites, resps))
+
+ def get_page(self, query, sites):
+ if self.engine_name == 'startpage':
+ return self.startpage_get_page(query, sites)
+ return dict(zip(
+ sites,
+ grequests.map([
+ grequests.get(
+ (web_engine := self._web_engines[self.engine_name])[0]
+ + urlencode({web_engine[1]: f'{query} site:{site}.com', **web_engine[2]}),
+ headers=self.headers, session=self.sess
+ )
+ for site in sites
+ ], size=len(sites))
+ ))
+
+
+class SearchWeb:
"""
- search bing for query
+ search web for query
"""
- def __init__(self, query, sites):
+ def __init__(self, query, sites, engine):
self.query = query
self.links = None
self.sites = sites
+ self.engine = engine
self._regex_objs = {
'quizlet': re.compile('https?://quizlet.com/\d+/[a-z0-9\\-]+/'),
'quizizz': re.compile('https?://quizizz.com/admin/quiz/[a-f0-9]+/[a-z\\-]+'),
- 'brainly': re.compile('https?://brainly.com/question/\d+'),
}
def search(self):
"""
- search bing for query
+ search web for query
"""
- resps = dict(zip(
- self.sites,
- grequests.map([
- grequests.get(
- 'https://www.bing.com/search?'
- + urlencode({'q': self.query + f' site:{site}.com'}),
- headers=headers,
- )
- for site in self.sites
- ], size=len(self.sites))
- ))
-
+ resps = self.engine.get_page(self.query, self.sites)
self.links = {
site: remove_duplicates(re.findall(self._regex_objs[site], resps[site].text))
for site in self.sites
}
-
class QuizizzScraper:
def __init__(self, links, query):
self.links = links
@@ -74,7 +127,7 @@ def __init__(self, links, query):
self.query = query
def async_requests(self, links):
- reqs = [grequests.get(u, headers=headers) for u in links]
+ reqs = [grequests.get(u, headers=_make_headers()) for u in links]
self.resps = grequests.map(reqs, size=len(reqs))
def parse_links(self):
@@ -136,7 +189,7 @@ def __init__(self, links, query):
self._regex_obj = re.compile('\\= \\{"alphabeticalIsDifferent.*\\}; QLoad\\(')
def async_requests(self, links):
- reqs = [grequests.get(u, headers=headers) for u in links]
+ reqs = [grequests.get(u, headers=_make_headers()) for u in links]
self.resps = grequests.map(reqs, size=len(reqs))
def parse_links(self):
@@ -170,61 +223,6 @@ def quizlet_parser(self, resp):
)
-
-class BrainlyScraper:
- def __init__(self, links, query):
- self.links = links
- self.resps = None
- self.brainlys = []
- self.query = query
-
- def async_requests(self, links):
- reqs = [grequests.get(u, headers=headers) for u in links]
- self.resps = grequests.map(reqs, size=len(reqs))
-
- def parse_links(self):
- self.async_requests(self.links)
- for resp in self.resps:
- try:
- self.brainlys.append(self.brainly_parser(resp))
- except Exception as e:
- print('exception', e, resp.url)
- # pass # skip over any errors
- return self.brainlys
-
-
- def brainly_parser(self, resp):
- data = json.loads(BeautifulSoup(resp.text, features='lxml').find('script', type="application/ld+json").string)[0]
- answers = []
- if 'acceptedAnswer' in data['mainEntity']:
- answers += data['mainEntity']['acceptedAnswer']
- if 'suggestedAnswer' in data['mainEntity']:
- answers += data['mainEntity']['suggestedAnswer']
-
- return max(
- (
- {
- 'question': data['name'].strip(),
- 'answer': get_text(i['text'])
- .replace('Answer:', 'Answer: ')
- .replace('Explanation:', '\nExplanation: ')
- + '\nUpvotes: '
- + str(i['upvoteCount']),
- 'similarity': (
- similar(data['name'], self.query),
- True,
- i['upvoteCount'],
- ),
- 'url': resp.url,
- }
- for i in answers
- ),
- key=lambda x: x['similarity'],
- )
-
-
-
-
class TimeLogger:
def __init__(self):
self.elapsed_total = time.time()
@@ -261,19 +259,17 @@ def print_timers(self):
-
-
class Searchify:
- def __init__(self, query, sites):
+ def __init__(self, query, sites, engine):
self.query = query
self.sites = sites
+ self.engine = engine
self.timer = TimeLogger()
self.flashcards = []
self.links = []
self.site_scrapers = {
'quizlet': QuizletScraper,
'quizizz': QuizizzScraper,
- 'brainly': BrainlyScraper,
}
def main(self):
@@ -306,8 +302,8 @@ def _flashcard_thread(self, site_scraper, links, site_name):
def get_links(self):
- self.timer.start('bing search')
- search_bing = SearchBing(self.query, self.sites)
+ self.timer.start('web search')
+ search_bing = SearchWeb(self.query, self.sites, self.engine)
search_bing.search()
self.timer.end()
self.links = search_bing.links
@@ -328,10 +324,11 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement
if __name__ == '__main__' and len(sys.argv) > 1:
# argument parsing
import argparse
- parser = argparse.ArgumentParser(description='Search Bing for flashcards')
+ parser = argparse.ArgumentParser(description='Search the web for flashcards')
parser.add_argument('--query', '-q', help='query to search for', default=None)
parser.add_argument('--output', '-o', help='output file', default=None)
- parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz,brainly (comma seperated list)', default='quizlet,quizizz,brainly')
+    parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz (comma separated list)', default='quizlet,quizizz')
+    parser.add_argument('--engine', '-e', help='search engine to use (google, bing, startpage)', default='bing')
args = parser.parse_args()
if args.output:
@@ -348,15 +345,20 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement
flashcards = [] # create flashcard list
sites = args.sites.lower().split(',') # get list of sites
+ engine_name = args.engine.lower().strip() # get search engine
+ # start search engine
+ engine = SearchEngine(engine_name)
+
# run search
s = Searchify(
query=args.query,
sites=sites,
+ engine=engine,
)
s.main()
write(json.dumps(s.flashcards, indent=4))
- print(str(len(s.flashcards))+ ' flashcards found')
+ print(f'{len(s.flashcards)} flashcards found')
s.timer.print_timers()
\ No newline at end of file
diff --git a/textshot.py b/textshot.py
index 1c215e0..cae8cc3 100644
--- a/textshot.py
+++ b/textshot.py
@@ -134,8 +134,15 @@ def mouseReleaseEvent(self, event):
return super().mouseReleaseEvent(event)
self.hide()
+ QtWidgets.QApplication.restoreOverrideCursor()
QtWidgets.QApplication.processEvents()
- shot = self.screen.copy(QtCore.QRect(self.start, self.end))
+
+ shot = self.screen.copy(
+ min(self.start.x(), self.end.x()),
+ min(self.start.y(), self.end.y()),
+ abs(self.start.x() - self.end.x()),
+ abs(self.start.y() - self.end.y()),
+ )
self.processImage(shot)
self.quit_app()
print('done')
diff --git a/window.ui b/window.ui
index ae39845..9473920 100644
--- a/window.ui
+++ b/window.ui
@@ -204,28 +204,6 @@
- -
-
-
-
- 30
- 30
-
-
-
- Brainly
-
-
- true
-
-
- false
-
-
- true
-
-
-
-
@@ -404,9 +382,9 @@
0
- 0
- 379
- 631
+ -77
+ 656
+ 465
@@ -775,26 +753,97 @@
true
-
-
-
+
-
+
- Search after running OCR
+ Search after pasting
true
- -
-
+
-
+
- Search after pasting
+ Search after running OCR
true
+ -
+
+
-
+
+
+
+ 150
+ 0
+
+
+
-
+
+ Bing
+
+
+ -
+
+ Google
+
+
+ -
+
+ Startpage
+
+
+
+
+ -
+
+
+
+ 100
+ 0
+
+
+
+ Search engine:
+
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+
+ 40
+ 20
+
+
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+ QSizePolicy::Fixed
+
+
+
+ 70
+ 20
+
+
+
+
+
+
@@ -972,7 +1021,7 @@
Window transparency
- 15
+ 5
100