From 2e854a467ec152a1475769bea9f08c06cbcced33 Mon Sep 17 00:00:00 2001 From: daijro <72637910+daijro@users.noreply.github.com> Date: Mon, 11 Apr 2022 02:54:24 -0500 Subject: [PATCH] Fix #2 - Fix Bing search - Add Google and Startpage scrapers (change in ui settings) - Fix headers + add header randomizer - Completely remove Brainly from the project due to new WAF + slow processing time - Fix delay in cursor restoring after OCR - Manually calculate selection rectangle for OCR - Change minimum value for window transparency slider to 5% - Upgrade Pillow and pytesseract to fix security concerns - Add requests_html, fake_headers, and their dependencies to requirements.txt --- config.json | 2 +- gui.pyw | 23 +++--- img/brainly.png | Bin 6933 -> 0 bytes requirements.txt | 26 ++++++- scraper.py | 186 ++++++++++++++++++++++++----------------------- textshot.py | 9 ++- window.ui | 113 ++++++++++++++++++++-------- 7 files changed, 221 insertions(+), 138 deletions(-) delete mode 100644 img/brainly.png diff --git a/config.json b/config.json index 7a7e025..4139f9f 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { "quizlet": true, "quizizz": true, - "brainly": false, + "search_engine": 0, "hide_show_key": "Ctrl+D", "ocr_key": "Ctrl+Shift+X", "paste_key": "Ctrl+Shift+V", diff --git a/gui.pyw b/gui.pyw index bc92d8b..ae81d4a 100644 --- a/gui.pyw +++ b/gui.pyw @@ -19,7 +19,7 @@ import tkinter as tk root = tk.Tk() root.withdraw() -from scraper import Searchify +from scraper import Searchify, SearchEngine from textshot import * from windoweffect import WindowEffect @@ -96,17 +96,13 @@ class UI(QMainWindow): self.status_label = self.findChild(QtWidgets.QLabel, "status_label") self.quizlet_button = self.findChild(QtWidgets.QPushButton, "quizlet_button") self.quizizz_button = self.findChild(QtWidgets.QPushButton, "quizizz_button") - self.brainly_button = self.findChild(QtWidgets.QPushButton, "brainly_button") self.settings_button = self.findChild(QtWidgets.QPushButton, "settings_button") self.quizlet_button.setChecked(self.conf['quizlet']) self.quizizz_button.setChecked(self.conf['quizizz']) - self.brainly_button.setChecked(self.conf['brainly']) self.quizlet_button.toggled.connect(lambda: self.updatejson('quizlet')) self.quizizz_button.toggled.connect(lambda: self.updatejson('quizizz')) - self.brainly_button.toggled.connect(lambda: self.updatejson('brainly')) - self.settings_button.clicked.connect(lambda: self.stackedWidget.setCurrentIndex(1)) @@ -115,7 +111,6 @@ class UI(QMainWindow): self.quizizz_button.setIcon(QtGui.QIcon(resource_path("img\\quizizz.png"))) self.quizlet_button.setIcon(QtGui.QIcon(resource_path("img\\quizlet.png"))) - self.brainly_button.setIcon(QtGui.QIcon(resource_path("img\\brainly.png"))) self.titleIcon.setPixmap(QtGui.QPixmap(resource_path("img\\search.png"))) @@ -203,6 +198,11 @@ class UI(QMainWindow): self.setting_on_top.setChecked(self.conf['on_top']) self.setting_on_top.toggled.connect(lambda: self.set_window_on_top()) + + self.search_engine_combo = self.findChild(QtWidgets.QComboBox, "search_engine_combo") + self.search_engine_combo.setCurrentIndex(self.conf['search_engine']) + self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower()) + self.search_engine_combo.currentIndexChanged.connect(lambda: self.run_search_engine()) # window theme self.themeInput = self.findChild(QtWidgets.QComboBox, "themeInput") @@ -291,7 +291,7 @@ class UI(QMainWindow): font_size = self.font_size.value() # icon sizes - for obj in [self.quizizz_button, self.quizlet_button, self.brainly_button]: + for obj in [self.quizizz_button, self.quizlet_button]: obj.setIconSize(QtCore.QSize(font_size*2, font_size*2)) @@ -313,6 +313,10 @@ class UI(QMainWindow): # calling scraper and adding to ui + def run_search_engine(self): + self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower()) + self.updatejson('search_engine') + def run_searcher(self): query = self.search_bar.text().strip() @@ -328,14 +332,13 @@ class UI(QMainWindow): if self.quizizz_button.isChecked(): sites.append('quizizz') if self.quizlet_button.isChecked(): sites.append('quizlet') - if self.brainly_button.isChecked(): sites.append('brainly') if not sites: self.status_label.setText('Please select at least one site.') self.search_frame.setEnabled(True) return - searchify = Searchify(query, sites) + searchify = Searchify(query, sites, self.search_engine) t = Thread(target=searchify.main) t.daemon = True @@ -530,7 +533,6 @@ class UI(QMainWindow): # keybinds "quizlet": lambda: self.quizlet_button.isChecked(), "quizizz": lambda: self.quizizz_button.isChecked(), - "brainly": lambda: self.brainly_button.isChecked(), "hide_show_key": lambda: self.hide_show_key.keySequence().toString(), "ocr_key": lambda: self.ocr_key.keySequence().toString(), "paste_key": lambda: self.paste_key.keySequence().toString(), @@ -550,6 +552,7 @@ class UI(QMainWindow): "hide_taskbar": lambda: self.setting_hide_taskbar.isChecked(), "theme": lambda: self.themeInput.currentIndex(), "font_size": lambda: self.font_size.value(), + "search_engine": lambda: self.search_engine_combo.currentIndex(), } def updatejson(self, key): diff --git a/img/brainly.png b/img/brainly.png deleted file mode 100644 index 8c1df6900f3c61269080e89372a10cebe0b2aed5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6933 zcmZvBWmFqcw{~!cVkI~fTI>aa1uGC-UL;706c5E+in|qxh2mbEU`>%0C{P?qahKv& zw0MCQzUlpb-L>w@nw(^E_L)6bT zt-8;FAMmaJtEvQg`1dMmuXqQXA#ztU_5y)OsQw*TlRrZKfItj%>Pqqkev3!>#?CM2 znqnT>$JR^3Q!BI6_?^y*DzWyvmGzgk-Ou4{^15=%&?Ck1M%Rhw%dzIZe%vN(c+0V% zU?-eX!l0KlT5uYkh+@}u5P4p)T25&t^Gi&PakHA^nkE^|*oTvzb=fCR_0C|6oJ}EN zgely+@Qs0{xz5?eyAKaHdk?s3r5YN9gzRjdr9z-JQd8X`34}WW%TPj7A2w!#imSjh z(=_qR<|hWofIzbPucLs;!-kBq-CDGha_MJTn9<2@?v0i@!8~i?#V7h zudU*nktoVsB6eVr13r(vmy2}=BBMnT24O;kJ)Cg$JHe&$Yzj``!W#zfC2C&fdXVeS zy?1f8a^Z>bmWcmdgNJl}7S-ELHJf2QSZDVdXUOQ7!TFBFGUriwnAJXwuDWzMZWM&% zOs$_j3;p$Hihn8g6r{-R<9tFHrkP1GD1-=;{b*x~y?3`;Ew*G{v*tY^3Oc>S69Ll1 z(igu94#ef%Mp@>Cdy)73j3_o4Zie7p)Wm37ll^q3iHoM~wTpO>mN$Y$|6j$dow@_B z=tv$W&~TT1cVoEd@qr9=B>{i30oWju+EiePbi9G#V-*?aNzK2IO;joOmMgv)jY=V| zvlhvTaL7_VVfWH!$5j$ghLBNnpPpayVK)g(syPwS-ff{`$`zLe*;+ow8$crFH6LMk z_%xom3L=BWh&Za#ajEi*c@)fPXcb`K^m=!yN- z+r$}8z4wIH4eUx4k#DqOQ?e|&7d=TA`(K;}Lc)Z|A$(7UndAwvz{z!6+C{S(EZHI) z=%t8MKBjUjh1KP5+vkbi5;4C`!4sxY31cxS=JD)p&BkXe30{z94&}jYv?F!6Nwhzw zp#;q*YSX6pErl&9=#WoMH1ahIeLgumlu^OagEo6hig-}vz46C4%cC4ef{KZM@(HJ7 zvsB}sT&m4fnNUtz(GFhnlPxr9anc#4n3y(Y9`1P)4oMHszbR)s?wTJI#DOp6dtzU* zZ=p2T*U-c>RbkcWSI!?9J~GmmD(IFfQuf-~u+S?3C_dneOYO&(WIK9g^4t`uSTswM z5{>8C((9?bgL<4+?h0F)?^fXq_l87d<*VXT7Lp-dcwRI{f6m0nGvlZiWh~4?q7{8U zA9gekYTY|On@u5CNii}~v2mObNUczyw3!nQ1T*6I8)<#D(nYJ^_Zd2ayepR{lh+r zI2KdJ-c}%gb|tLmTQgL+mnN2!VPph&h-U?Un&~nssf5CoB*g~ovQ>A|EnkzFa5E){ zfOFDS%N2DwBPo0By5DFEnp_UHKxPf8A!5knS4(Ks96Wzcjb>(b0`RoirJAJu`fKBN za(O+|WH#c$^F~P=wDD|0oz9m{0DZqlqs-&w$84^@bMQYQIYIpQQVsi(au;+II(>!kUf$#>5*-<>?EU-xFCQ< zwFU*4>s_bTluukD*BLk=Pm0S|I&H$k5z|?yyW|%`EObl)Q$mpML_x%XB{1-J|57Q zMo^LrEW0L_e=2lNAJo9|quO|IV4g7#od}8Am;`FjA1B z;dgMZxw1O`1+q%Tup^WXJMXVMm7Sppzk|R-eT2do?}vK4O#qZ&3!1LI!rKP|VJbEt8ZtVK@vF92^V=gTq!&R09L07nhbM zCMP5M`&A_JwYlT5&E_yoP0`*`iT0xLeSONBnwo3-Gu6MZJ0B9&IITl@Ks3?WFqt)2 zPPs8v0|Z{|?*uyOK<$w;2JFG~PO2y8D`M=N)rF z@RjjFUagdfkr>&&Ijfq!J~cVBRMh&qRi1yj4WG($*md zKei{F_5N3^-)SG+!Kmd-kdjY}DIqSl^7&w0U0rd%t=;?fjIiZ;-VZG;aXKMq?>>8M zs{BxXq5D>_Kg(}B-B5{}hAKBVw>Zt)j4r2~NNXyp?x48fX=vx1d7u{PQJ$A>b>T_f zSljFqv?y_W@!DdOliwj`;ILA+)_NrcUpAXG!<>r~a$lVqFgCKnY@;e*iI-+_Uu7O?@Jei*d2gw%Z zlRv5~MH{?)$%q{elTUjJS4T8mf1pPzs)} zXGBM0^&Ye;rU>yPad~+%rMK}-pyJ|`#L?~tv;50n{Ha-kFDRdJabZ4w#Ae4;QBnET zeiBTfBXen>?38=9I5jn8WNuzpPfH>OO^8_j+SlYX7xhR19CJE|54*pe>x|d0(2nC! zO(Im*)#z}OWb?c_G`?dfcgXk~?2z*;>d{77ra2YpDoCg4;)%#**wdnF!}^WC*Q)~? z8&61kK*;X}l0XcTK6^f17w^0wGY`H*NRw5jp<2CvQniFkHrpI$ck;mJeD0ol&Ho8D zrV});na@s0H|sx?*8BSUBK8ag=wq-Dh|_*X zH#UaE*_pIZEmeaX`Q7B-RcPWKNQ!m^g(+__`o*$8fR6qCqq!iZbeW^3CNa~_5N3US z{m=gA&w#L;t^}T}|JUM{$~@%e6*xxsbaA^q1e3%fWAZv>OP(h5cB=3fFlFWkl&0x1 zN9>>7Y!SOhaY`u~WzC`Ni0HZl1ikKdVCa5T1#fDutE=lqG@W_nFl~t~2ihSgD?h)9 zBmEGh4zG5x9$a^aO^N17dg1HH{q>e7s4{K&d2_z&@%Q$!_4!6c`G~>&5Sz4ImgC^q zR;H?Ju4M47sDSC`4}7%T1f>tZ2jqUewfNJ0?|WIE1J$4Lo=q+0ZbOruk@QLqw0P~h zpKd7owXb~iDYr3iZEcNr3pvldYPbK%MH{WGB1As~M(!OS_wVg-%L3xCk!@GdHBq8^ z#Nqc1k7t(&0KdiGeM2D;5&gM}Tgplb?)tg6U0w1)$6kd!z`TF50}vPx#GcL5dCPD1 zhOTO3d6JQzRsx1cM-GgO3(V1IV$g<@#hr<~PKVy4XsylI3MAnpYY_V`L-9oA?{2yd z_%G?X^S36I6s?@w@rN;1^Dg==y~8R>E&HF(a@;!qLbdayqJb4jv@05pIe7U0zKYnj zB2^mRzWjq)_XbPsw?L;T^VPAJ2>ckclB`)wwQ|N3)2 zR#IO-2KBiN;wIRx6ywcitgo+syZUf{ZSCy*y&yQDbWw^Fly?d9o^j5{5u=4gbK+DPP}Y zeUPX#ZHX9Q4P|PrGaR`9hiiNcPwx=~l=(Lx!=#<7Zf+=cz~U*<0Z=_c{vKG!P4DA| zYqZ)K`Irr4cK&(w{QSGf6^I0ojt&ZOLwyph7)Y0K!BJUB^Vv>GMTKMp17buU`h2&-q@YtA`M})ZUcULah zmuwml0Kh+d_+ZJJ3YC?0rAX$aii;?uQz&X{V-)=I3%lby|HR27<#97Jh@k7XRXilA-B>~G{0+|J5np^n zQc}{QSWMJ%B>xRHHFZ`&L32qtQ-(kWz8W&UaQOZ#vr`G08(qvz`!Z?U=ZC9r7cZ=c zg-wzv$qcrVSe^M0Y;BD#A|iqZL);E4Q(l5pX9Z!3bYL#8@)tK zav@BSk&*T;El`$CZz{b28h0B61rbIxo)%1 zOMP%0@4efLxd}GDjJK1fq0xX2_9ch@*~}48pJA?)zUEW6qpj+_EF^>(0$RrF`8E9+$7 z_BYx*DIelY0W-H-7axJ>yS=hd$^q>FGJ zX>%3*!vLZT*-!!TVm+<4_)9!jXV!3jW}O_u1y`rV6M5K@>j!(A4RqdL3jSHA2>p6c z-v8JciMwP^JAn8?3G6$BmQ+^b84zz`u33wm5BVZ9jbZLaH?t zkDX1?Y}#@D37tYX-S&;c_ZT$e!wkSID?7VuYpb-~c^Em;uy9x~48>9}I_4vauRDMS zc`gXpI$LAFJ3BiNNy)g=QxEB6QzN6Vx3>Wkfm;?_p)KXFa$)V)7S;p0TR*)SKDn*- zPfyeI4-A~H$Ff=j_ji7NuEt5FOtMpUrslZ{bZe?pn$IHLB@+I1M6t@^D%ihl`l@8834Sug1^Ix#a7t@Ol-^(I){pEC{?Tspq9i@GU$Mip*C zJ1hryo2oX`;5b9%mK>>31aV+Bl3f&U!|B2`c==E-(v*%< z5~xJz;`Nmy$KGZrn%Oh&g}X_BR*P8qW2GjhHzTVTaAuAoAb3#@Uk4cgNel_Y1#kLS zG-o}sMbTSC_e{+uWGLp_GwwyZNrYXo%bAe-mb83g7N{pc4-E5We5M*Jyky7tx5 zcjinU^(B%)nFxvSfboPxyrC7C)UvjycT+JCghl*X!k>V{pX#Z}@@w6ppT9Dr(^VPp z6Ka}la$~Tpa2(1AxqJMFareXB7h5Vc=!Br;pQsCx9uGTET$sSfZzq`cVETr9d5J9C z0tA*O@L{G&(dKitjJJc8YeZ|fjxrK1{mqG5JQYXq^;5B)z)F&iF7w|p*!HYfJxgbkp*aOOUyMdg#G>)Bo6z}C+$h~{hd{>h=}pBYSUuvN{;cHE zoi67uvm|m8+mNR10_iBFSpwNuNyf0lDePH@*yCr}%YdEA#2>Zx(vxqS<@u7Gr6+OR zig3t|_}xWYyO`tCRrjBfYBgx4@*GlRcA=eS4_D)1!4m$QyQB(5d!8J)xKn_eXf_2qZex2 zUz?=m6e{bU9TjfZSTV^`Mu;e1GEI5q(DPJSaBn}p^TYGWnZt1L@wjI8(7NdPPcF_T zxkat{mU#;>h8%Lt@Vv9C4M4HU`44S_9EfDFKCE-^rKdhY$b-W>|CwyeKDqu%tbX|opJGa-VZ0=e(zh;BSe|yzUo$a$XPxV zUBjYY^r8eabeY}9;$OM}pBP_A6@k8px*)q*C-W}CRMRG@P#9FqOf_?q+8UL(AWYe~ z;MMcF$gv~|Nqgsm3GX+l7n>abNdx5=fxzWsm+2b+e%lu z&)r^%kEKXQ!XU4=P_LMjIc9h&m-u_z^P3xnBFm4VPCaueGZ+WPi=L&Wq?oJ6l`LGJ zztzThv^XF1A3uJ=RiRaMyxC~%x;ku*=Q(qvgf!$cq3~V$!LXXbI&x1AWW8%*Eut8` zG_|QL1>FMD*9L_?MwKO!`BvH={_0+udqq1C3lJ1wL4N`CGlL8~Zx+Wz>&JHysI(Lm zmovOI^SxC99hkAY$jk<+hZrB9z^?CP?syLq$h>&T-^=0z)${mPaP^u5^Fnn(Eg5^- z)M`0CWzr#6`FehEQl|l!%ibv#52b+(uM3F!+^xB~MC^NRnY;lcUpQWcUrA3LPLFH1 z{Ov%3z*5p>3KIY1(7{PTU#G2|e^z`;bqU2Hdi8sJ?Oee}+N-Bk#cJT08%^SN1tI5T zq>_Q%I|3~k*?(FpLtt|N6U+e^Iz9npI3CZ32#Lc~qbsV1C+`FGkD9??610|5R`(t5 zfF0n539@5@YgT0rU`wv(p7ja7N>~34z69{U(Nk|_BX1ik?^jYt&sV?z5{3v1@e2v_ z3kw?vL8U~*r65B55U3OcqTv`4`@ala+-w|d{r?XG;s0+yyt-Tt7=YB3wUz1=EF=CO D#nc(I diff --git a/requirements.txt b/requirements.txt index b49f6b6..701b68a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,44 @@ +appdirs==1.4.4 beautifulsoup4==4.10.0 +bs4==0.0.1 certifi==2021.10.8 cffi==1.15.0 charset-normalizer==2.0.9 +colorama==0.4.4 +cssselect==1.1.0 +fake-headers==1.0.2 +fake-useragent==0.1.11 gevent==21.12.0 greenlet==1.1.2 grequests==0.6.0 +html5lib==1.1 idna==3.3 +importlib-metadata==4.11.3 keyboard==0.13.5 -Pillow==8.4.0 +lxml==4.8.0 +packaging==21.3 +parse==1.19.0 +Pillow==9.1.0 pycparser==2.21 +pyee==8.2.2 +pyparsing==3.0.8 pyperclip==1.8.2 +pyppeteer==1.0.2 PyQt5==5.15.6 PyQt5-Qt5==5.15.2 PyQt5-sip==12.9.0 -pytesseract==0.3.8 +pyquery==1.4.3 +pytesseract==0.3.9 pywin32==303 requests==2.26.0 +requests-html==0.10.0 +six==1.16.0 soupsieve==2.3.1 +tqdm==4.64.0 urllib3==1.26.7 +w3lib==1.22.0 +webencodings==0.5.1 +websockets==10.2 +zipp==3.8.0 zope.event==4.5.0 zope.interface==5.4.0 diff --git a/scraper.py b/scraper.py index f923bfb..659bfed 100644 --- a/scraper.py +++ b/scraper.py @@ -1,71 +1,124 @@ +import grequests import json from bs4 import BeautifulSoup from difflib import SequenceMatcher import json -import grequests +from requests_html import HTMLSession +from fake_headers import Headers import re import sys import time from urllib.parse import urlencode from threading import Thread - headers = { - "Connection": "keep-alive", - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53', - "Sec-Fetch-Site": "same-origin", + "Sec-Ch-Ua": "\"(Not(A:Brand\";v=\"8\", \"Chromium\";v=\"99\"", + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": "\"Windows\"", + "Upgrade-Insecure-Requests": "1", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", - "Referer": "https://www.bing.com/", - "Accept-Language": "en-US,en;q=0.9" + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "en-US,en;q=0.9", + "Connection": "close" } - get_text = lambda x: BeautifulSoup(x, features='lxml').get_text().strip() sluggify = lambda a: ' '.join(re.sub(r'[^\\sa-z0-9\\.,\\(\\)]+', ' ', a.lower()).split()) similar = lambda a, b: SequenceMatcher(None, sluggify(a), sluggify(b)).ratio() remove_duplicates = lambda a: list(set(a)) +def _make_headers(): + return {**headers, **Headers(headers=True, browser='chrome', os='windows').generate()} -class SearchBing: +class SearchEngine: + headers = headers.copy() + def __init__(self, engine_name): + self.sess = HTMLSession() + self.engine_name = engine_name + self._web_engines = { # simple scrapers using get requests + 'google': ('https://www.google.com/search?', 'q', {'aqs': 'chrome..69i57.888j0j1', 'sourceid': 'chrome', 'ie': 'UTF-8'}), + 'bing': ('https://www.bing.com/search?', 'q', {'pq': ''}), + } + if engine_name in self._web_engines: + return + elif engine_name == 'startpage': + print('Starting startpage instance...') + self.t = Thread(target=self._init_startpage) + self.t.daemon = True + self.t.start() + + def find_items(self, soup, args): + return {i: soup.find('input', {'type': 'hidden', 'name': i})['value'] for i in args} + + def get_startpage_items(self, r): + soup = BeautifulSoup(r.text, 'lxml') + return {'query': None, 'cat': 'web', **self.find_items(soup, ['lui', 'language', 'sc', 'abp'])} + + def _init_startpage(self): + self._startpage_data = self.get_startpage_items(self.sess.get('https://www.startpage.com/', headers=self.headers)) + self.headers.update({"Sec-Fetch-Site": "same-origin", 'Referer': 'https://www.startpage.com/'}) + + def startpage_get_page(self, query, sites): + self.t.join() + resps = grequests.map([ + grequests.post('https://www.startpage.com/sp/search', + headers=self.headers, + data={**self._startpage_data, **{'query': f'{query} site:{site}.com'}} + ) + for site in sites + ]) + self.t = Thread(target=self.get_startpage_items, args=(resps[-1],)) + self.t.daemon = True + self.t.start() + return dict(zip(sites, resps)) + + def get_page(self, query, sites): + if self.engine_name == 'startpage': + return self.startpage_get_page(query, sites) + return dict(zip( + sites, + grequests.map([ + grequests.get( + (web_engine := self._web_engines[self.engine_name])[0] + + urlencode({web_engine[1]: f'{query} site:{site}.com', **web_engine[2]}), + headers=self.headers, session=self.sess + ) + for site in sites + ], size=len(sites)) + )) + + +class SearchWeb: """ - search bing for query + search web for query """ - def __init__(self, query, sites): + def __init__(self, query, sites, engine): self.query = query self.links = None self.sites = sites + self.engine = engine self._regex_objs = { 'quizlet': re.compile('https?://quizlet.com/\d+/[a-z0-9\\-]+/'), 'quizizz': re.compile('https?://quizizz.com/admin/quiz/[a-f0-9]+/[a-z\\-]+'), - 'brainly': re.compile('https?://brainly.com/question/\d+'), } def search(self): """ - search bing for query + search web for query """ - resps = dict(zip( - self.sites, - grequests.map([ - grequests.get( - 'https://www.bing.com/search?' - + urlencode({'q': self.query + f' site:{site}.com'}), - headers=headers, - ) - for site in self.sites - ], size=len(self.sites)) - )) - + resps = self.engine.get_page(self.query, self.sites) self.links = { site: remove_duplicates(re.findall(self._regex_objs[site], resps[site].text)) for site in self.sites } - class QuizizzScraper: def __init__(self, links, query): self.links = links @@ -74,7 +127,7 @@ def __init__(self, links, query): self.query = query def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] + reqs = [grequests.get(u, headers=_make_headers()) for u in links] self.resps = grequests.map(reqs, size=len(reqs)) def parse_links(self): @@ -136,7 +189,7 @@ def __init__(self, links, query): self._regex_obj = re.compile('\\= \\{"alphabeticalIsDifferent.*\\}; QLoad\\(') def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] + reqs = [grequests.get(u, headers=_make_headers()) for u in links] self.resps = grequests.map(reqs, size=len(reqs)) def parse_links(self): @@ -170,61 +223,6 @@ def quizlet_parser(self, resp): ) - -class BrainlyScraper: - def __init__(self, links, query): - self.links = links - self.resps = None - self.brainlys = [] - self.query = query - - def async_requests(self, links): - reqs = [grequests.get(u, headers=headers) for u in links] - self.resps = grequests.map(reqs, size=len(reqs)) - - def parse_links(self): - self.async_requests(self.links) - for resp in self.resps: - try: - self.brainlys.append(self.brainly_parser(resp)) - except Exception as e: - print('exception', e, resp.url) - # pass # skip over any errors - return self.brainlys - - - def brainly_parser(self, resp): - data = json.loads(BeautifulSoup(resp.text, features='lxml').find('script', type="application/ld+json").string)[0] - answers = [] - if 'acceptedAnswer' in data['mainEntity']: - answers += data['mainEntity']['acceptedAnswer'] - if 'suggestedAnswer' in data['mainEntity']: - answers += data['mainEntity']['suggestedAnswer'] - - return max( - ( - { - 'question': data['name'].strip(), - 'answer': get_text(i['text']) - .replace('Answer:', 'Answer: ') - .replace('Explanation:', '\nExplanation: ') - + '\nUpvotes: ' - + str(i['upvoteCount']), - 'similarity': ( - similar(data['name'], self.query), - True, - i['upvoteCount'], - ), - 'url': resp.url, - } - for i in answers - ), - key=lambda x: x['similarity'], - ) - - - - class TimeLogger: def __init__(self): self.elapsed_total = time.time() @@ -261,19 +259,17 @@ def print_timers(self): - - class Searchify: - def __init__(self, query, sites): + def __init__(self, query, sites, engine): self.query = query self.sites = sites + self.engine = engine self.timer = TimeLogger() self.flashcards = [] self.links = [] self.site_scrapers = { 'quizlet': QuizletScraper, 'quizizz': QuizizzScraper, - 'brainly': BrainlyScraper, } def main(self): @@ -306,8 +302,8 @@ def _flashcard_thread(self, site_scraper, links, site_name): def get_links(self): - self.timer.start('bing search') - search_bing = SearchBing(self.query, self.sites) + self.timer.start('web search') + search_bing = SearchWeb(self.query, self.sites, self.engine) search_bing.search() self.timer.end() self.links = search_bing.links @@ -328,10 +324,11 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement if __name__ == '__main__' and len(sys.argv) > 1: # argument parsing import argparse - parser = argparse.ArgumentParser(description='Search Bing for flashcards') + parser = argparse.ArgumentParser(description='Search the web for flashcards') parser.add_argument('--query', '-q', help='query to search for', default=None) parser.add_argument('--output', '-o', help='output file', default=None) - parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz,brainly (comma seperated list)', default='quizlet,quizizz,brainly') + parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz (comma seperated list)', default='quizlet,quizizz') + parser.add_argument('--engine', '-e', help='search engine to use (google, bing)', default='bing') args = parser.parse_args() if args.output: @@ -348,15 +345,20 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement flashcards = [] # create flashcard list sites = args.sites.lower().split(',') # get list of sites + engine_name = args.engine.lower().strip() # get search engine + # start search engine + engine = SearchEngine(engine_name) + # run search s = Searchify( query=args.query, sites=sites, + engine=engine, ) s.main() write(json.dumps(s.flashcards, indent=4)) - print(str(len(s.flashcards))+ ' flashcards found') + print(f'{len(s.flashcards)} flashcards found') s.timer.print_timers() \ No newline at end of file diff --git a/textshot.py b/textshot.py index 1c215e0..cae8cc3 100644 --- a/textshot.py +++ b/textshot.py @@ -134,8 +134,15 @@ def mouseReleaseEvent(self, event): return super().mouseReleaseEvent(event) self.hide() + QtWidgets.QApplication.restoreOverrideCursor() QtWidgets.QApplication.processEvents() - shot = self.screen.copy(QtCore.QRect(self.start, self.end)) + + shot = self.screen.copy( + min(self.start.x(), self.end.x()), + min(self.start.y(), self.end.y()), + abs(self.start.x() - self.end.x()), + abs(self.start.y() - self.end.y()), + ) self.processImage(shot) self.quit_app() print('done') diff --git a/window.ui b/window.ui index ae39845..9473920 100644 --- a/window.ui +++ b/window.ui @@ -204,28 +204,6 @@ - - - - - 30 - 30 - - - - Brainly - - - true - - - false - - - true - - - @@ -404,9 +382,9 @@ 0 - 0 - 379 - 631 + -77 + 656 + 465 @@ -775,26 +753,97 @@ true - - + + - Search after running OCR + Search after pasting true - - + + - Search after pasting + Search after running OCR true + + + + + + + 150 + 0 + + + + + Bing + + + + + Google + + + + + Startpage + + + + + + + + + 100 + 0 + + + + Search engine: + + + + + + + Qt::Horizontal + + + + 40 + 20 + + + + + + + + Qt::Horizontal + + + QSizePolicy::Fixed + + + + 70 + 20 + + + + + + @@ -972,7 +1021,7 @@ Window transparency - 15 + 5 100