-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathyahoo.py
54 lines (44 loc) · 1.56 KB
/
yahoo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import mechanize
from bs4 import BeautifulSoup
import re
from StringIO import StringIO
import gzip
def _fetch_yahoo_html(query):
    """Download *query* once and return the response body, gunzipped if needed."""
    # mechanize emulates a browser; robots.txt handling stays disabled.
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'chrome'), ('Accept-encoding', 'gzip')]
    try:
        # Open the URL a single time: the headers that are inspected must
        # belong to the same response whose body is decoded.
        response = br.open(query)
        body = response.read()
        # Gzip was requested but the server may still answer uncompressed;
        # in that case the body is used as-is.
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        return body
    finally:
        br.close()


def YahooResults(search, no):
    """Scrape Yahoo search results for *search*.

    Args:
        search: The search phrase. It is URL-encoded before being sent, so
            it may contain spaces and reserved characters such as ``&``.
        no: Value sent as the ``n`` query parameter (presumably the number
            of results requested) and used as the rank of the first result.

    Returns:
        A dict keyed by result URL. Each value is a dict with:
          * ``'title'``: UTF-8 encoded markup of the result's first link,
            with ``<wbr></wbr>`` removed.
          * ``'desc'``: UTF-8 encoded inner markup of the last ``<div>``
            inside the result.
          * ``'YRank'``: ``no`` for the first result, decreasing by one for
            each following result.
        Results without a link, or whose link has no ``href``, are skipped
        and do not consume a rank.
    """
    # Imported locally so the function works whichever urllib layout the
    # interpreter provides.
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3

    term = search
    if not isinstance(term, bytes):
        # quote_plus on Python 2 cannot handle non-ASCII unicode objects.
        term = term.encode('utf-8')
    query = ("https://search.yahoo.com/search?q=" + quote_plus(term)
             + "&n=" + str(no))

    soup = BeautifulSoup(_fetch_yahoo_html(query), 'html.parser')

    results = {}
    rank = no
    # Each search result is a <div class="res"> located inside an <ol>.
    for item in soup.find_all('div', attrs={'class': 'res'}):
        if item.find_parent('ol') is None:
            continue

        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Title: concatenated markup of the link's children.
        title = b"".join(child.encode('utf-8') for child in anchor.contents)

        # Description: the last <div> of the result; fall back to the result
        # element itself when it has no nested <div>.
        nested_divs = item.find_all('div')
        desc_div = nested_divs[-1] if nested_divs else item

        results[href] = {
            'title': title.replace(b"<wbr></wbr>", b""),
            'desc': desc_div.encode_contents(),
            'YRank': rank,
        }
        rank -= 1
    return results