forked from ABHISHEKVALSAN/Indian-Government-Websites
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap_us_gov.py
35 lines (32 loc) · 810 Bytes
/
scrap_us_gov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from bs4 import BeautifulSoup
def get_url(ext):
    """Fetch one usa.gov page and record every "Website:" link found on it.

    The page at ``https://www.usa.gov`` + ``ext`` is downloaded. For each
    ``<section>`` whose first ``<h3>`` reads exactly ``Website:``, the
    ``href`` of that section's first ``<a>`` is appended to ``tempUS.txt``
    and printed. Responses with a status other than 200 are ignored, and
    sections lacking the heading, the link, or the ``href`` are skipped.

    Args:
        ext: Path appended to the site root (converted with ``str``).

    Raises:
        OSError: If ``tempUS.txt`` cannot be opened or written.  Errors
            raised by ``requests.get`` are not caught here either.
    """
    url = "https://www.usa.gov" + str(ext)
    resp = requests.get(url)
    if resp.status_code != 200:
        return

    soup = BeautifulSoup(resp.text, 'html.parser')
    for section in soup.find_all("section"):
        heading = section.find("h3")
        # Explicit checks replace the old blanket ``except: pass`` so that
        # only "this section has no website entry" is skipped, while real
        # failures (file errors, Ctrl-C) are no longer hidden.
        if heading is None or heading.text != "Website:":
            continue
        link = section.find("a")
        href = link.get("href") if link is not None else None
        if href is None:
            continue
        # ``with`` guarantees the handle is closed even if the write fails.
        # NOTE(review): the href is written with no separator, so successive
        # links run together in the file -- confirm whether a trailing
        # newline is intended before changing the output format.
        with open("tempUS.txt", "a+") as f:
            f.write(href)
        print(href)
def main(s):
    """Scrape one usa.gov federal-agencies listing page.

    Downloads ``https://www.usa.gov/federal-agencies/`` + ``s``, collects
    every ``<a class="url">`` element, drops the first seven matches and
    passes the ``href`` of each remaining one to ``get_url``.  Nothing is
    done when the response status is not 200.

    Args:
        s: Suffix appended to the listing URL (converted with ``str``).
    """
    listing = requests.get("https://www.usa.gov/federal-agencies/" + str(s))
    if listing.status_code != 200:
        return

    parsed = BeautifulSoup(listing.text, 'html.parser')
    anchors = parsed.findAll("a", {"class": "url"})
    # The first seven matches are skipped -- presumably links that are not
    # agency entries; TODO confirm against the live page markup.
    for anchor in anchors[7:]:
        get_url(anchor['href'])
if __name__ == "__main__":
    # One listing page is requested per character below.  The string has no
    # q, x, y or z -- presumably no listing exists for them; TODO confirm.
    letters = "abcdefghijklmnoprstuvw"
    for letter in letters:
        main(letter)